Merge tag 'mm-stable-2022-08-09' of git://git.kernel.org/pub/scm/linux/kernel/git...
[platform/kernel/linux-starfive.git] mm/huge_memory.c
index 1596508..8a7c1b3 100644
@@ -70,21 +70,85 @@ static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
-bool transparent_hugepage_active(struct vm_area_struct *vma)
+bool hugepage_vma_check(struct vm_area_struct *vma,
+                       unsigned long vm_flags,
+                       bool smaps, bool in_pf)
 {
-       /* The addr is used to check if the vma size fits */
-       unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+       if (!vma->vm_mm)                /* vdso */
+               return false;
+
+       /*
+        * Explicitly disabled through madvise or prctl, or some
+        * architectures may disable THP for some mappings, for
+        * example, s390 kvm.
+        */
+       if ((vm_flags & VM_NOHUGEPAGE) ||
+           test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+               return false;
+       /*
+        * The hardware/firmware may have marked hugepage support disabled.
+        */
+       if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
+               return false;
 
-       if (!transhuge_vma_suitable(vma, addr))
+       /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
+       if (vma_is_dax(vma))
+               return in_pf;
+
+       /*
+        * Special VMA and hugetlb VMA.
+        * Must be checked after dax since some dax mappings may have
+        * VM_MIXEDMAP set.
+        */
+       if (vm_flags & VM_NO_KHUGEPAGED)
                return false;
-       if (vma_is_anonymous(vma))
-               return __transparent_hugepage_enabled(vma);
-       if (vma_is_shmem(vma))
+
+       /*
+        * Check alignment for file vma and size for both file and anon vma.
+        *
+        * Skip the check for page faults: the huge fault handlers do it
+        * themselves, and the check is not suitable for huge PUD faults.
+        */
+       if (!in_pf &&
+           !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
+               return false;
+
+       /*
+        * Enabled via shmem mount options or sysfs settings.
+        * Must be done before hugepage flags check since shmem has its
+        * own flags.
+        */
+       if (!in_pf && shmem_file(vma->vm_file))
                return shmem_huge_enabled(vma);
-       if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma))
+
+       if (!hugepage_flags_enabled())
+               return false;
+
+       /* THP settings require madvise. */
+       if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
+               return false;
+
+       /* Only regular files are valid */
+       if (!in_pf && file_thp_enabled(vma))
                return true;
 
-       return false;
+       if (!vma_is_anonymous(vma))
+               return false;
+
+       if (vma_is_temporary_stack(vma))
+               return false;
+
+       /*
+        * The THPeligible bit in smaps should show 1 for proper VMAs even
+        * though anon_vma is not initialized yet.
+        *
+        * Allow page faults, since anon_vma may not be initialized until
+        * the first page fault.
+        */
+       if (!vma->anon_vma)
+               return (smaps || in_pf);
+
+       return true;
 }
 
 static bool get_huge_zero_page(void)
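The consolidated helper replaces transparent_hugepage_active() and takes explicit smaps/in_pf arguments so a single predicate can serve /proc/<pid>/smaps, the huge-fault path, and khugepaged. A minimal sketch of the intended call patterns; the wrapper names below are illustrative, only the hugepage_vma_check() signature is taken from the hunk above.

static bool thp_eligible_for_smaps(struct vm_area_struct *vma)
{
	/* report THPeligible even before the first fault sets up anon_vma */
	return hugepage_vma_check(vma, vma->vm_flags, /* smaps */ true, /* in_pf */ false);
}

static bool thp_allowed_at_fault(struct vm_area_struct *vma)
{
	/* huge fault handlers re-check size/alignment themselves */
	return hugepage_vma_check(vma, vma->vm_flags, /* smaps */ false, /* in_pf */ true);
}

static bool thp_candidate_for_khugepaged(struct vm_area_struct *vma,
					 unsigned long vm_flags)
{
	/* vm_flags may be a caller-supplied value not yet set on the VMA */
	return hugepage_vma_check(vma, vm_flags, false, false);
}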
@@ -213,8 +277,8 @@ static ssize_t enabled_store(struct kobject *kobj,
        }
        return ret;
 }
-static struct kobj_attribute enabled_attr =
-       __ATTR(enabled, 0644, enabled_show, enabled_store);
+
+static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
 
 ssize_t single_hugepage_flag_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf,
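The switch to __ATTR_RW() works because the macro relies on the <name>_show/<name>_store naming convention and hard-codes mode 0644, matching the open-coded __ATTR() it replaces. Roughly, __ATTR_RW(enabled) expands to something like the following (a sketch of the include/linux/sysfs.h expansion, omitting the octal-permission sanity check):

static struct kobj_attribute enabled_attr = {
	.attr  = { .name = "enabled", .mode = 0644 },
	.show  = enabled_show,
	.store = enabled_store,
};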
@@ -303,8 +367,7 @@ static ssize_t defrag_store(struct kobject *kobj,
 
        return count;
 }
-static struct kobj_attribute defrag_attr =
-       __ATTR(defrag, 0644, defrag_show, defrag_store);
+static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
 
 static ssize_t use_zero_page_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
@@ -318,8 +381,7 @@ static ssize_t use_zero_page_store(struct kobject *kobj,
        return single_hugepage_flag_store(kobj, attr, buf, count,
                                 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 }
-static struct kobj_attribute use_zero_page_attr =
-       __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
+static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
 
 static ssize_t hpage_pmd_size_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf)
@@ -424,10 +486,10 @@ static int __init hugepage_init(void)
        if (err)
                goto err_slab;
 
-       err = register_shrinker(&huge_zero_page_shrinker);
+       err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
        if (err)
                goto err_hzp_shrinker;
-       err = register_shrinker(&deferred_split_shrinker);
+       err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
        if (err)
                goto err_split_shrinker;
 
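register_shrinker() now takes a printf-style name, which is what the new "thp-zero" and "thp-deferred_split" arguments provide; the name identifies the shrinker, for example in shrinker debugfs. Below is a minimal, hedged sketch of registering a named shrinker against that interface; every identifier is illustrative and not taken from this file.

static unsigned long demo_count_objects(struct shrinker *sh,
					struct shrink_control *sc)
{
	return 0;		/* nothing reclaimable in this sketch */
}

static unsigned long demo_scan_objects(struct shrinker *sh,
				       struct shrink_control *sc)
{
	return SHRINK_STOP;	/* nothing to scan either */
}

static struct shrinker demo_shrinker = {
	.count_objects = demo_count_objects,
	.scan_objects  = demo_scan_objects,
	.seeks         = DEFAULT_SEEKS,
};

static int __init demo_init(void)
{
	/* the trailing argument names the shrinker, printf-style */
	return register_shrinker(&demo_shrinker, "demo-shrinker");
}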
@@ -520,7 +582,7 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page)
 void prep_transhuge_page(struct page *page)
 {
        /*
-        * we use page->mapping and page->indexlru in second tail page
+        * we use page->mapping and page->index in second tail page
         * as list_head: assuming THP order >= 2
         */
 
@@ -727,7 +789,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                return VM_FAULT_FALLBACK;
        if (unlikely(anon_vma_prepare(vma)))
                return VM_FAULT_OOM;
-       khugepaged_enter(vma, vma->vm_flags);
+       khugepaged_enter_vma(vma, vma->vm_flags);
 
        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm) &&
@@ -957,15 +1019,15 @@ EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
-               pmd_t *pmd, int flags)
+                     pmd_t *pmd, bool write)
 {
        pmd_t _pmd;
 
        _pmd = pmd_mkyoung(*pmd);
-       if (flags & FOLL_WRITE)
+       if (write)
                _pmd = pmd_mkdirty(_pmd);
        if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-                               pmd, _pmd, flags & FOLL_WRITE))
+                                 pmd, _pmd, write))
                update_mmu_cache_pmd(vma, addr, pmd);
 }
 
@@ -998,7 +1060,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
                return NULL;
 
        if (flags & FOLL_TOUCH)
-               touch_pmd(vma, addr, pmd, flags);
+               touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
 
        /*
         * device mapped pages can only be returned if the
@@ -1121,15 +1183,15 @@ out:
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
-               pud_t *pud, int flags)
+                     pud_t *pud, bool write)
 {
        pud_t _pud;
 
        _pud = pud_mkyoung(*pud);
-       if (flags & FOLL_WRITE)
+       if (write)
                _pud = pud_mkdirty(_pud);
        if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
-                               pud, _pud, flags & FOLL_WRITE))
+                                 pud, _pud, write))
                update_mmu_cache_pud(vma, addr, pud);
 }
 
@@ -1156,7 +1218,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
                return NULL;
 
        if (flags & FOLL_TOUCH)
-               touch_pud(vma, addr, pud, flags);
+               touch_pud(vma, addr, pud, flags & FOLL_WRITE);
 
        /*
         * device mapped pages can only be returned if the
@@ -1221,21 +1283,13 @@ out_unlock:
 
 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
 {
-       pud_t entry;
-       unsigned long haddr;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
 
        vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
        if (unlikely(!pud_same(*vmf->pud, orig_pud)))
                goto unlock;
 
-       entry = pud_mkyoung(orig_pud);
-       if (write)
-               entry = pud_mkdirty(entry);
-       haddr = vmf->address & HPAGE_PUD_MASK;
-       if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
-               update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
-
+       touch_pud(vmf->vma, vmf->address, vmf->pud, write);
 unlock:
        spin_unlock(vmf->ptl);
 }
@@ -1243,21 +1297,13 @@ unlock:
 
 void huge_pmd_set_accessed(struct vm_fault *vmf)
 {
-       pmd_t entry;
-       unsigned long haddr;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
-       pmd_t orig_pmd = vmf->orig_pmd;
 
        vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
-       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
+       if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
                goto unlock;
 
-       entry = pmd_mkyoung(orig_pmd);
-       if (write)
-               entry = pmd_mkdirty(entry);
-       haddr = vmf->address & HPAGE_PMD_MASK;
-       if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
-               update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
+       touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
 
 unlock:
        spin_unlock(vmf->ptl);
@@ -1393,7 +1439,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                return ERR_PTR(-ENOMEM);
 
        if (flags & FOLL_TOUCH)
-               touch_pmd(vma, addr, pmd, flags);
+               touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
 
        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
        VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
@@ -1686,7 +1732,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                pmd = move_soft_dirty_pmd(pmd);
                set_pmd_at(mm, new_addr, new_pmd, pmd);
                if (force_flush)
-                       flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+                       flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
                if (new_ptl != old_ptl)
                        spin_unlock(new_ptl);
                spin_unlock(old_ptl);
@@ -1843,10 +1889,10 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
 }
 
 /*
- * Returns true if a given pud maps a thp, false otherwise.
+ * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
  *
- * Note that if it returns true, this routine returns without unlocking page
- * table lock. So callers must unlock it.
+ * Note that if it returns the page table lock pointer, this routine returns
+ * without unlocking the page table lock, so callers must unlock it.
  */
 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
 {
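The reworded comment is easier to act on with the caller contract spelled out: the helper takes the page table lock itself, keeps it held only when the pud really maps a THP, and otherwise drops it and returns NULL. A sketch of the expected usage (the function name is hypothetical; the pattern mirrors zap_huge_pud() in the next hunk):

static int demo_probe_huge_pud(struct vm_area_struct *vma, pud_t *pud,
			       unsigned long addr)
{
	spinlock_t *ptl;

	ptl = __pud_trans_huge_lock(pud, vma);
	if (!ptl)
		return 0;	/* not a huge pud: lock already released */

	/* ... operate on the huge pud while the lock is held ... */

	spin_unlock(ptl);	/* caller is responsible for unlocking */
	return 1;
}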
@@ -1868,12 +1914,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
        ptl = __pud_trans_huge_lock(pud, vma);
        if (!ptl)
                return 0;
-       /*
-        * For architectures like ppc64 we look at deposited pgtable
-        * when calling pudp_huge_get_and_clear. So do the
-        * pgtable_trans_huge_withdraw after finishing pudp related
-        * operations.
-        */
+
        pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
        tlb_remove_pud_tlb_entry(tlb, pud, addr);
        if (vma_is_special_huge(vma)) {
@@ -1938,7 +1979,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
         * replacing a zero pmd write protected page with a zero pte write
         * protected page.
         *
-        * See Documentation/vm/mmu_notifier.rst
+        * See Documentation/mm/mmu_notifier.rst
         */
        pmdp_huge_clear_flush(vma, haddr, pmd);
 
@@ -2195,6 +2236,10 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 
        if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
            is_pmd_migration_entry(*pmd)) {
+               /*
+                * It's safe to call pmd_page() when folio is set because the
+                * pmd is guaranteed to be present.
+                */
                if (folio && folio != page_folio(pmd_page(*pmd)))
                        goto out;
                __split_huge_pmd_locked(vma, pmd, range.start, freeze);
@@ -2502,7 +2547,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
                 * requires taking the lru_lock so we do the put_page
                 * of the tail pages after the split is complete.
                 */
-               put_page(subpage);
+               free_page_and_swap_cache(subpage);
        }
 }
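Using free_page_and_swap_cache() instead of a bare put_page() lets a split tail page that is still in the swap cache have its swap cache entry dropped before the reference goes away. Conceptually it behaves like the sketch below (an approximation, not the exact mm/swap_state.c body):

static void demo_release_subpage(struct page *subpage)
{
	/* try to drop a stale swap cache entry first ... */
	free_swap_cache(subpage);
	/* ... then release the reference held across the split */
	put_page(subpage);
}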
 
@@ -2821,9 +2866,12 @@ static void split_huge_pages_all(void)
        unsigned long total = 0, split = 0;
 
        pr_debug("Split all THPs\n");
-       for_each_populated_zone(zone) {
+       for_each_zone(zone) {
+               if (!managed_zone(zone))
+                       continue;
                max_zone_pfn = zone_end_pfn(zone);
                for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+                       int nr_pages;
                        if (!pfn_valid(pfn))
                                continue;
 
@@ -2839,8 +2887,10 @@ static void split_huge_pages_all(void)
 
                        total++;
                        lock_page(page);
+                       nr_pages = thp_nr_pages(page);
                        if (!split_huge_page(page))
                                split++;
+                       pfn += nr_pages - 1;
                        unlock_page(page);
 next:
                        put_page(page);
@@ -2898,10 +2948,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
         * table filled with PTE-mapped THPs, each of which is distinct.
         */
        for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
-               struct vm_area_struct *vma = find_vma(mm, addr);
+               struct vm_area_struct *vma = vma_lookup(mm, addr);
                struct page *page;
 
-               if (!vma || addr < vma->vm_start)
+               if (!vma)
                        break;
 
                /* skip special VMA and hugetlb VMA */
@@ -2913,9 +2963,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
                /* FOLL_DUMP to ignore special (like zero) pages */
                page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
 
-               if (IS_ERR(page))
-                       continue;
-               if (!page)
+               if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
                        continue;
 
                if (!is_transparent_hugepage(page))
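vma_lookup() folds in the addr < vma->vm_start test that the old code open-coded, so it only returns a VMA that actually contains the address. Roughly, in terms of find_vma() (a sketch of that behaviour, not a copy of include/linux/mm.h):

static struct vm_area_struct *demo_vma_containing(struct mm_struct *mm,
						  unsigned long addr)
{
	struct vm_area_struct *vma = find_vma(mm, addr);

	/*
	 * find_vma() returns the first VMA ending above addr, which may
	 * also start above addr; treat that case as "no VMA here".
	 */
	if (vma && addr < vma->vm_start)
		vma = NULL;

	return vma;
}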
@@ -3137,7 +3185,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
        struct vm_area_struct *vma = pvmw->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address = pvmw->address;
-       unsigned long mmun_start = address & HPAGE_PMD_MASK;
+       unsigned long haddr = address & HPAGE_PMD_MASK;
        pmd_t pmde;
        swp_entry_t entry;
 
@@ -3146,7 +3194,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 
        entry = pmd_to_swp_entry(*pvmw->pmd);
        get_page(new);
-       pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
+       pmde = pmd_mkold(mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)));
        if (pmd_swp_soft_dirty(*pvmw->pmd))
                pmde = pmd_mksoft_dirty(pmde);
        if (is_writable_migration_entry(entry))
@@ -3160,12 +3208,12 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
                if (!is_readable_migration_entry(entry))
                        rmap_flags |= RMAP_EXCLUSIVE;
 
-               page_add_anon_rmap(new, vma, mmun_start, rmap_flags);
+               page_add_anon_rmap(new, vma, haddr, rmap_flags);
        } else {
                page_add_file_rmap(new, vma, true);
        }
        VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
-       set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
+       set_pmd_at(mm, haddr, pvmw->pmd, pmde);
 
        /* No need to invalidate - it was non-present before */
        update_mmu_cache_pmd(vma, address, pvmw->pmd);