diff --git a/mm/memory.c b/mm/memory.c
index 1ec1ef3..517221f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -77,7 +77,6 @@
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
 #include <linux/sched/sysctl.h>
-#include <linux/net_mm.h>
 
 #include <trace/events/kmem.h>
 
@@ -361,12 +360,10 @@ void free_pgd_range(struct mmu_gather *tlb,
        } while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                   struct vm_area_struct *vma, unsigned long floor,
                   unsigned long ceiling, bool mm_wr_locked)
 {
-       MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
-
        do {
                unsigned long addr = vma->vm_start;
                struct vm_area_struct *next;
@@ -375,7 +372,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
                 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
                 * be 0.  This will underflow and is okay.
                 */
-               next = mas_find(&mas, ceiling - 1);
+               next = mas_find(mas, ceiling - 1);
 
                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
@@ -396,7 +393,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
                        while (next && next->vm_start <= vma->vm_end + PMD_SIZE
                               && !is_vm_hugetlb_page(next)) {
                                vma = next;
-                               next = mas_find(&mas, ceiling - 1);
+                               next = mas_find(mas, ceiling - 1);
                                if (mm_wr_locked)
                                        vma_start_write(vma);
                                unlink_anon_vmas(vma);
@@ -860,8 +857,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                        return -EBUSY;
                return -ENOENT;
        } else if (is_pte_marker_entry(entry)) {
-               if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma))
-                       set_pte_at(dst_mm, addr, dst_pte, pte);
+               pte_marker marker = copy_pte_marker(entry, dst_vma);
+
+               if (marker)
+                       set_pte_at(dst_mm, addr, dst_pte,
+                                  make_pte_marker(marker));
                return 0;
        }
        if (!userfaultfd_wp(dst_vma))
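
For context: the new copy_pte_marker() helper, added to include/linux/swapops.h by the same series, decides which marker bits survive fork(). A paraphrased sketch from memory, not the authoritative definition:

static inline pte_marker copy_pte_marker(swp_entry_t entry,
					 struct vm_area_struct *dst_vma)
{
	pte_marker srcm = pte_marker_get(entry);
	/* Poison markers (e.g. swap-in errors) are always propagated. */
	pte_marker dstm = srcm & PTE_MARKER_POISONED;

	/* uffd-wp markers are copied only if the destination VMA has uffd-wp armed. */
	if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
		dstm |= PTE_MARKER_UFFD_WP;

	return dstm;
}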
@@ -1312,7 +1312,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
                 * Use the raw variant of the seqcount_t write API to avoid
                 * lockdep complaining about preemptibility.
                 */
-               mmap_assert_write_locked(src_mm);
+               vma_assert_write_locked(src_vma);
                raw_write_seqcount_begin(&src_mm->write_protect_seq);
        }
 
@@ -1430,11 +1430,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                continue;
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
+                       arch_check_zapped_pte(vma, ptent);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                        zap_install_uffd_wp_if_needed(vma, addr, pte, details,
                                                      ptent);
-                       if (unlikely(!page))
+                       if (unlikely(!page)) {
+                               ksm_might_unmap_zero_page(mm, ptent);
                                continue;
+                       }
 
                        delay_rmap = 0;
                        if (!PageAnon(page)) {
@@ -1500,7 +1503,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                            !zap_drop_file_uffd_wp(details))
                                continue;
                } else if (is_hwpoison_entry(entry) ||
-                          is_swapin_error_entry(entry)) {
+                          is_poisoned_swp_entry(entry)) {
                        if (!should_zap_cows(details))
                                continue;
                } else {
@@ -1680,7 +1683,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
                        if (vma->vm_file) {
                                zap_flags_t zap_flags = details ?
                                    details->zap_flags : 0;
-                               __unmap_hugepage_range_final(tlb, vma, start, end,
+                               __unmap_hugepage_range(tlb, vma, start, end,
                                                             NULL, zap_flags);
                        }
                } else
@@ -1691,10 +1694,12 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
  * @tlb: address of the caller's struct mmu_gather
- * @mt: the maple tree
+ * @mas: the maple state
  * @vma: the starting vma
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
+ * @tree_end: The maximum index to check
+ * @mm_wr_locked: true if the mmap_lock is held for writing
  *
  * Unmap all pages in the vma list.
  *
@@ -1707,9 +1712,10 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
  * drops the lock and schedules.
  */
-void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
+void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                struct vm_area_struct *vma, unsigned long start_addr,
-               unsigned long end_addr, bool mm_wr_locked)
+               unsigned long end_addr, unsigned long tree_end,
+               bool mm_wr_locked)
 {
        struct mmu_notifier_range range;
        struct zap_details details = {
@@ -1717,15 +1723,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
                /* Careful - we need to zap private pages too! */
                .even_cows = true,
        };
-       MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
 
        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
                                start_addr, end_addr);
        mmu_notifier_invalidate_range_start(&range);
        do {
-               unmap_single_vma(tlb, vma, start_addr, end_addr, &details,
+               unsigned long start = start_addr;
+               unsigned long end = end_addr;
+               hugetlb_zap_begin(vma, &start, &end);
+               unmap_single_vma(tlb, vma, start, end, &details,
                                 mm_wr_locked);
-       } while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
+               hugetlb_zap_end(vma, &details);
+       } while ((vma = mas_find(mas, tree_end - 1)) != NULL);
        mmu_notifier_invalidate_range_end(&range);
 }
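
The maple state and the tree_end bound now come from the caller, so one state can drive both unmap_vmas() and the following free_pgtables() walk. A loose sketch of the calling pattern, assuming mm and tlb are already set up; the variable names and the floor/ceiling choice are illustrative, not copied from mm/mmap.c:

	MA_STATE(mas, &mm->mm_mt, vma->vm_end, vma->vm_end);

	unmap_vmas(&tlb, &mas, vma, start, end, tree_end, mm_wr_locked);
	mas_set(&mas, vma->vm_end);	/* rewind the state for the second walk */
	free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
		      USER_PGTABLES_CEILING, mm_wr_locked);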
 
@@ -1748,9 +1757,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
        lru_add_drain();
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, end);
-       if (is_vm_hugetlb_page(vma))
-               adjust_range_if_pmd_sharing_possible(vma, &range.start,
-                                                    &range.end);
+       hugetlb_zap_begin(vma, &range.start, &range.end);
        tlb_gather_mmu(&tlb, vma->vm_mm);
        update_hiwater_rss(vma->vm_mm);
        mmu_notifier_invalidate_range_start(&range);
@@ -1761,6 +1768,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
        unmap_single_vma(&tlb, vma, address, end, details, false);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb);
+       hugetlb_zap_end(vma, details);
 }
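
hugetlb_zap_begin()/hugetlb_zap_end() fold the old open-coded adjust_range_if_pmd_sharing_possible() call together with the hugetlb VMA lock and i_mmap_rwsem, so both are held across the zap (closing the MADV_DONTNEED vs. page-fault race). A simplified sketch of what the begin side does for a hugetlb VMA; the real helpers in mm/hugetlb.c also handle error VMAs and the final-unmap path:

	adjust_range_if_pmd_sharing_possible(vma, start, end);
	hugetlb_vma_lock_write(vma);
	i_mmap_lock_write(vma->vm_file->f_mapping);
	/* ... caller zaps ... hugetlb_zap_end() later drops both locks. */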
 
 /**
@@ -1865,7 +1873,6 @@ out:
        return retval;
 }
 
-#ifdef pte_index
 static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
 {
@@ -1880,7 +1887,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
 }
 
 /* insert_pages() amortizes the cost of spinlock operations
- * when inserting pages in a loop. Arch *must* define pte_index.
+ * when inserting pages in a loop.
  */
 static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num, pgprot_t prot)
@@ -1939,7 +1946,6 @@ out:
        *num = remaining_pages_total;
        return ret;
 }
-#endif  /* ifdef pte_index */
 
 /**
  * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
@@ -1959,7 +1965,6 @@ out:
 int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num)
 {
-#ifdef pte_index
        const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
 
        if (addr < vma->vm_start || end_addr >= vma->vm_end)
@@ -1971,18 +1976,6 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
        }
        /* Defer page refcount checking till we're about to map that page. */
        return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
-#else
-       unsigned long idx = 0, pgcount = *num;
-       int err = -EINVAL;
-
-       for (; idx < pgcount; ++idx) {
-               err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
-               if (err)
-                       break;
-       }
-       *num = pgcount - idx;
-       return err;
-#endif  /* ifdef pte_index */
 }
 EXPORT_SYMBOL(vm_insert_pages);
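
With pte_index available on every architecture, the per-page fallback is gone and all callers take the batched path. A hedged usage sketch; the driver name and buffer bookkeeping are hypothetical:

/* Hypothetical ->mmap() helper: insert npages driver pages in one batch. */
static int my_drv_mmap_bufs(struct vm_area_struct *vma, struct page **pages,
			    unsigned long npages)
{
	unsigned long num = npages;
	int err = vm_insert_pages(vma, vma->vm_start, pages, &num);

	/* On return, num holds how many pages were NOT inserted. */
	if (err)
		pr_warn("vm_insert_pages left %lu of %lu pages, err %d\n",
			num, npages, err);
	return err;
}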
 
@@ -2858,7 +2851,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
 
                entry = pte_mkyoung(vmf->orig_pte);
                if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
-                       update_mmu_cache(vma, addr, vmf->pte);
+                       update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
        }
 
        /*
@@ -2927,10 +2920,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
  *
  * We do this without the lock held, so that it can sleep if it needs to.
  */
-static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
 {
        vm_fault_t ret;
-       struct page *page = vmf->page;
        unsigned int old_flags = vmf->flags;
 
        vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
@@ -2945,14 +2937,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                return ret;
        if (unlikely(!(ret & VM_FAULT_LOCKED))) {
-               lock_page(page);
-               if (!page->mapping) {
-                       unlock_page(page);
+               folio_lock(folio);
+               if (!folio->mapping) {
+                       folio_unlock(folio);
                        return 0; /* retry */
                }
                ret |= VM_FAULT_LOCKED;
        } else
-               VM_BUG_ON_PAGE(!PageLocked(page), page);
+               VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        return ret;
 }
 
@@ -2965,20 +2957,20 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
        struct address_space *mapping;
-       struct page *page = vmf->page;
+       struct folio *folio = page_folio(vmf->page);
        bool dirtied;
        bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
 
-       dirtied = set_page_dirty(page);
-       VM_BUG_ON_PAGE(PageAnon(page), page);
+       dirtied = folio_mark_dirty(folio);
+       VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
        /*
-        * Take a local copy of the address_space - page.mapping may be zeroed
-        * by truncate after unlock_page().   The address_space itself remains
-        * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
+        * Take a local copy of the address_space - folio.mapping may be zeroed
+        * by truncate after folio_unlock().   The address_space itself remains
+        * pinned by vma->vm_file's reference.  We rely on folio_unlock()'s
         * release semantics to prevent the compiler from undoing this copying.
         */
-       mapping = page_rmapping(page);
-       unlock_page(page);
+       mapping = folio_raw_mapping(folio);
+       folio_unlock(folio);
 
        if (!page_mkwrite)
                file_update_time(vma->vm_file);
@@ -3036,7 +3028,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
        entry = pte_mkyoung(vmf->orig_pte);
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
-               update_mmu_cache(vma, vmf->address, vmf->pte);
+               update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        count_vm_event(PGREUSE);
 }
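
update_mmu_cache_range() takes the number of PTEs being mapped so architectures can maintain their caches for a whole folio at once; the single-PTE update_mmu_cache() survives as a thin wrapper. If memory serves, the generic fallback is roughly:

#define update_mmu_cache(vma, addr, ptep) \
	update_mmu_cache_range(NULL, vma, addr, ptep, 1)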
@@ -3128,6 +3120,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                                inc_mm_counter(mm, MM_ANONPAGES);
                        }
                } else {
+                       ksm_might_unmap_zero_page(mm, vmf->orig_pte);
                        inc_mm_counter(mm, MM_ANONPAGES);
                }
                flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
@@ -3149,7 +3142,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                 * that left a window where the new PTE could be loaded into
                 * some TLBs while the old PTE remains in others.
                 */
-               ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+               ptep_clear_flush(vma, vmf->address, vmf->pte);
                folio_add_new_anon_rmap(new_folio, vma, vmf->address);
                folio_add_lru_vma(new_folio, vma);
                /*
@@ -3159,7 +3152,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                 */
                BUG_ON(unshare && pte_write(entry));
                set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
-               update_mmu_cache(vma, vmf->address, vmf->pte);
+               update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
                if (old_folio) {
                        /*
                         * Only after switching the pte to the new page may
@@ -3195,11 +3188,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        }
 
-       /*
-        * No need to double call mmu_notifier->invalidate_range() callback as
-        * the above ptep_clear_flush_notify() did already call it.
-        */
-       mmu_notifier_invalidate_range_only_end(&range);
+       mmu_notifier_invalidate_range_end(&range);
 
        if (new_folio)
                folio_put(new_folio);
@@ -3269,6 +3258,11 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
                vm_fault_t ret;
 
                pte_unmap_unlock(vmf->pte, vmf->ptl);
+               if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+                       vma_end_read(vmf->vma);
+                       return VM_FAULT_RETRY;
+               }
+
                vmf->flags |= FAULT_FLAG_MKWRITE;
                ret = vma->vm_ops->pfn_mkwrite(vmf);
                if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
@@ -3279,36 +3273,42 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
        return 0;
 }
 
-static vm_fault_t wp_page_shared(struct vm_fault *vmf)
+static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
        __releases(vmf->ptl)
 {
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret = 0;
 
-       get_page(vmf->page);
+       folio_get(folio);
 
        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                vm_fault_t tmp;
 
                pte_unmap_unlock(vmf->pte, vmf->ptl);
-               tmp = do_page_mkwrite(vmf);
+               if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+                       folio_put(folio);
+                       vma_end_read(vmf->vma);
+                       return VM_FAULT_RETRY;
+               }
+
+               tmp = do_page_mkwrite(vmf, folio);
                if (unlikely(!tmp || (tmp &
                                      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
-                       put_page(vmf->page);
+                       folio_put(folio);
                        return tmp;
                }
                tmp = finish_mkwrite_fault(vmf);
                if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
-                       unlock_page(vmf->page);
-                       put_page(vmf->page);
+                       folio_unlock(folio);
+                       folio_put(folio);
                        return tmp;
                }
        } else {
                wp_page_reuse(vmf);
-               lock_page(vmf->page);
+               folio_lock(folio);
        }
        ret |= fault_dirty_shared_page(vmf);
-       put_page(vmf->page);
+       folio_put(folio);
 
        return ret;
 }
@@ -3359,6 +3359,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 
        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
 
+       if (vmf->page)
+               folio = page_folio(vmf->page);
+
        /*
         * Shared mapping: we are guaranteed to have VM_WRITE and
         * FAULT_FLAG_WRITE set at this point.
@@ -3373,12 +3376,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
                 */
                if (!vmf->page)
                        return wp_pfn_shared(vmf);
-               return wp_page_shared(vmf);
+               return wp_page_shared(vmf, folio);
        }
 
-       if (vmf->page)
-               folio = page_folio(vmf->page);
-
        /*
         * Private mapping: create an exclusive anonymous page copy if reuse
         * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
@@ -3432,6 +3432,12 @@ reuse:
                return 0;
        }
 copy:
+       if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) {
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
+               vma_end_read(vmf->vma);
+               return VM_FAULT_RETRY;
+       }
+
        /*
         * Ok, we need to copy. Oh, well..
         */
@@ -3495,7 +3501,7 @@ void unmap_mapping_folio(struct folio *folio)
        VM_BUG_ON(!folio_test_locked(folio));
 
        first_index = folio->index;
-       last_index = folio->index + folio_nr_pages(folio) - 1;
+       last_index = folio_next_index(folio) - 1;
 
        details.even_cows = false;
        details.single_folio = folio;
@@ -3582,6 +3588,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
        struct folio *folio = page_folio(vmf->page);
        struct vm_area_struct *vma = vmf->vma;
        struct mmu_notifier_range range;
+       vm_fault_t ret;
 
        /*
         * We need a reference to lock the folio because we don't hold
@@ -3594,9 +3601,10 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
        if (!folio_try_get(folio))
                return 0;
 
-       if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
+       ret = folio_lock_or_retry(folio, vmf);
+       if (ret) {
                folio_put(folio);
-               return VM_FAULT_RETRY;
+               return ret;
        }
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
                                vma->vm_mm, vmf->address & PAGE_MASK,
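
folio_lock_or_retry() now returns a vm_fault_t (0 on success, VM_FAULT_RETRY when it had to drop the mmap_lock or per-VMA lock) instead of a bool, and takes the vm_fault so it knows which lock is held. A paraphrase, from memory, of the pagemap.h wrapper:

static inline vm_fault_t folio_lock_or_retry(struct folio *folio,
					     struct vm_fault *vmf)
{
	might_sleep();
	if (!folio_trylock(folio))
		return __folio_lock_or_retry(folio, vmf);
	return 0;
}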
@@ -3647,7 +3655,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
         * none pte.  Otherwise it means the pte could have changed, so retry.
         *
         * This should also cover the case where e.g. the pte changed
-        * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
+        * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
         * So is_pte_marker() check is not enough to safely drop the pte.
         */
        if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
@@ -3693,8 +3701,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
                return VM_FAULT_SIGBUS;
 
        /* Higher priority than uffd-wp when data corrupted */
-       if (marker & PTE_MARKER_SWAPIN_ERROR)
-               return VM_FAULT_SIGBUS;
+       if (marker & PTE_MARKER_POISONED)
+               return VM_FAULT_HWPOISON;
 
        if (pte_marker_entry_uffd_wp(entry))
                return pte_marker_handle_uffd_wp(vmf);
@@ -3721,18 +3729,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        bool exclusive = false;
        swp_entry_t entry;
        pte_t pte;
-       int locked;
        vm_fault_t ret = 0;
        void *shadow = NULL;
 
        if (!pte_unmap_same(vmf))
                goto out;
 
-       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-               ret = VM_FAULT_RETRY;
-               goto out;
-       }
-
        entry = pte_to_swp_entry(vmf->orig_pte);
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
@@ -3742,6 +3744,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                        vmf->page = pfn_swap_entry_to_page(entry);
                        ret = remove_device_exclusive_entry(vmf);
                } else if (is_device_private_entry(entry)) {
+                       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+                               /*
+                                * migrate_to_ram is not yet ready to operate
+                                * under VMA lock.
+                                */
+                               vma_end_read(vma);
+                               ret = VM_FAULT_RETRY;
+                               goto out;
+                       }
+
                        vmf->page = pfn_swap_entry_to_page(entry);
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
@@ -3805,7 +3817,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                                folio_add_lru(folio);
 
                                /* To provide entry to swap_readpage() */
-                               folio_set_swap_entry(folio, entry);
+                               folio->swap = entry;
                                swap_readpage(page, true, NULL);
                                folio->private = NULL;
                        }
@@ -3843,12 +3855,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                goto out_release;
        }
 
-       locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags);
-
-       if (!locked) {
-               ret |= VM_FAULT_RETRY;
+       ret |= folio_lock_or_retry(folio, vmf);
+       if (ret & VM_FAULT_RETRY)
                goto out_release;
-       }
 
        if (swapcache) {
                /*
@@ -3859,7 +3868,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                 * changed.
                 */
                if (unlikely(!folio_test_swapcache(folio) ||
-                            page_private(page) != entry.val))
+                            page_swap_entry(page).val != entry.val))
                        goto out_page;
 
                /*
@@ -4026,7 +4035,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        }
 
        /* No need to invalidate - it was non-present before */
-       update_mmu_cache(vma, vmf->address, vmf->pte);
+       update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
 unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4119,7 +4128,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
        entry = mk_pte(&folio->page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (vma->vm_flags & VM_WRITE)
-               entry = pte_mkwrite(pte_mkdirty(entry));
+               entry = pte_mkwrite(pte_mkdirty(entry), vma);
 
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
@@ -4150,7 +4159,7 @@ setpte:
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
        /* No need to invalidate - it was non-present before */
-       update_mmu_cache(vma, vmf->address, vmf->pte);
+       update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
 unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4245,7 +4254,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t entry;
-       int i;
        vm_fault_t ret = VM_FAULT_FALLBACK;
 
        if (!transhuge_vma_suitable(vma, haddr))
@@ -4278,8 +4286,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
        if (unlikely(!pmd_none(*vmf->pmd)))
                goto out;
 
-       for (i = 0; i < HPAGE_PMD_NR; i++)
-               flush_icache_page(vma, page + i);
+       flush_icache_pages(vma, page, HPAGE_PMD_NR);
 
        entry = mk_huge_pmd(page, vma->vm_page_prot);
        if (write)
@@ -4312,15 +4319,24 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 }
 #endif
 
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+/**
+ * set_pte_range - Set a range of PTEs to point to pages in a folio.
+ * @vmf: Fault description.
+ * @folio: The folio that contains @page.
+ * @page: The first page to create a PTE for.
+ * @nr: The number of PTEs to create.
+ * @addr: The first address to create a PTE for.
+ */
+void set_pte_range(struct vm_fault *vmf, struct folio *folio,
+               struct page *page, unsigned int nr, unsigned long addr)
 {
        struct vm_area_struct *vma = vmf->vma;
        bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
        bool write = vmf->flags & FAULT_FLAG_WRITE;
-       bool prefault = vmf->address != addr;
+       bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE);
        pte_t entry;
 
-       flush_icache_page(vma, page);
+       flush_icache_pages(vma, page, nr);
        entry = mk_pte(page, vma->vm_page_prot);
 
        if (prefault && arch_wants_old_prefaulted_pte())
@@ -4334,14 +4350,18 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
                entry = pte_mkuffd_wp(entry);
        /* copy-on-write page */
        if (write && !(vma->vm_flags & VM_SHARED)) {
-               inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-               page_add_new_anon_rmap(page, vma, addr);
-               lru_cache_add_inactive_or_unevictable(page, vma);
+               add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
+               VM_BUG_ON_FOLIO(nr != 1, folio);
+               folio_add_new_anon_rmap(folio, vma, addr);
+               folio_add_lru_vma(folio, vma);
        } else {
-               inc_mm_counter(vma->vm_mm, mm_counter_file(page));
-               page_add_file_rmap(page, vma, false);
+               add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+               folio_add_file_rmap_range(folio, page, nr, vma, false);
        }
-       set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
+       set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
+
+       /* no need to invalidate: a not-present page won't be cached */
+       update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
 }
 
 static bool vmf_pte_changed(struct vm_fault *vmf)
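
set_pte_range() exists so the fault-around path can map several pages of one folio under a single PTL acquisition. A hedged fragment, loosely modeled on the filemap fault-around code; folio, start, count and addr are assumed to come from the surrounding loop:

	struct page *page = folio_page(folio, start);

	set_pte_range(vmf, folio, page, count, addr);
	folio_ref_add(folio, count);	/* one reference per mapped page */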
@@ -4409,11 +4429,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 
        /* Re-check under ptl */
        if (likely(!vmf_pte_changed(vmf))) {
-               do_set_pte(vmf, page, vmf->address);
-
-               /* no need to invalidate: a not-present page won't be cached */
-               update_mmu_cache(vma, vmf->address, vmf->pte);
+               struct folio *folio = page_folio(page);
 
+               set_pte_range(vmf, folio, page, 1, vmf->address);
                ret = 0;
        } else {
                update_mmu_tlb(vma, vmf->address, vmf->pte);
@@ -4532,6 +4550,7 @@ static inline bool should_fault_around(struct vm_fault *vmf)
 static vm_fault_t do_read_fault(struct vm_fault *vmf)
 {
        vm_fault_t ret = 0;
+       struct folio *folio;
 
        /*
         * Let's call ->map_pages() first and use ->fault() as fallback
@@ -4544,14 +4563,20 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
                        return ret;
        }
 
+       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+               vma_end_read(vmf->vma);
+               return VM_FAULT_RETRY;
+       }
+
        ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
        ret |= finish_fault(vmf);
-       unlock_page(vmf->page);
+       folio = page_folio(vmf->page);
+       folio_unlock(folio);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
-               put_page(vmf->page);
+               folio_put(folio);
        return ret;
 }
 
@@ -4560,6 +4585,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;
 
+       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+               vma_end_read(vma);
+               return VM_FAULT_RETRY;
+       }
+
        if (unlikely(anon_vma_prepare(vma)))
                return VM_FAULT_OOM;
 
@@ -4598,21 +4628,29 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret, tmp;
+       struct folio *folio;
+
+       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+               vma_end_read(vma);
+               return VM_FAULT_RETRY;
+       }
 
        ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
+       folio = page_folio(vmf->page);
+
        /*
         * Check if the backing address space wants to know that the page is
         * about to become writable
         */
        if (vma->vm_ops->page_mkwrite) {
-               unlock_page(vmf->page);
-               tmp = do_page_mkwrite(vmf);
+               folio_unlock(folio);
+               tmp = do_page_mkwrite(vmf, folio);
                if (unlikely(!tmp ||
                                (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
-                       put_page(vmf->page);
+                       folio_put(folio);
                        return tmp;
                }
        }
@@ -4620,8 +4658,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
        ret |= finish_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                        VM_FAULT_RETRY))) {
-               unlock_page(vmf->page);
-               put_page(vmf->page);
+               folio_unlock(folio);
+               folio_put(folio);
                return ret;
        }
 
@@ -4808,45 +4846,47 @@ out_map:
        pte = pte_modify(old_pte, vma->vm_page_prot);
        pte = pte_mkyoung(pte);
        if (writable)
-               pte = pte_mkwrite(pte);
+               pte = pte_mkwrite(pte, vma);
        ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
-       update_mmu_cache(vma, vmf->address, vmf->pte);
+       update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        goto out;
 }
 
 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 {
-       if (vma_is_anonymous(vmf->vma))
+       struct vm_area_struct *vma = vmf->vma;
+       if (vma_is_anonymous(vma))
                return do_huge_pmd_anonymous_page(vmf);
-       if (vmf->vma->vm_ops->huge_fault)
-               return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+       if (vma->vm_ops->huge_fault)
+               return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
        return VM_FAULT_FALLBACK;
 }
 
 /* `inline' is required to avoid gcc 4.1.2 build error */
 static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 {
+       struct vm_area_struct *vma = vmf->vma;
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        vm_fault_t ret;
 
-       if (vma_is_anonymous(vmf->vma)) {
+       if (vma_is_anonymous(vma)) {
                if (likely(!unshare) &&
-                   userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
+                   userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd))
                        return handle_userfault(vmf, VM_UFFD_WP);
                return do_huge_pmd_wp_page(vmf);
        }
 
-       if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
-               if (vmf->vma->vm_ops->huge_fault) {
-                       ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+       if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+               if (vma->vm_ops->huge_fault) {
+                       ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
                        if (!(ret & VM_FAULT_FALLBACK))
                                return ret;
                }
        }
 
        /* COW or write-notify handled on pte level: split pmd. */
-       __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
+       __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
 
        return VM_FAULT_FALLBACK;
 }
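
->huge_fault() now receives the page-table order directly (PMD_ORDER, PUD_ORDER) instead of the old enum page_entry_size. A hypothetical handler showing the new prototype; the my_fs_* helpers are made up for illustration:

static vm_fault_t my_fs_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	switch (order) {
	case 0:
		return my_fs_pte_fault(vmf);
	case PMD_ORDER:
		return my_fs_pmd_fault(vmf);
	default:
		return VM_FAULT_FALLBACK;	/* let the core split and retry */
	}
}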
@@ -4855,11 +4895,12 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
 {
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                    \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+       struct vm_area_struct *vma = vmf->vma;
        /* No support for anonymous transparent PUD pages yet */
-       if (vma_is_anonymous(vmf->vma))
+       if (vma_is_anonymous(vma))
                return VM_FAULT_FALLBACK;
-       if (vmf->vma->vm_ops->huge_fault)
-               return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+       if (vma->vm_ops->huge_fault)
+               return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
        return VM_FAULT_FALLBACK;
 }
@@ -4868,21 +4909,22 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
 {
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                    \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+       struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;
 
        /* No support for anonymous transparent PUD pages yet */
-       if (vma_is_anonymous(vmf->vma))
+       if (vma_is_anonymous(vma))
                goto split;
-       if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
-               if (vmf->vma->vm_ops->huge_fault) {
-                       ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+       if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+               if (vma->vm_ops->huge_fault) {
+                       ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
                        if (!(ret & VM_FAULT_FALLBACK))
                                return ret;
                }
        }
 split:
        /* COW or write-notify not handled on PUD level: split pud.*/
-       __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
+       __split_huge_pud(vma, vmf->pud, vmf->address);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
        return VM_FAULT_FALLBACK;
 }
@@ -4959,7 +5001,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
        entry = pte_mkyoung(entry);
        if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
                                vmf->flags & FAULT_FLAG_WRITE)) {
-               update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
+               update_mmu_cache_range(vmf, vmf->vma, vmf->address,
+                               vmf->pte, 1);
        } else {
                /* Skip spurious TLB flush for retried page fault */
                if (vmf->flags & FAULT_FLAG_TRIED)
@@ -4980,10 +5023,10 @@ unlock:
 }
 
 /*
- * By the time we get here, we already hold the mm semaphore
- *
- * The mmap_lock may have been released depending on flags and our
- * return value.  See filemap_fault() and __folio_lock_or_retry().
+ * On entry, we hold either the VMA lock or the mmap_lock
+ * (FAULT_FLAG_VMA_LOCK tells you which).  If VM_FAULT_RETRY is set in
+ * the result, the mmap_lock is not held on exit.  See filemap_fault()
+ * and __folio_lock_or_retry().
  */
 static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
@@ -5081,7 +5124,7 @@ retry_pud:
 
 /**
  * mm_account_fault - Do page fault accounting
- *
+ * @mm: mm from which memcg should be extracted. It can be NULL.
  * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
  *        of perf event counters, but we'll still do the per-task accounting to
  *        the task who triggered this page fault.
@@ -5189,6 +5232,17 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
                                 !is_cow_mapping(vma->vm_flags)))
                        return VM_FAULT_SIGSEGV;
        }
+#ifdef CONFIG_PER_VMA_LOCK
+       /*
+        * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of
+        * the assumption that lock is dropped on VM_FAULT_RETRY.
+        */
+       if (WARN_ON_ONCE((*flags &
+                       (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) ==
+                       (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)))
+               return VM_FAULT_SIGSEGV;
+#endif
+
        return 0;
 }
 
@@ -5257,11 +5311,8 @@ EXPORT_SYMBOL_GPL(handle_mm_fault);
 
 static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
 {
-       /* Even if this succeeds, make it clear we *might* have slept */
-       if (likely(mmap_read_trylock(mm))) {
-               might_sleep();
+       if (likely(mmap_read_trylock(mm)))
                return true;
-       }
 
        if (regs && !user_mode(regs)) {
                unsigned long ip = instruction_pointer(regs);
@@ -5389,10 +5440,6 @@ retry:
        if (!vma)
                goto inval;
 
-       /* Only anonymous and tcp vmas are supported for now */
-       if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
-               goto inval;
-
        if (!vma_start_read(vma))
                goto inval;
 
@@ -5402,14 +5449,7 @@ retry:
         * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
         * from its anon_vma.
         */
-       if (unlikely(!vma->anon_vma && !vma_is_tcp(vma)))
-               goto inval_end_read;
-
-       /*
-        * Due to the possibility of userfault handler dropping mmap_lock, avoid
-        * it for now and fall back to page fault handling under mmap_lock.
-        */
-       if (userfaultfd_armed(vma))
+       if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
                goto inval_end_read;
 
        /* Check since vm_start/vm_end might change before we lock the VMA */
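
Dropping the anonymous/tcp-only restriction lets lock_vma_under_rcu() hand file-backed VMAs to the fault path, which is why the file fault handlers above bail out with VM_FAULT_RETRY for cases they cannot yet handle locklessly. Roughly the pattern the arch fault handlers use, paraphrased from memory of the x86 path:

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;		/* fall back to mmap_lock */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);
	if (!(fault & VM_FAULT_RETRY))
		goto done;
	/* On retry the VMA lock has already been released; take mmap_lock. */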
@@ -6062,19 +6102,19 @@ void __init ptlock_cache_init(void)
                        SLAB_PANIC, NULL);
 }
 
-bool ptlock_alloc(struct page *page)
+bool ptlock_alloc(struct ptdesc *ptdesc)
 {
        spinlock_t *ptl;
 
        ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
        if (!ptl)
                return false;
-       page->ptl = ptl;
+       ptdesc->ptl = ptl;
        return true;
 }
 
-void ptlock_free(struct page *page)
+void ptlock_free(struct ptdesc *ptdesc)
 {
-       kmem_cache_free(page_ptl_cachep, page->ptl);
+       kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
 }
 #endif
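
With page-table metadata moving into struct ptdesc (a typed overlay of struct page), the split-PTL pointer lives in ptdesc->ptl and callers convert with page_ptdesc()/ptdesc_page(). A hedged sketch of how the pieces fit together at init time; the real helper lives in include/linux/mm.h:

	struct ptdesc *ptdesc = page_ptdesc(page);	/* page -> ptdesc view */

	if (!ptlock_alloc(ptdesc))
		return false;
	spin_lock_init(ptdesc->ptl);
	return true;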