diff --git a/mm/memory.c b/mm/memory.c
index 1ec1ef3..517221f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -77,7 +77,6 @@
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
 #include <linux/sched/sysctl.h>
-#include <linux/net_mm.h>
 
 #include <trace/events/kmem.h>
 
@@ -361,12 +360,10 @@ void free_pgd_range(struct mmu_gather *tlb,
        } while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                   struct vm_area_struct *vma, unsigned long floor,
                   unsigned long ceiling, bool mm_wr_locked)
 {
-       MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
-
        do {
                unsigned long addr = vma->vm_start;
                struct vm_area_struct *next;
@@ -375,7 +372,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
                 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
                 * be 0.  This will underflow and is okay.
                 */
-               next = mas_find(&mas, ceiling - 1);
+               next = mas_find(mas, ceiling - 1);
 
                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
@@ -396,7 +393,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
                        while (next && next->vm_start <= vma->vm_end + PMD_SIZE
                               && !is_vm_hugetlb_page(next)) {
                                vma = next;
-                               next = mas_find(&mas, ceiling - 1);
+                               next = mas_find(mas, ceiling - 1);
                                if (mm_wr_locked)
                                        vma_start_write(vma);
                                unlink_anon_vmas(vma);
@@ -860,8 +857,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                        return -EBUSY;
                return -ENOENT;
        } else if (is_pte_marker_entry(entry)) {
-               if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma))
-                       set_pte_at(dst_mm, addr, dst_pte, pte);
+               pte_marker marker = copy_pte_marker(entry, dst_vma);
+
+               if (marker)
+                       set_pte_at(dst_mm, addr, dst_pte,
+                                  make_pte_marker(marker));
                return 0;
        }
        if (!userfaultfd_wp(dst_vma))
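
For context: the new copy_pte_marker() helper, added to include/linux/swapops.h by the same series, decides which marker bits survive fork(). A paraphrased sketch from memory, not the authoritative definition:

static inline pte_marker copy_pte_marker(swp_entry_t entry,
					 struct vm_area_struct *dst_vma)
{
	pte_marker srcm = pte_marker_get(entry);
	/* Poison markers (e.g. swap-in errors) are always propagated. */
	pte_marker dstm = srcm & PTE_MARKER_POISONED;

	/* uffd-wp markers are copied only if the destination VMA has uffd-wp armed. */
	if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
		dstm |= PTE_MARKER_UFFD_WP;

	return dstm;
}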
@@ -1312,7 +1312,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
                 * Use the raw variant of the seqcount_t write API to avoid
                 * lockdep complaining about preemptibility.
                 */
-               mmap_assert_write_locked(src_mm);
+               vma_assert_write_locked(src_vma);
                raw_write_seqcount_begin(&src_mm->write_protect_seq);
        }
 
@@ -1430,11 +1430,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                continue;
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
+                       arch_check_zapped_pte(vma, ptent);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                        zap_install_uffd_wp_if_needed(vma, addr, pte, details,
                                                      ptent);
-                       if (unlikely(!page))
+                       if (unlikely(!page)) {
+                               ksm_might_unmap_zero_page(mm, ptent);
                                continue;
+                       }
 
                        delay_rmap = 0;
                        if (!PageAnon(page)) {
@@ -1500,7 +1503,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                            !zap_drop_file_uffd_wp(details))
                                continue;
                } else if (is_hwpoison_entry(entry) ||
-                          is_swapin_error_entry(entry)) {
+                          is_poisoned_swp_entry(entry)) {
                        if (!should_zap_cows(details))
                                continue;
                } else {
@@ -1680,7 +1683,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
                        if (vma->vm_file) {
                                zap_flags_t zap_flags = details ?
                                    details->zap_flags : 0;
-                               __unmap_hugepage_range_final(tlb, vma, start, end,
+                               __unmap_hugepage_range(tlb, vma, start, end,
                                                             NULL, zap_flags);
                        }
                } else
@@ -1691,10 +1694,12 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
  * @tlb: address of the caller's struct mmu_gather
- * @mt: the maple tree
+ * @mas: the maple state
  * @vma: the starting vma
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
+ * @tree_end: The maximum index to check
+ * @mm_wr_locked: true if the mmap_lock is held for writing
  *
  * Unmap all pages in the vma list.
  *
@@ -1707,9 +1712,10 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
  * drops the lock and schedules.
  */
-void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
+void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                struct vm_area_struct *vma, unsigned long start_addr,
-               unsigned long end_addr, bool mm_wr_locked)
+               unsigned long end_addr, unsigned long tree_end,
+               bool mm_wr_locked)
 {
        struct mmu_notifier_range range;
        struct zap_details details = {
@@ -1717,15 +1723,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
                /* Careful - we need to zap private pages too! */
                .even_cows = true,
        };
-       MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
 
        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
                                start_addr, end_addr);
        mmu_notifier_invalidate_range_start(&range);
        do {
-               unmap_single_vma(tlb, vma, start_addr, end_addr, &details,
+               unsigned long start = start_addr;
+               unsigned long end = end_addr;
+               hugetlb_zap_begin(vma, &start, &end);
+               unmap_single_vma(tlb, vma, start, end, &details,
                                 mm_wr_locked);
-       } while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
+               hugetlb_zap_end(vma, &details);
+       } while ((vma = mas_find(mas, tree_end - 1)) != NULL);
        mmu_notifier_invalidate_range_end(&range);
 }
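
The maple state and the tree_end bound now come from the caller, so one state can drive both unmap_vmas() and the following free_pgtables() walk. A loose sketch of the calling pattern, assuming mm and tlb are already set up; the variable names and the floor/ceiling choice are illustrative, not copied from mm/mmap.c:

	MA_STATE(mas, &mm->mm_mt, vma->vm_end, vma->vm_end);

	unmap_vmas(&tlb, &mas, vma, start, end, tree_end, mm_wr_locked);
	mas_set(&mas, vma->vm_end);	/* rewind the state for the second walk */
	free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
		      USER_PGTABLES_CEILING, mm_wr_locked);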
 
@@ -1748,9 +1757,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
        lru_add_drain();
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, end);
-       if (is_vm_hugetlb_page(vma))
-               adjust_range_if_pmd_sharing_possible(vma, &range.start,
-                                                    &range.end);
+       hugetlb_zap_begin(vma, &range.start, &range.end);
        tlb_gather_mmu(&tlb, vma->vm_mm);
        update_hiwater_rss(vma->vm_mm);
        mmu_notifier_invalidate_range_start(&range);
@@ -1761,6 +1768,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
        unmap_single_vma(&tlb, vma, address, end, details, false);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb);
+       hugetlb_zap_end(vma, details);
 }
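
hugetlb_zap_begin()/hugetlb_zap_end() fold the old open-coded adjust_range_if_pmd_sharing_possible() call together with the hugetlb VMA lock and i_mmap_rwsem, so both are held across the zap (closing the MADV_DONTNEED vs. page-fault race). A simplified sketch of what the begin side does for a hugetlb VMA; the real helpers in mm/hugetlb.c also handle error VMAs and the final-unmap path:

	adjust_range_if_pmd_sharing_possible(vma, start, end);
	hugetlb_vma_lock_write(vma);
	i_mmap_lock_write(vma->vm_file->f_mapping);
	/* ... caller zaps ... hugetlb_zap_end() later drops both locks. */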
 
 /**
@@ -1865,7 +1873,6 @@ out:
        return retval;
 }
 
-#ifdef pte_index
 static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
 {
@@ -1880,7 +1887,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
 }
 
 /* insert_pages() amortizes the cost of spinlock operations
- * when inserting pages in a loop. Arch *must* define pte_index.
+ * when inserting pages in a loop.
  */
 static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num, pgprot_t prot)
@@ -1939,7 +1946,6 @@ out:
        *num = remaining_pages_total;
        return ret;
 }
-#endif  /* ifdef pte_index */
 
 /**
  * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
@@ -1959,7 +1965,6 @@ out:
 int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num)
 {
-#ifdef pte_index
        const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
 
        if (addr < vma->vm_start || end_addr >= vma->vm_end)
@@ -1971,18 +1976,6 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
        }
        /* Defer page refcount checking till we're about to map that page. */
        return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
-#else
-       unsigned long idx = 0, pgcount = *num;
-       int err = -EINVAL;
-
-       for (; idx < pgcount; ++idx) {
-               err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
-               if (err)
-                       break;
-       }
-       *num = pgcount - idx;
-       return err;
-#endif  /* ifdef pte_index */
 }
 EXPORT_SYMBOL(vm_insert_pages);
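
With pte_index available on every architecture, the per-page fallback is gone and all callers take the batched path. A hedged usage sketch; the driver name and buffer bookkeeping are hypothetical:

/* Hypothetical ->mmap() helper: insert npages driver pages in one batch. */
static int my_drv_mmap_bufs(struct vm_area_struct *vma, struct page **pages,
			    unsigned long npages)
{
	unsigned long num = npages;
	int err = vm_insert_pages(vma, vma->vm_start, pages, &num);

	/* On return, num holds how many pages were NOT inserted. */
	if (err)
		pr_warn("vm_insert_pages left %lu of %lu pages, err %d\n",
			num, npages, err);
	return err;
}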
 
@@ -2858,7 +2851,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
 
                entry = pte_mkyoung(vmf->orig_pte);
                if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
-                       update_mmu_cache(vma, addr, vmf->pte);
+                       update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
        }
 
        /*
@@ -2927,10 +2920,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
  *
  * We do this without the lock held, so that it can sleep if it needs to.
  */
-static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
 {
        vm_fault_t ret;
-       struct page *page = vmf->page;
        unsigned int old_flags = vmf->flags;
 
        vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
@@ -2945,14 +2937,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                return ret;
        if (unlikely(!(ret & VM_FAULT_LOCKED))) {
-               lock_page(page);
-               if (!page->mapping) {
-                       unlock_page(page);
+               folio_lock(folio);
+               if (!folio->mapping) {
+                       folio_unlock(folio);
                        return 0; /* retry */
                }
                ret |= VM_FAULT_LOCKED;
        } else
-               VM_BUG_ON_PAGE(!PageLocked(page), page);
+               VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        return ret;
 }
 
@@ -2965,20 +2957,20 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
        struct address_space *mapping;
-       struct page *page = vmf->page;
+       struct folio *folio = page_folio(vmf->page);
        bool dirtied;
        bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
 
-       dirtied = set_page_dirty(page);
-       VM_BUG_ON_PAGE(PageAnon(page), page);
+       dirtied = folio_mark_dirty(folio);
+       VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
        /*
-        * Take a local copy of the address_space - page.mapping may be zeroed
-        * by truncate after unlock_page().   The address_space itself remains
-        * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
+        * Take a local copy of the address_space - folio.mapping may be zeroed
+        * by truncate after folio_unlock().   The address_space itself remains
+        * pinned by vma->vm_file's reference.  We rely on folio_unlock()'s
         * release semantics to prevent the compiler from undoing this copying.
         */
-       mapping = page_rmapping(page);
-       unlock_page(page);
+       mapping = folio_raw_mapping(folio);
+       folio_unlock(folio);
 
        if (!page_mkwrite)
                file_update_time(vma->vm_file);
@@ -3036,7 +3028,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
        entry = pte_mkyoung(vmf->orig_pte);
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
-               update_mmu_cache(vma, vmf->address, vmf->pte);
+               update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        count_vm_event(PGREUSE);
 }
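
update_mmu_cache_range() takes the number of PTEs being mapped so architectures can maintain their caches for a whole folio at once; the single-PTE update_mmu_cache() survives as a thin wrapper. If memory serves, the generic fallback is roughly:

#define update_mmu_cache(vma, addr, ptep) \
	update_mmu_cache_range(NULL, vma, addr, ptep, 1)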
@@ -3128,6 +3120,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                                inc_mm_counter(mm, MM_ANONPAGES);
                        }
                } else {
+                       ksm_might_unmap_zero_page(mm, vmf->orig_pte);
                        inc_mm_counter(mm, MM_ANONPAGES);
                }
                flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
@@ -3149,7 +3142,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                 * that left a window where the new PTE could be loaded into
                 * some TLBs while the old PTE remains in others.
                 */
-               ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+               ptep_clear_flush(vma, vmf->address, vmf->pte);
                folio_add_new_anon_rmap(new_folio, vma, vmf->address);
                folio_add_lru_vma(new_folio, vma);
                /*
@@ -3159,7 +3152,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                 */
                BUG_ON(unshare && pte_write(entry));
                set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
-               update_mmu_cache(vma, vmf->address, vmf->pte);
+               update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
                if (old_folio) {
                        /*
                         * Only after switching the pte to the new page may
@@ -3195,11 +3188,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        }
 
-       /*
-        * No need to double call mmu_notifier->invalidate_range() callback as
-        * the above ptep_clear_flush_notify() did already call it.
-        */
-       mmu_notifier_invalidate_range_only_end(&range);
+       mmu_notifier_invalidate_range_end(&range);
 
        if (new_folio)
                folio_put(new_folio);
@@ -3269,6 +3258,11 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
                vm_fault_t ret;
 
                pte_unmap_unlock(vmf->pte, vmf->ptl);
+               if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+                       vma_end_read(vmf->vma);
+                       return VM_FAULT_RETRY;
+               }
+
                vmf->flags |= FAULT_FLAG_MKWRITE;
                ret = vma->vm_ops->pfn_mkwrite(vmf);
                if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
@@ -3279,36 +3273,42 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
        return 0;
 }
 
-static vm_fault_t wp_page_shared(struct vm_fault *vmf)
+static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
        __releases(vmf->ptl)
 {
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret = 0;
 
-       get_page(vmf->page);
+       folio_get(folio);
 
        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                vm_fault_t tmp;
 
                pte_unmap_unlock(vmf->pte, vmf->ptl);
-               tmp = do_page_mkwrite(vmf);
+               if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+                       folio_put(folio);
+                       vma_end_read(vmf->vma);
+                       return VM_FAULT_RETRY;
+               }
+
+               tmp = do_page_mkwrite(vmf, folio);
                if (unlikely(!tmp || (tmp &
                                      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
-                       put_page(vmf->page);
+                       folio_put(folio);
                        return tmp;
                }
                tmp = finish_mkwrite_fault(vmf);
                if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
-                       unlock_page(vmf->page);
-                       put_page(vmf->page);
+                       folio_unlock(folio);
+                       folio_put(folio);
                        return tmp;
                }
        } else {
                wp_page_reuse(vmf);
-               lock_page(vmf->page);
+               folio_lock(folio);
        }
        ret |= fault_dirty_shared_page(vmf);
-       put_page(vmf->page);
+       folio_put(folio);
 
        return ret;
 }
@@ -3359,6 +3359,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 
        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
 
+       if (vmf->page)
+               folio = page_folio(vmf->page);
+
        /*
         * Shared mapping: we are guaranteed to have VM_WRITE and
         * FAULT_FLAG_WRITE set at this point.
@@ -3373,12 +3376,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
                 */
                if (!vmf->page)
                        return wp_pfn_shared(vmf);
-               return wp_page_shared(vmf);
+               return wp_page_shared(vmf, folio);
        }
 
-       if (vmf->page)
-               folio = page_folio(vmf->page);
-
        /*
         * Private mapping: create an exclusive anonymous page copy if reuse
         * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
@@ -3432,6 +3432,12 @@ reuse:
                return 0;
        }
 copy:
+       if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) {
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
+               vma_end_read(vmf->vma);
+               return VM_FAULT_RETRY;
+       }
+
        /*
         * Ok, we need to copy. Oh, well..
         */
@@ -3495,7 +3501,7 @@ void unmap_mapping_folio(struct folio *folio)
        VM_BUG_ON(!folio_test_locked(folio));
 
        first_index = folio->index;
-       last_index = folio->index + folio_nr_pages(folio) - 1;
+       last_index = folio_next_index(folio) - 1;
 
        details.even_cows = false;
        details.single_folio = folio;
@@ -3582,6 +3588,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
        struct folio *folio = page_folio(vmf->page);
        struct vm_area_struct *vma = vmf->vma;
        struct mmu_notifier_range range;
+       vm_fault_t ret;
 
        /*
         * We need a reference to lock the folio because we don't hold
@@ -3594,9 +3601,10 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
        if (!folio_try_get(folio))
                return 0;
 
-       if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
+       ret = folio_lock_or_retry(folio, vmf);
+       if (ret) {
                folio_put(folio);
-               return VM_FAULT_RETRY;
+               return ret;
        }
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
                                vma->vm_mm, vmf->address & PAGE_MASK,
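
folio_lock_or_retry() now returns a vm_fault_t (0 on success, VM_FAULT_RETRY when it had to drop the mmap_lock or per-VMA lock) instead of a bool, and takes the vm_fault so it knows which lock is held. A paraphrase, from memory, of the pagemap.h wrapper:

static inline vm_fault_t folio_lock_or_retry(struct folio *folio,
					     struct vm_fault *vmf)
{
	might_sleep();
	if (!folio_trylock(folio))
		return __folio_lock_or_retry(folio, vmf);
	return 0;
}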
@@ -3647,7 +3655,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
         * none pte.  Otherwise it means the pte could have changed, so retry.
         *
         * This should also cover the case where e.g. the pte changed
-        * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
+        * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
         * So is_pte_marker() check is not enough to safely drop the pte.
         */
        if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
@@ -3693,8 +3701,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
                return VM_FAULT_SIGBUS;
 
        /* Higher priority than uffd-wp when data corrupted */
-       if (marker & PTE_MARKER_SWAPIN_ERROR)
-               return VM_FAULT_SIGBUS;
+       if (marker & PTE_MARKER_POISONED)
+               return VM_FAULT_HWPOISON;
 
        if (pte_marker_entry_uffd_wp(entry))
                return pte_marker_handle_uffd_wp(vmf);
@@ -3721,18 +3729,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        bool exclusive = false;
        swp_entry_t entry;
        pte_t pte;
-       int locked;
        vm_fault_t ret = 0;
        void *shadow = NULL;
 
        if (!pte_unmap_same(vmf))
                goto out;
 
-       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-               ret = VM_FAULT_RETRY;
-               goto out;
-       }
-
        entry = pte_to_swp_entry(vmf->orig_pte);
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
@@ -3742,6 +3744,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                        vmf->page = pfn_swap_entry_to_page(entry);
                        ret = remove_device_exclusive_entry(vmf);
                } else if (is_device_private_entry(entry)) {
+                       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+                               /*
+                                * migrate_to_ram is not yet ready to operate
+                                * under VMA lock.
+                                */
+                               vma_end_read(vma);
+                               ret = VM_FAULT_RETRY;
+                               goto out;
+                       }
+
                        vmf->page = pfn_swap_entry_to_page(entry);
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
@@ -3805,7 +3817,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                                folio_add_lru(folio);
 
                                /* To provide entry to swap_readpage() */
-                               folio_set_swap_entry(folio, entry);
+                               folio->swap = entry;
                                swap_readpage(page, true, NULL);
                                folio->private = NULL;
                        }
@@ -3843,12 +3855,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                goto out_release;
        }
 
-       locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags);
-
-       if (!locked) {
-               ret |= VM_FAULT_RETRY;
+       ret |= folio_lock_or_retry(folio, vmf);
+       if (ret & VM_FAULT_RETRY)
                goto out_release;
-       }
 
        if (swapcache) {
                /*
@@ -3859,7 +3868,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                 * changed.
                 */
                if (unlikely(!folio_test_swapcache(folio) ||
-                            page_private(page) != entry.val))
+                            page_swap_entry(page).val != entry.val))
                        goto out_page;
 
                /*
@@ -4026,7 +4035,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        }
 
        /* No need to invalidate - it was non-present before */
-       update_mmu_cache(vma, vmf->address, vmf->pte);
+       update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
 unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4119,7 +4128,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
        entry = mk_pte(&folio->page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (vma->vm_flags & VM_WRITE)
-               entry = pte_mkwrite(pte_mkdirty(entry));
+               entry = pte_mkwrite(pte_mkdirty(entry), vma);
 
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
@@ -4150,7 +4159,7 @@ setpte:
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
        /* No need to invalidate - it was non-present before */
-       update_mmu_cache(vma, vmf->address, vmf->pte);
+       update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
 unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4245,7 +4254,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t entry;
-       int i;
        vm_fault_t ret = VM_FAULT_FALLBACK;
 
        if (!transhuge_vma_suitable(vma, haddr))
@@ -4278,8 +4286,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
        if (unlikely(!pmd_none(*vmf->pmd)))
                goto out;
 
-       for (i = 0; i < HPAGE_PMD_NR; i++)
-               flush_icache_page(vma, page + i);
+       flush_icache_pages(vma, page, HPAGE_PMD_NR);
 
        entry = mk_huge_pmd(page, vma->vm_page_prot);
        if (write)
@@ -4312,15 +4319,24 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 }
 #endif
 
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+/**
+ * set_pte_range - Set a range of PTEs to point to pages in a folio.
+ * @vmf: Fault description.
+ * @folio: The folio that contains @page.
+ * @page: The first page to create a PTE for.
+ * @nr: The number of PTEs to create.
+ * @addr: The first address to create a PTE for.
+ */
+void set_pte_range(struct vm_fault *vmf, struct folio *folio,
+               struct page *page, unsigned int nr, unsigned long addr)
 {
        struct vm_area_struct *vma = vmf->vma;
        bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
        bool write = vmf->flags & FAULT_FLAG_WRITE;
-       bool prefault = vmf->address != addr;
+       bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE);
        pte_t entry;
 
-       flush_icache_page(vma, page);
+       flush_icache_pages(vma, page, nr);
        entry = mk_pte(page, vma->vm_page_prot);
 
        if (prefault && arch_wants_old_prefaulted_pte())
@@ -4334,14 +4350,18 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
                entry = pte_mkuffd_wp(entry);
        /* copy-on-write page */
        if (write && !(vma->vm_flags & VM_SHARED)) {
-               inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-               page_add_new_anon_rmap(page, vma, addr);
-               lru_cache_add_inactive_or_unevictable(page, vma);
+               add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
+               VM_BUG_ON_FOLIO(nr != 1, folio);
+               folio_add_new_anon_rmap(folio, vma, addr);
+               folio_add_lru_vma(folio, vma);
        } else {
-               inc_mm_counter(vma->vm_mm, mm_counter_file(page));
-               page_add_file_rmap(page, vma, false);
+               add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+               folio_add_file_rmap_range(folio, page, nr, vma, false);
        }
-       set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
+       set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
+
+       /* no need to invalidate: a not-present page won't be cached */
+       update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
 }
 
 static bool vmf_pte_changed(struct vm_fault *vmf)
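
set_pte_range() exists so the fault-around path can map several pages of one folio under a single PTL acquisition. A hedged fragment, loosely modeled on the filemap fault-around code; folio, start, count and addr are assumed to come from the surrounding loop:

	struct page *page = folio_page(folio, start);

	set_pte_range(vmf, folio, page, count, addr);
	folio_ref_add(folio, count);	/* one reference per mapped page */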
@@ -4409,11 +4429,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 
        /* Re-check under ptl */
        if (likely(!vmf_pte_changed(vmf))) {
-               do_set_pte(vmf, page, vmf->address);
-
-               /* no need to invalidate: a not-present page won't be cached */
-               update_mmu_cache(vma, vmf->address, vmf->pte);
+               struct folio *folio = page_folio(page);
 
+               set_pte_range(vmf, folio, page, 1, vmf->address);
                ret = 0;
        } else {
                update_mmu_tlb(vma, vmf->address, vmf->pte);
@@ -4532,6 +4550,7 @@ static inline bool should_fault_around(struct vm_fault *vmf)
 static vm_fault_t do_read_fault(struct vm_fault *vmf)
 {
        vm_fault_t ret = 0;
+       struct folio *folio;
 
        /*
         * Let's call ->map_pages() first and use ->fault() as fallback
@@ -4544,14 +4563,20 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
                        return ret;
        }
 
+       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+               vma_end_read(vmf->vma);
+               return VM_FAULT_RETRY;
+       }
+
        ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
        ret |= finish_fault(vmf);
-       unlock_page(vmf->page);
+       folio = page_folio(vmf->page);
+       folio_unlock(folio);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
-               put_page(vmf->page);
+               folio_put(folio);
        return ret;
 }
 
@@ -4560,6 +4585,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;
 
+       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+               vma_end_read(vma);
+               return VM_FAULT_RETRY;
+       }
+
        if (unlikely(anon_vma_prepare(vma)))
                return VM_FAULT_OOM;
 
@@ -4598,21 +4628,29 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret, tmp;
+       struct folio *folio;
+
+       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+               vma_end_read(vma);
+               return VM_FAULT_RETRY;
+       }
 
        ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
+       folio = page_folio(vmf->page);
+
        /*
         * Check if the backing address space wants to know that the page is
         * about to become writable
         */
        if (vma->vm_ops->page_mkwrite) {
-               unlock_page(vmf->page);
-               tmp = do_page_mkwrite(vmf);
+               folio_unlock(folio);
+               tmp = do_page_mkwrite(vmf, folio);
                if (unlikely(!tmp ||
                                (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
-                       put_page(vmf->page);
+                       folio_put(folio);
                        return tmp;
                }
        }
@@ -4620,8 +4658,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
        ret |= finish_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                        VM_FAULT_RETRY))) {
-               unlock_page(vmf->page);
-               put_page(vmf->page);
+               folio_unlock(folio);
+               folio_put(folio);
                return ret;
        }
 
@@ -4808,45 +4846,47 @@ out_map:
        pte = pte_modify(old_pte, vma->vm_page_prot);
        pte = pte_mkyoung(pte);
        if (writable)
-               pte = pte_mkwrite(pte);
+               pte = pte_mkwrite(pte, vma);
        ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
-       update_mmu_cache(vma, vmf->address, vmf->pte);
+       update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        goto out;
 }
 
 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 {
-       if (vma_is_anonymous(vmf->vma))
+       struct vm_area_struct *vma = vmf->vma;
+       if (vma_is_anonymous(vma))
                return do_huge_pmd_anonymous_page(vmf);
-       if (vmf->vma->vm_ops->huge_fault)
-               return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+       if (vma->vm_ops->huge_fault)
+               return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
        return VM_FAULT_FALLBACK;
 }
 
 /* `inline' is required to avoid gcc 4.1.2 build error */
 static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 {
+       struct vm_area_struct *vma = vmf->vma;
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        vm_fault_t ret;
 
-       if (vma_is_anonymous(vmf->vma)) {
+       if (vma_is_anonymous(vma)) {
                if (likely(!unshare) &&
-                   userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
+                   userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd))
                        return handle_userfault(vmf, VM_UFFD_WP);
                return do_huge_pmd_wp_page(vmf);
        }
 
-       if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
-               if (vmf->vma->vm_ops->huge_fault) {
-                       ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+       if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+               if (vma->vm_ops->huge_fault) {
+                       ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
                        if (!(ret & VM_FAULT_FALLBACK))
                                return ret;
                }
        }
 
        /* COW or write-notify handled on pte level: split pmd. */
-       __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
+       __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
 
        return VM_FAULT_FALLBACK;
 }
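
->huge_fault() now receives the page-table order directly (PMD_ORDER, PUD_ORDER) instead of the old enum page_entry_size. A hypothetical handler showing the new prototype; the my_fs_* helpers are made up for illustration:

static vm_fault_t my_fs_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	switch (order) {
	case 0:
		return my_fs_pte_fault(vmf);
	case PMD_ORDER:
		return my_fs_pmd_fault(vmf);
	default:
		return VM_FAULT_FALLBACK;	/* let the core split and retry */
	}
}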
@@ -4855,11 +4895,12 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
 {
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                    \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+       struct vm_area_struct *vma = vmf->vma;
        /* No support for anonymous transparent PUD pages yet */
-       if (vma_is_anonymous(vmf->vma))
+       if (vma_is_anonymous(vma))
                return VM_FAULT_FALLBACK;
-       if (vmf->vma->vm_ops->huge_fault)
-               return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+       if (vma->vm_ops->huge_fault)
+               return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
        return VM_FAULT_FALLBACK;
 }
@@ -4868,21 +4909,22 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
 {
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                    \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+       struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;
 
        /* No support for anonymous transparent PUD pages yet */
-       if (vma_is_anonymous(vmf->vma))
+       if (vma_is_anonymous(vma))
                goto split;
-       if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
-               if (vmf->vma->vm_ops->huge_fault) {
-                       ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+       if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+               if (vma->vm_ops->huge_fault) {
+                       ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
                        if (!(ret & VM_FAULT_FALLBACK))
                                return ret;
                }
        }
 split:
        /* COW or write-notify not handled on PUD level: split pud.*/
-       __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
+       __split_huge_pud(vma, vmf->pud, vmf->address);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
        return VM_FAULT_FALLBACK;
 }
@@ -4959,7 +5001,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
        entry = pte_mkyoung(entry);
        if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
                                vmf->flags & FAULT_FLAG_WRITE)) {
-               update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
+               update_mmu_cache_range(vmf, vmf->vma, vmf->address,
+                               vmf->pte, 1);
        } else {
                /* Skip spurious TLB flush for retried page fault */
                if (vmf->flags & FAULT_FLAG_TRIED)
@@ -4980,10 +5023,10 @@ unlock:
 }
 
 /*
- * By the time we get here, we already hold the mm semaphore
- *
- * The mmap_lock may have been released depending on flags and our
- * return value.  See filemap_fault() and __folio_lock_or_retry().
+ * On entry, we hold either the VMA lock or the mmap_lock
+ * (FAULT_FLAG_VMA_LOCK tells you which).  If VM_FAULT_RETRY is set in
+ * the result, the mmap_lock is not held on exit.  See filemap_fault()
+ * and __folio_lock_or_retry().
  */
 static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
@@ -5081,7 +5124,7 @@ retry_pud:
 
 /**
  * mm_account_fault - Do page fault accounting
- *
+ * @mm: mm from which memcg should be extracted. It can be NULL.
  * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
  *        of perf event counters, but we'll still do the per-task accounting to
  *        the task who triggered this page fault.
@@ -5189,6 +5232,17 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
                                 !is_cow_mapping(vma->vm_flags)))
                        return VM_FAULT_SIGSEGV;
        }
+#ifdef CONFIG_PER_VMA_LOCK
+       /*
+        * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of
+        * the assumption that lock is dropped on VM_FAULT_RETRY.
+        */
+       if (WARN_ON_ONCE((*flags &
+                       (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) ==
+                       (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)))
+               return VM_FAULT_SIGSEGV;
+#endif
+
        return 0;
 }
 
@@ -5257,11 +5311,8 @@ EXPORT_SYMBOL_GPL(handle_mm_fault);
 
 static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
 {
-       /* Even if this succeeds, make it clear we *might* have slept */
-       if (likely(mmap_read_trylock(mm))) {
-               might_sleep();
+       if (likely(mmap_read_trylock(mm)))
                return true;
-       }
 
        if (regs && !user_mode(regs)) {
                unsigned long ip = instruction_pointer(regs);
@@ -5389,10 +5440,6 @@ retry:
        if (!vma)
                goto inval;
 
-       /* Only anonymous and tcp vmas are supported for now */
-       if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
-               goto inval;
-
        if (!vma_start_read(vma))
                goto inval;
 
@@ -5402,14 +5449,7 @@ retry:
         * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
         * from its anon_vma.
         */
-       if (unlikely(!vma->anon_vma && !vma_is_tcp(vma)))
-               goto inval_end_read;
-
-       /*
-        * Due to the possibility of userfault handler dropping mmap_lock, avoid
-        * it for now and fall back to page fault handling under mmap_lock.
-        */
-       if (userfaultfd_armed(vma))
+       if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
                goto inval_end_read;
 
        /* Check since vm_start/vm_end might change before we lock the VMA */
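
Dropping the anonymous/tcp-only restriction lets lock_vma_under_rcu() hand file-backed VMAs to the fault path, which is why the file fault handlers above bail out with VM_FAULT_RETRY for cases they cannot yet handle locklessly. Roughly the pattern the arch fault handlers use, paraphrased from memory of the x86 path:

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;		/* fall back to mmap_lock */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);
	if (!(fault & VM_FAULT_RETRY))
		goto done;
	/* On retry the VMA lock has already been released; take mmap_lock. */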
@@ -6062,19 +6102,19 @@ void __init ptlock_cache_init(void)
                        SLAB_PANIC, NULL);
 }
 
-bool ptlock_alloc(struct page *page)
+bool ptlock_alloc(struct ptdesc *ptdesc)
 {
        spinlock_t *ptl;
 
        ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
        if (!ptl)
                return false;
-       page->ptl = ptl;
+       ptdesc->ptl = ptl;
        return true;
 }
 
-void ptlock_free(struct page *page)
+void ptlock_free(struct ptdesc *ptdesc)
 {
-       kmem_cache_free(page_ptl_cachep, page->ptl);
+       kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
 }
 #endif
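
With page-table metadata moving into struct ptdesc (a typed overlay of struct page), the split-PTL pointer lives in ptdesc->ptl and callers convert with page_ptdesc()/ptdesc_page(). A hedged sketch of how the pieces fit together at init time; the real helper lives in include/linux/mm.h:

	struct ptdesc *ptdesc = page_ptdesc(page);	/* page -> ptdesc view */

	if (!ptlock_alloc(ptdesc))
		return false;
	spin_lock_init(ptdesc->ptl);
	return true;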