Merge tag 'mm-hotfixes-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kerne...
[platform/kernel/linux-rpi.git] mm/hugetlb.c
index 410bbb0..7c468ac 100644
@@ -370,7 +370,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
 }
 
 static inline long
-hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
+hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
                     long to, struct hstate *h, struct hugetlb_cgroup *cg,
                     long *regions_needed)
 {
@@ -379,7 +379,7 @@ hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
        if (!regions_needed) {
                nrg = get_file_region_entry_from_cache(map, from, to);
                record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
-               list_add(&nrg->link, rg->link.prev);
+               list_add(&nrg->link, rg);
                coalesce_file_region(map, nrg);
        } else
                *regions_needed += 1;
@@ -402,47 +402,52 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
        long add = 0;
        struct list_head *head = &resv->regions;
        long last_accounted_offset = f;
-       struct file_region *rg = NULL, *trg = NULL;
+       struct file_region *iter, *trg = NULL;
+       struct list_head *rg = NULL;
 
        if (regions_needed)
                *regions_needed = 0;
 
        /* In this loop, we essentially handle an entry for the range
-        * [last_accounted_offset, rg->from), at every iteration, with some
+        * [last_accounted_offset, iter->from), at every iteration, with some
         * bounds checking.
         */
-       list_for_each_entry_safe(rg, trg, head, link) {
+       list_for_each_entry_safe(iter, trg, head, link) {
                /* Skip irrelevant regions that start before our range. */
-               if (rg->from < f) {
+               if (iter->from < f) {
                        /* If this region ends after the last accounted offset,
                         * then we need to update last_accounted_offset.
                         */
-                       if (rg->to > last_accounted_offset)
-                               last_accounted_offset = rg->to;
+                       if (iter->to > last_accounted_offset)
+                               last_accounted_offset = iter->to;
                        continue;
                }
 
                /* When we find a region that starts beyond our range, we've
                 * finished.
                 */
-               if (rg->from >= t)
+               if (iter->from >= t) {
+                       rg = iter->link.prev;
                        break;
+               }
 
-               /* Add an entry for last_accounted_offset -> rg->from, and
+               /* Add an entry for last_accounted_offset -> iter->from, and
                 * update last_accounted_offset.
                 */
-               if (rg->from > last_accounted_offset)
-                       add += hugetlb_resv_map_add(resv, rg,
+               if (iter->from > last_accounted_offset)
+                       add += hugetlb_resv_map_add(resv, iter->link.prev,
                                                    last_accounted_offset,
-                                                   rg->from, h, h_cg,
+                                                   iter->from, h, h_cg,
                                                    regions_needed);
 
-               last_accounted_offset = rg->to;
+               last_accounted_offset = iter->to;
        }
 
        /* Handle the case where our range extends beyond
         * last_accounted_offset.
         */
+       if (!rg)
+               rg = head->prev;
        if (last_accounted_offset < t)
                add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
                                            t, h, h_cg, regions_needed);
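
The two hunks above stop using the list_for_each_entry_safe() iterator once the loop has finished (at which point it no longer points at a real file_region) and instead record a plain struct list_head * insertion point from inside the loop, falling back to the list tail when the loop runs to completion. A minimal stand-alone userspace sketch of the same pattern (plain C with a hand-rolled circular list; none of these names exist in the kernel):

#include <stdio.h>
#include <stdlib.h>

struct node {
	long from;
	struct node *prev, *next;	/* circular list with a dummy head */
};

/* Insert "new" right after "pos", mirroring the kernel's list_add(). */
static void node_add(struct node *new, struct node *pos)
{
	new->prev = pos;
	new->next = pos->next;
	pos->next->prev = new;
	pos->next = new;
}

int main(void)
{
	struct node head = { .from = -1, .prev = &head, .next = &head };
	struct node a = { .from = 10 }, b = { .from = 30 };
	struct node *iter, *pos = NULL, *new;
	long t = 20;

	node_add(&a, &head);			/* list: 10 */
	node_add(&b, &a);			/* list: 10, 30 */

	/* Record the insertion point while still inside the loop ... */
	for (iter = head.next; iter != &head; iter = iter->next) {
		if (iter->from >= t) {
			pos = iter->prev;
			break;
		}
	}
	/* ... and fall back to the tail if the loop ran to completion. */
	if (!pos)
		pos = head.prev;

	new = malloc(sizeof(*new));
	if (!new)
		return 1;
	new->from = t;
	node_add(new, pos);

	for (iter = head.next; iter != &head; iter = iter->next)
		printf("%ld\n", iter->from);	/* prints 10, 20, 30 */
	free(new);
	return 0;
}
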
@@ -1535,7 +1540,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
 
-       if (alloc_huge_page_vmemmap(h, page)) {
+       if (hugetlb_vmemmap_alloc(h, page)) {
                spin_lock_irq(&hugetlb_lock);
                /*
                 * If we cannot allocate vmemmap pages, just refuse to free the
@@ -1612,7 +1617,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
 
 static inline void flush_free_hpage_work(struct hstate *h)
 {
-       if (free_vmemmap_pages_per_hpage(h))
+       if (hugetlb_optimize_vmemmap_pages(h))
                flush_work(&free_hpage_work);
 }
 
@@ -1672,6 +1677,8 @@ void free_huge_page(struct page *page)
        VM_BUG_ON_PAGE(page_mapcount(page), page);
 
        hugetlb_set_page_subpool(page, NULL);
+       if (PageAnon(page))
+               __ClearPageAnonExclusive(page);
        page->mapping = NULL;
        restore_reserve = HPageRestoreReserve(page);
        ClearHPageRestoreReserve(page);
@@ -1732,7 +1739,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
 
 static void __prep_new_huge_page(struct hstate *h, struct page *page)
 {
-       free_huge_page_vmemmap(h, page);
+       hugetlb_vmemmap_free(h, page);
        INIT_LIST_HEAD(&page->lru);
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        hugetlb_set_page_subpool(page, NULL);
@@ -2105,7 +2112,7 @@ retry:
                 * Attempt to allocate vmemmap here so that we can take
                 * appropriate action on failure.
                 */
-               rc = alloc_huge_page_vmemmap(h, head);
+               rc = hugetlb_vmemmap_alloc(h, head);
                if (!rc) {
                        /*
                         * Move PageHWPoison flag from head page to the raw
@@ -2979,8 +2986,6 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
        struct huge_bootmem_page *m = NULL; /* initialize for clang */
        int nr_nodes, node;
 
-       if (nid != NUMA_NO_NODE && nid >= nr_online_nodes)
-               return 0;
        /* do node specific alloc */
        if (nid != NUMA_NO_NODE) {
                m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
@@ -3088,7 +3093,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
        }
 
        /* do node specific alloc */
-       for (i = 0; i < nr_online_nodes; i++) {
+       for_each_online_node(i) {
                if (h->max_huge_pages_node[i] > 0) {
                        hugetlb_hstate_alloc_pages_onenode(h, i);
                        node_specific_alloc = true;
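
The loops converted to for_each_online_node() here and in later hunks rely on online node IDs not being contiguous: counting up to nr_online_nodes can visit offline IDs and skip online ones. A small userspace illustration of the difference (the mask and names are invented; MAX_NODES stands in for MAX_NUMNODES):

#include <stdio.h>

#define MAX_NODES 8

int main(void)
{
	/* Pretend nodes 0, 2 and 3 are online, so the online count is 3. */
	int online[MAX_NODES] = { [0] = 1, [2] = 1, [3] = 1 };
	int nr_online = 3, i;

	printf("by count:");
	for (i = 0; i < nr_online; i++)
		printf(" %d", i);		/* 0 1 2 -- wrong set */

	printf("\nby mask: ");
	for (i = 0; i < MAX_NODES; i++)		/* what for_each_online_node() does */
		if (online[i])
			printf(" %d", i);	/* 0 2 3 -- correct */
	printf("\n");
	return 0;
}
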
@@ -3420,7 +3425,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
        remove_hugetlb_page_for_demote(h, page, false);
        spin_unlock_irq(&hugetlb_lock);
 
-       rc = alloc_huge_page_vmemmap(h, page);
+       rc = hugetlb_vmemmap_alloc(h, page);
        if (rc) {
                /* Allocation of vmemmap failed, we cannot demote the page */
                spin_lock_irq(&hugetlb_lock);
@@ -4052,7 +4057,7 @@ static int __init hugetlb_init(void)
                        default_hstate.max_huge_pages =
                                default_hstate_max_huge_pages;
 
-                       for (i = 0; i < nr_online_nodes; i++)
+                       for_each_online_node(i)
                                default_hstate.max_huge_pages_node[i] =
                                        default_hugepages_in_node[i];
                }
@@ -4119,6 +4124,20 @@ bool __init __weak hugetlb_node_alloc_supported(void)
 {
        return true;
 }
+
+static void __init hugepages_clear_pages_in_node(void)
+{
+       if (!hugetlb_max_hstate) {
+               default_hstate_max_huge_pages = 0;
+               memset(default_hugepages_in_node, 0,
+                       MAX_NUMNODES * sizeof(unsigned int));
+       } else {
+               parsed_hstate->max_huge_pages = 0;
+               memset(parsed_hstate->max_huge_pages_node, 0,
+                       MAX_NUMNODES * sizeof(unsigned int));
+       }
+}
+
 /*
  * hugepages command line processing
  * hugepages normally follows a valid hugepagesz or default_hugepagesz
@@ -4138,7 +4157,7 @@ static int __init hugepages_setup(char *s)
        if (!parsed_valid_hugepagesz) {
                pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
                parsed_valid_hugepagesz = true;
-               return 0;
+               return 1;
        }
 
        /*
@@ -4154,7 +4173,7 @@ static int __init hugepages_setup(char *s)
 
        if (mhp == last_mhp) {
                pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
-               return 0;
+               return 1;
        }
 
        while (*p) {
@@ -4165,11 +4184,11 @@ static int __init hugepages_setup(char *s)
                if (p[count] == ':') {
                        if (!hugetlb_node_alloc_supported()) {
                                pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
-                               return 0;
+                               return 1;
                        }
-                       if (tmp >= nr_online_nodes)
+                       if (tmp >= MAX_NUMNODES || !node_online(tmp))
                                goto invalid;
-                       node = array_index_nospec(tmp, nr_online_nodes);
+                       node = array_index_nospec(tmp, MAX_NUMNODES);
                        p += count + 1;
                        /* Parse hugepages */
                        if (sscanf(p, "%lu%n", &tmp, &count) != 1)
@@ -4206,7 +4225,8 @@ static int __init hugepages_setup(char *s)
 
 invalid:
        pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
-       return 0;
+       hugepages_clear_pages_in_node();
+       return 1;
 }
 __setup("hugepages=", hugepages_setup);
 
@@ -4227,7 +4247,7 @@ static int __init hugepagesz_setup(char *s)
 
        if (!arch_hugetlb_valid_size(size)) {
                pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
-               return 0;
+               return 1;
        }
 
        h = size_to_hstate(size);
@@ -4242,7 +4262,7 @@ static int __init hugepagesz_setup(char *s)
                if (!parsed_default_hugepagesz ||  h != &default_hstate ||
                    default_hstate.max_huge_pages) {
                        pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
-                       return 0;
+                       return 1;
                }
 
                /*
@@ -4273,14 +4293,14 @@ static int __init default_hugepagesz_setup(char *s)
        parsed_valid_hugepagesz = false;
        if (parsed_default_hugepagesz) {
                pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
-               return 0;
+               return 1;
        }
 
        size = (unsigned long)memparse(s, NULL);
 
        if (!arch_hugetlb_valid_size(size)) {
                pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
-               return 0;
+               return 1;
        }
 
        hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
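
The return-value changes in these __setup() handlers (0 becomes 1) follow the usual convention that a non-zero return marks the boot parameter as consumed, even when its value is only warned about and ignored, whereas returning 0 leaves it to be reported as an unknown parameter and passed on to init. A kernel-style sketch of that convention, not runnable on its own (example_setup and example_opt are invented names):

static int __init example_setup(char *s)
{
	unsigned long val;

	if (kstrtoul(s, 0, &val)) {
		pr_warn("example_opt: invalid value '%s', ignoring\n", s);
		return 1;	/* still consumed, just ignored */
	}
	/* ... record val somewhere ... */
	return 1;		/* handled */
}
__setup("example_opt=", example_setup);
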
@@ -4297,7 +4317,7 @@ static int __init default_hugepagesz_setup(char *s)
         */
        if (default_hstate_max_huge_pages) {
                default_hstate.max_huge_pages = default_hstate_max_huge_pages;
-               for (i = 0; i < nr_online_nodes; i++)
+               for_each_online_node(i)
                        default_hstate.max_huge_pages_node[i] =
                                default_hugepages_in_node[i];
                if (hstate_is_gigantic(&default_hstate))
@@ -4699,24 +4719,27 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
 }
 
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-                           struct vm_area_struct *vma)
+                           struct vm_area_struct *dst_vma,
+                           struct vm_area_struct *src_vma)
 {
        pte_t *src_pte, *dst_pte, entry, dst_entry;
        struct page *ptepage;
        unsigned long addr;
-       bool cow = is_cow_mapping(vma->vm_flags);
-       struct hstate *h = hstate_vma(vma);
+       bool cow = is_cow_mapping(src_vma->vm_flags);
+       struct hstate *h = hstate_vma(src_vma);
        unsigned long sz = huge_page_size(h);
        unsigned long npages = pages_per_huge_page(h);
-       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct address_space *mapping = src_vma->vm_file->f_mapping;
        struct mmu_notifier_range range;
        int ret = 0;
 
        if (cow) {
-               mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
-                                       vma->vm_start,
-                                       vma->vm_end);
+               mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
+                                       src_vma->vm_start,
+                                       src_vma->vm_end);
                mmu_notifier_invalidate_range_start(&range);
+               mmap_assert_write_locked(src);
+               raw_write_seqcount_begin(&src->write_protect_seq);
        } else {
                /*
                 * For shared mappings i_mmap_rwsem must be held to call
@@ -4727,12 +4750,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                i_mmap_lock_read(mapping);
        }
 
-       for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+       for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
                spinlock_t *src_ptl, *dst_ptl;
                src_pte = huge_pte_offset(src, addr, sz);
                if (!src_pte)
                        continue;
-               dst_pte = huge_pte_alloc(dst, vma, addr, sz);
+               dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
                        break;
@@ -4767,8 +4790,9 @@ again:
                } else if (unlikely(is_hugetlb_entry_migration(entry) ||
                                    is_hugetlb_entry_hwpoisoned(entry))) {
                        swp_entry_t swp_entry = pte_to_swp_entry(entry);
+                       bool uffd_wp = huge_pte_uffd_wp(entry);
 
-                       if (is_writable_migration_entry(swp_entry) && cow) {
+                       if (!is_readable_migration_entry(swp_entry) && cow) {
                                /*
                                 * COW mappings require pages in both
                                 * parent and child to be set to read.
@@ -4776,38 +4800,53 @@ again:
                                swp_entry = make_readable_migration_entry(
                                                        swp_offset(swp_entry));
                                entry = swp_entry_to_pte(swp_entry);
+                               if (userfaultfd_wp(src_vma) && uffd_wp)
+                                       entry = huge_pte_mkuffd_wp(entry);
                                set_huge_swap_pte_at(src, addr, src_pte,
                                                     entry, sz);
                        }
+                       if (!userfaultfd_wp(dst_vma) && uffd_wp)
+                               entry = huge_pte_clear_uffd_wp(entry);
                        set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+               } else if (unlikely(is_pte_marker(entry))) {
+                       /*
+                        * We copy the pte marker only if the dst vma has
+                        * uffd-wp enabled.
+                        */
+                       if (userfaultfd_wp(dst_vma))
+                               set_huge_pte_at(dst, addr, dst_pte, entry);
                } else {
                        entry = huge_ptep_get(src_pte);
                        ptepage = pte_page(entry);
                        get_page(ptepage);
 
                        /*
-                        * This is a rare case where we see pinned hugetlb
-                        * pages while they're prone to COW.  We need to do the
-                        * COW earlier during fork.
+                        * Failing to duplicate the anon rmap is a rare case
+                        * where we see pinned hugetlb pages while they're
+                        * prone to COW. We need to do the COW earlier during
+                        * fork.
                         *
                         * When pre-allocating the page or copying data, we
                         * need to be without the pgtable locks since we could
                         * sleep during the process.
                         */
-                       if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
+                       if (!PageAnon(ptepage)) {
+                               page_dup_file_rmap(ptepage, true);
+                       } else if (page_try_dup_anon_rmap(ptepage, true,
+                                                         src_vma)) {
                                pte_t src_pte_old = entry;
                                struct page *new;
 
                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                /* Do not use reserve as it's private owned */
-                               new = alloc_huge_page(vma, addr, 1);
+                               new = alloc_huge_page(dst_vma, addr, 1);
                                if (IS_ERR(new)) {
                                        put_page(ptepage);
                                        ret = PTR_ERR(new);
                                        break;
                                }
-                               copy_user_huge_page(new, ptepage, addr, vma,
+                               copy_user_huge_page(new, ptepage, addr, dst_vma,
                                                    npages);
                                put_page(ptepage);
 
@@ -4817,13 +4856,13 @@ again:
                                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                                entry = huge_ptep_get(src_pte);
                                if (!pte_same(src_pte_old, entry)) {
-                                       restore_reserve_on_error(h, vma, addr,
+                                       restore_reserve_on_error(h, dst_vma, addr,
                                                                new);
                                        put_page(new);
                                        /* dst_entry won't change as in child */
                                        goto again;
                                }
-                               hugetlb_install_page(vma, dst_pte, addr, new);
+                               hugetlb_install_page(dst_vma, dst_pte, addr, new);
                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                continue;
@@ -4841,7 +4880,6 @@ again:
                                entry = huge_pte_wrprotect(entry);
                        }
 
-                       page_dup_rmap(ptepage, true);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                        hugetlb_count_add(npages, dst);
                }
@@ -4849,10 +4887,12 @@ again:
                spin_unlock(dst_ptl);
        }
 
-       if (cow)
+       if (cow) {
+               raw_write_seqcount_end(&src->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
-       else
+       } else {
                i_mmap_unlock_read(mapping);
+       }
 
        return ret;
 }
@@ -4896,10 +4936,17 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
        unsigned long old_addr_copy;
        pte_t *src_pte, *dst_pte;
        struct mmu_notifier_range range;
+       bool shared_pmd = false;
 
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
                                old_end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+       /*
+        * In case of shared PMDs, we should cover the maximum possible
+        * range.
+        */
+       flush_cache_range(vma, range.start, range.end);
+
        mmu_notifier_invalidate_range_start(&range);
        /* Prevent race with file truncation */
        i_mmap_lock_write(mapping);
@@ -4916,8 +4963,10 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
                 */
                old_addr_copy = old_addr;
 
-               if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
+               if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
+                       shared_pmd = true;
                        continue;
+               }
 
                dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
                if (!dst_pte)
@@ -4925,7 +4974,11 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 
                move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
        }
-       flush_tlb_range(vma, old_end - len, old_end);
+
+       if (shared_pmd)
+               flush_tlb_range(vma, range.start, range.end);
+       else
+               flush_tlb_range(vma, old_end - len, old_end);
        mmu_notifier_invalidate_range_end(&range);
        i_mmap_unlock_write(mapping);
 
@@ -4934,7 +4987,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 
 static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                                   unsigned long start, unsigned long end,
-                                  struct page *ref_page)
+                                  struct page *ref_page, zap_flags_t zap_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
@@ -4990,7 +5043,18 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
                 * unmapped and its refcount is dropped, so just clear pte here.
                 */
                if (unlikely(!pte_present(pte))) {
-                       huge_pte_clear(mm, address, ptep, sz);
+                       /*
+                        * If the pte was wr-protected by uffd-wp in any of its
+                        * swap forms, and the caller does not want to drop the
+                        * uffd-wp bit in this zap, replace the pte with a
+                        * marker.
+                        */
+                       if (pte_swp_uffd_wp_any(pte) &&
+                           !(zap_flags & ZAP_FLAG_DROP_MARKER))
+                               set_huge_pte_at(mm, address, ptep,
+                                               make_pte_marker(PTE_MARKER_UFFD_WP));
+                       else
+                               huge_pte_clear(mm, address, ptep, sz);
                        spin_unlock(ptl);
                        continue;
                }
@@ -5018,7 +5082,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
                tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);
-
+               /* Leave a uffd-wp pte marker if needed */
+               if (huge_pte_uffd_wp(pte) &&
+                   !(zap_flags & ZAP_FLAG_DROP_MARKER))
+                       set_huge_pte_at(mm, address, ptep,
+                                       make_pte_marker(PTE_MARKER_UFFD_WP));
                hugetlb_count_sub(pages_per_huge_page(h), mm);
                page_remove_rmap(page, vma, true);
 
@@ -5052,9 +5120,10 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                          struct vm_area_struct *vma, unsigned long start,
-                         unsigned long end, struct page *ref_page)
+                         unsigned long end, struct page *ref_page,
+                         zap_flags_t zap_flags)
 {
-       __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+       __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
 
        /*
         * Clear this flag so that x86's huge_pmd_share page_table_shareable
@@ -5070,12 +5139,13 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-                         unsigned long end, struct page *ref_page)
+                         unsigned long end, struct page *ref_page,
+                         zap_flags_t zap_flags)
 {
        struct mmu_gather tlb;
 
        tlb_gather_mmu(&tlb, vma->vm_mm);
-       __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+       __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
        tlb_finish_mmu(&tlb);
 }
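
With the new zap_flags argument, the caller decides whether uffd-wp markers survive the zap. A sketch of the intended usage, assuming the ZAP_FLAG_DROP_MARKER semantics described in the comments above (illustrative call sites, not taken from this patch):

	/* Tear the mapping down for good: drop any uffd-wp markers as well. */
	unmap_hugepage_range(vma, start, end, NULL, ZAP_FLAG_DROP_MARKER);

	/* Ordinary zap: keep the wr-protect state so later faults still trap. */
	unmap_hugepage_range(vma, start, end, NULL, 0);
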
 
@@ -5130,21 +5200,22 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
                        unmap_hugepage_range(iter_vma, address,
-                                            address + huge_page_size(h), page);
+                                            address + huge_page_size(h), page, 0);
        }
        i_mmap_unlock_write(mapping);
 }
 
 /*
- * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * hugetlb_wp() should be called with page lock of the original hugepage held.
  * Called with hugetlb_fault_mutex_table held and pte_page locked so we
  * cannot race with other handlers or page migration.
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
-static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-                      unsigned long address, pte_t *ptep,
+static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
+                      unsigned long address, pte_t *ptep, unsigned int flags,
                       struct page *pagecache_page, spinlock_t *ptl)
 {
+       const bool unshare = flags & FAULT_FLAG_UNSHARE;
        pte_t pte;
        struct hstate *h = hstate_vma(vma);
        struct page *old_page, *new_page;
@@ -5153,17 +5224,26 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long haddr = address & huge_page_mask(h);
        struct mmu_notifier_range range;
 
+       VM_BUG_ON(unshare && (flags & FOLL_WRITE));
+       VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
+
        pte = huge_ptep_get(ptep);
        old_page = pte_page(pte);
 
 retry_avoidcopy:
-       /* If no-one else is actually using this page, avoid the copy
-        * and just make the page writable */
+       /*
+        * If no-one else is actually using this page, we're the exclusive
+        * owner and can reuse this page.
+        */
        if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
-               page_move_anon_rmap(old_page, vma);
-               set_huge_ptep_writable(vma, haddr, ptep);
+               if (!PageAnonExclusive(old_page))
+                       page_move_anon_rmap(old_page, vma);
+               if (likely(!unshare))
+                       set_huge_ptep_writable(vma, haddr, ptep);
                return 0;
        }
+       VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
+                      old_page);
 
        /*
         * If the process that created a MAP_PRIVATE mapping is about to
@@ -5262,13 +5342,13 @@ retry_avoidcopy:
        if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
                ClearHPageRestoreReserve(new_page);
 
-               /* Break COW */
+               /* Break COW or unshare */
                huge_ptep_clear_flush(vma, haddr, ptep);
                mmu_notifier_invalidate_range(mm, range.start, range.end);
                page_remove_rmap(old_page, vma, true);
                hugepage_add_new_anon_rmap(new_page, vma, haddr);
                set_huge_pte_at(mm, haddr, ptep,
-                               make_huge_pte(vma, new_page, 1));
+                               make_huge_pte(vma, new_page, !unshare));
                SetHPageMigratable(new_page);
                /* Make the old page be freed below */
                new_page = old_page;
@@ -5276,7 +5356,10 @@ retry_avoidcopy:
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(&range);
 out_release_all:
-       /* No restore in case of successful pagetable update (Break COW) */
+       /*
+        * No restore in case of successful pagetable update (Break COW or
+        * unshare)
+        */
        if (new_page != old_page)
                restore_reserve_on_error(h, vma, haddr, new_page);
        put_page(new_page);
@@ -5386,7 +5469,8 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                        struct vm_area_struct *vma,
                        struct address_space *mapping, pgoff_t idx,
-                       unsigned long address, pte_t *ptep, unsigned int flags)
+                       unsigned long address, pte_t *ptep,
+                       pte_t old_pte, unsigned int flags)
 {
        struct hstate *h = hstate_vma(vma);
        vm_fault_t ret = VM_FAULT_SIGBUS;
@@ -5401,7 +5485,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
        /*
         * Currently, we are forced to kill the process in the event the
         * original mapper has unmapped pages from the child due to a failed
-        * COW. Warn that such a situation has occurred as it may not be obvious
+        * COW/unsharing. Warn that such a situation has occurred as it may not
+        * be obvious.
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
                pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
@@ -5512,22 +5597,29 @@ retry:
 
        ptl = huge_pte_lock(h, mm, ptep);
        ret = 0;
-       if (!huge_pte_none(huge_ptep_get(ptep)))
+       /* If pte changed from under us, retry */
+       if (!pte_same(huge_ptep_get(ptep), old_pte))
                goto backout;
 
        if (anon_rmap) {
                ClearHPageRestoreReserve(page);
                hugepage_add_new_anon_rmap(page, vma, haddr);
        } else
-               page_dup_rmap(page, true);
+               page_dup_file_rmap(page, true);
        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
+       /*
+        * If this pte was previously wr-protected, keep it wr-protected even
+        * if populated.
+        */
+       if (unlikely(pte_marker_uffd_wp(old_pte)))
+               new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
        set_huge_pte_at(mm, haddr, ptep, new_pte);
 
        hugetlb_count_add(pages_per_huge_page(h), mm);
        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
-               ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
+               ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
        }
 
        spin_unlock(ptl);
@@ -5639,8 +5731,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
        entry = huge_ptep_get(ptep);
-       if (huge_pte_none(entry)) {
-               ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
+       /* PTE markers should be handled the same way as none pte */
+       if (huge_pte_none_mostly(entry)) {
+               ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
+                                     entry, flags);
                goto out_mutex;
        }
 
@@ -5657,14 +5751,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                goto out_mutex;
 
        /*
-        * If we are going to COW the mapping later, we examine the pending
-        * reservations for this page now. This will ensure that any
+        * If we are going to COW/unshare the mapping later, we examine the
+        * pending reservations for this page now. This will ensure that any
         * allocations necessary to record that reservation occur outside the
         * spinlock. For private mappings, we also lookup the pagecache
         * page now as it is used to determine if a reservation has been
         * consumed.
         */
-       if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+       if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+           !huge_pte_write(entry)) {
                if (vma_needs_reservation(h, vma, haddr) < 0) {
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
@@ -5679,12 +5774,32 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
        ptl = huge_pte_lock(h, mm, ptep);
 
-       /* Check for a racing update before calling hugetlb_cow */
+       /* Check for a racing update before calling hugetlb_wp() */
        if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
                goto out_ptl;
 
+       /* Handle userfault-wp first, before trying to lock more pages */
+       if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
+           (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+               struct vm_fault vmf = {
+                       .vma = vma,
+                       .address = haddr,
+                       .real_address = address,
+                       .flags = flags,
+               };
+
+               spin_unlock(ptl);
+               if (pagecache_page) {
+                       unlock_page(pagecache_page);
+                       put_page(pagecache_page);
+               }
+               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+               i_mmap_unlock_read(mapping);
+               return handle_userfault(&vmf, VM_UFFD_WP);
+       }
+
        /*
-        * hugetlb_cow() requires page locks of pte_page(entry) and
+        * hugetlb_wp() requires page locks of pte_page(entry) and
         * pagecache_page, so here we need take the former one
         * when page != pagecache_page or !pagecache_page.
         */
@@ -5697,13 +5812,14 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
        get_page(page);
 
-       if (flags & FAULT_FLAG_WRITE) {
+       if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
                if (!huge_pte_write(entry)) {
-                       ret = hugetlb_cow(mm, vma, address, ptep,
-                                         pagecache_page, ptl);
+                       ret = hugetlb_wp(mm, vma, address, ptep, flags,
+                                        pagecache_page, ptl);
                        goto out_put_page;
+               } else if (likely(flags & FAULT_FLAG_WRITE)) {
+                       entry = huge_pte_mkdirty(entry);
                }
-               entry = huge_pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
        if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
@@ -5746,7 +5862,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                            unsigned long dst_addr,
                            unsigned long src_addr,
                            enum mcopy_atomic_mode mode,
-                           struct page **pagep)
+                           struct page **pagep,
+                           bool wp_copy)
 {
        bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
        struct hstate *h = hstate_vma(dst_vma);
@@ -5876,27 +5993,43 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                goto out_release_unlock;
 
        ret = -EEXIST;
-       if (!huge_pte_none(huge_ptep_get(dst_pte)))
+       /*
+        * We allow overwriting a pte marker: consider the case where both
+        * MISSING and WP are registered, we first wr-protect a none pte that
+        * has no page cache page backing it, and then access the page.
+        */
+       if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
                goto out_release_unlock;
 
        if (vm_shared) {
-               page_dup_rmap(page, true);
+               page_dup_file_rmap(page, true);
        } else {
                ClearHPageRestoreReserve(page);
                hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
        }
 
-       /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
-       if (is_continue && !vm_shared)
+       /*
+        * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
+        * with wp flag set, don't set pte write bit.
+        */
+       if (wp_copy || (is_continue && !vm_shared))
                writable = 0;
        else
                writable = dst_vma->vm_flags & VM_WRITE;
 
        _dst_pte = make_huge_pte(dst_vma, page, writable);
-       if (writable)
-               _dst_pte = huge_pte_mkdirty(_dst_pte);
+       /*
+        * Always mark UFFDIO_COPY page dirty; note that this may not be
+        * extremely important for hugetlbfs for now since swapping is not
+        * supported, but we should still be clear that this page cannot be
+        * thrown away at will, even if the write bit is not set.
+        */
+       _dst_pte = huge_pte_mkdirty(_dst_pte);
        _dst_pte = pte_mkyoung(_dst_pte);
 
+       if (wp_copy)
+               _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
+
        set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
        (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
@@ -5940,6 +6073,25 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
        }
 }
 
+static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
+                                              bool *unshare)
+{
+       pte_t pteval = huge_ptep_get(pte);
+
+       *unshare = false;
+       if (is_swap_pte(pteval))
+               return true;
+       if (huge_pte_write(pteval))
+               return false;
+       if (flags & FOLL_WRITE)
+               return true;
+       if (gup_must_unshare(flags, pte_page(pteval))) {
+               *unshare = true;
+               return true;
+       }
+       return false;
+}
+
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         struct page **pages, struct vm_area_struct **vmas,
                         unsigned long *position, unsigned long *nr_pages,
@@ -5954,6 +6106,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;
                spinlock_t *ptl = NULL;
+               bool unshare = false;
                int absent;
                struct page *page;
 
@@ -6004,9 +6157,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 * both cases, and because we can't follow correct pages
                 * directly from any kind of swap entries.
                 */
-               if (absent || is_swap_pte(huge_ptep_get(pte)) ||
-                   ((flags & FOLL_WRITE) &&
-                     !huge_pte_write(huge_ptep_get(pte)))) {
+               if (absent ||
+                   __follow_hugetlb_must_fault(flags, pte, &unshare)) {
                        vm_fault_t ret;
                        unsigned int fault_flags = 0;
 
@@ -6014,6 +6166,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                spin_unlock(ptl);
                        if (flags & FOLL_WRITE)
                                fault_flags |= FAULT_FLAG_WRITE;
+                       else if (unshare)
+                               fault_flags |= FAULT_FLAG_UNSHARE;
                        if (locked)
                                fault_flags |= FAULT_FLAG_ALLOW_RETRY |
                                        FAULT_FLAG_KILLABLE;
@@ -6055,6 +6209,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
                page = pte_page(huge_ptep_get(pte));
 
+               VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+                              !PageAnonExclusive(page), page);
+
                /*
                 * If subpage information not requested, update counters
                 * and skip the same_page loop below.
@@ -6117,16 +6274,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
-               unsigned long address, unsigned long end, pgprot_t newprot)
+               unsigned long address, unsigned long end,
+               pgprot_t newprot, unsigned long cp_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long start = address;
        pte_t *ptep;
        pte_t pte;
        struct hstate *h = hstate_vma(vma);
-       unsigned long pages = 0;
+       unsigned long pages = 0, psize = huge_page_size(h);
        bool shared_pmd = false;
        struct mmu_notifier_range range;
+       bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+       bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
        /*
         * In the case of shared PMDs, the area to flush could be beyond
@@ -6142,13 +6302,19 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 
        mmu_notifier_invalidate_range_start(&range);
        i_mmap_lock_write(vma->vm_file->f_mapping);
-       for (; address < end; address += huge_page_size(h)) {
+       for (; address < end; address += psize) {
                spinlock_t *ptl;
-               ptep = huge_pte_offset(mm, address, huge_page_size(h));
+               ptep = huge_pte_offset(mm, address, psize);
                if (!ptep)
                        continue;
                ptl = huge_pte_lock(h, mm, ptep);
                if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+                       /*
+                        * When uffd-wp is enabled on the vma, unshare
+                        * shouldn't happen at all.  Warn about it if it
+                        * happens for any reason.
+                        */
+                       WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
                        pages++;
                        spin_unlock(ptl);
                        shared_pmd = true;
@@ -6161,20 +6327,37 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                }
                if (unlikely(is_hugetlb_entry_migration(pte))) {
                        swp_entry_t entry = pte_to_swp_entry(pte);
+                       struct page *page = pfn_swap_entry_to_page(entry);
 
-                       if (is_writable_migration_entry(entry)) {
+                       if (!is_readable_migration_entry(entry)) {
                                pte_t newpte;
 
-                               entry = make_readable_migration_entry(
-                                                       swp_offset(entry));
+                               if (PageAnon(page))
+                                       entry = make_readable_exclusive_migration_entry(
+                                                               swp_offset(entry));
+                               else
+                                       entry = make_readable_migration_entry(
+                                                               swp_offset(entry));
                                newpte = swp_entry_to_pte(entry);
+                               if (uffd_wp)
+                                       newpte = pte_swp_mkuffd_wp(newpte);
+                               else if (uffd_wp_resolve)
+                                       newpte = pte_swp_clear_uffd_wp(newpte);
                                set_huge_swap_pte_at(mm, address, ptep,
-                                                    newpte, huge_page_size(h));
+                                                    newpte, psize);
                                pages++;
                        }
                        spin_unlock(ptl);
                        continue;
                }
+               if (unlikely(pte_marker_uffd_wp(pte))) {
+                       /*
+                        * This is changing a non-present pte into a none pte,
+                        * no need for huge_ptep_modify_prot_start/commit().
+                        */
+                       if (uffd_wp_resolve)
+                               huge_pte_clear(mm, address, ptep, psize);
+               }
                if (!huge_pte_none(pte)) {
                        pte_t old_pte;
                        unsigned int shift = huge_page_shift(hstate_vma(vma));
@@ -6182,8 +6365,18 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                        old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
                        pte = huge_pte_modify(old_pte, newprot);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
+                       if (uffd_wp)
+                               pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
+                       else if (uffd_wp_resolve)
+                               pte = huge_pte_clear_uffd_wp(pte);
                        huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
                        pages++;
+               } else {
+                       /* None pte */
+                       if (unlikely(uffd_wp))
+                               /* Safe to modify directly (none->non-present). */
+                               set_huge_pte_at(mm, address, ptep,
+                                               make_pte_marker(PTE_MARKER_UFFD_WP));
                }
                spin_unlock(ptl);
        }
@@ -6693,9 +6886,11 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
        spinlock_t *ptl;
        pte_t pte;
 
-       /* FOLL_GET and FOLL_PIN are mutually exclusive. */
-       if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
-                        (FOLL_PIN | FOLL_GET)))
+       /*
+        * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+        * follow_hugetlb_page().
+        */
+       if (WARN_ON_ONCE(flags & FOLL_PIN))
                return NULL;
 
 retry:
@@ -6783,7 +6978,9 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
        spin_lock_irq(&hugetlb_lock);
        if (PageHeadHuge(page)) {
                *hugetlb = true;
-               if (HPageFreed(page) || HPageMigratable(page))
+               if (HPageFreed(page))
+                       ret = 0;
+               else if (HPageMigratable(page))
                        ret = get_page_unless_zero(page);
                else
                        ret = -EBUSY;
@@ -6873,6 +7070,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
        if (start >= end)
                return;
 
+       flush_cache_range(vma, start, end);
        /*
         * No need to call adjust_range_if_pmd_sharing_possible(), because
         * we have already done the PUD_SIZE alignment.
@@ -6958,7 +7156,7 @@ void __init hugetlb_cma_reserve(int order)
                if (hugetlb_cma_size_in_node[nid] == 0)
                        continue;
 
-               if (!node_state(nid, N_ONLINE)) {
+               if (!node_online(nid)) {
                        pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
                        hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
                        hugetlb_cma_size_in_node[nid] = 0;
@@ -6997,7 +7195,7 @@ void __init hugetlb_cma_reserve(int order)
        }
 
        reserved = 0;
-       for_each_node_state(nid, N_ONLINE) {
+       for_each_online_node(nid) {
                int res;
                char name[CMA_MAX_NAME];