Merge tag 'mm-hotfixes-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kerne...
[platform/kernel/linux-rpi.git] mm/hugetlb.c
index 410bbb0..7c468ac 100644
@@ -370,7 +370,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
 }
 
 static inline long
-hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
+hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
                     long to, struct hstate *h, struct hugetlb_cgroup *cg,
                     long *regions_needed)
 {
@@ -379,7 +379,7 @@ hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
        if (!regions_needed) {
                nrg = get_file_region_entry_from_cache(map, from, to);
                record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
-               list_add(&nrg->link, rg->link.prev);
+               list_add(&nrg->link, rg);
                coalesce_file_region(map, nrg);
        } else
                *regions_needed += 1;
@@ -402,47 +402,52 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
        long add = 0;
        struct list_head *head = &resv->regions;
        long last_accounted_offset = f;
-       struct file_region *rg = NULL, *trg = NULL;
+       struct file_region *iter, *trg = NULL;
+       struct list_head *rg = NULL;
 
        if (regions_needed)
                *regions_needed = 0;
 
        /* In this loop, we essentially handle an entry for the range
-        * [last_accounted_offset, rg->from), at every iteration, with some
+        * [last_accounted_offset, iter->from), at every iteration, with some
         * bounds checking.
         */
-       list_for_each_entry_safe(rg, trg, head, link) {
+       list_for_each_entry_safe(iter, trg, head, link) {
                /* Skip irrelevant regions that start before our range. */
-               if (rg->from < f) {
+               if (iter->from < f) {
                        /* If this region ends after the last accounted offset,
                         * then we need to update last_accounted_offset.
                         */
-                       if (rg->to > last_accounted_offset)
-                               last_accounted_offset = rg->to;
+                       if (iter->to > last_accounted_offset)
+                               last_accounted_offset = iter->to;
                        continue;
                }
 
                /* When we find a region that starts beyond our range, we've
                 * finished.
                 */
-               if (rg->from >= t)
+               if (iter->from >= t) {
+                       rg = iter->link.prev;
                        break;
+               }
 
-               /* Add an entry for last_accounted_offset -> rg->from, and
+               /* Add an entry for last_accounted_offset -> iter->from, and
                 * update last_accounted_offset.
                 */
-               if (rg->from > last_accounted_offset)
-                       add += hugetlb_resv_map_add(resv, rg,
+               if (iter->from > last_accounted_offset)
+                       add += hugetlb_resv_map_add(resv, iter->link.prev,
                                                    last_accounted_offset,
-                                                   rg->from, h, h_cg,
+                                                   iter->from, h, h_cg,
                                                    regions_needed);
 
-               last_accounted_offset = rg->to;
+               last_accounted_offset = iter->to;
        }
 
        /* Handle the case where our range extends beyond
         * last_accounted_offset.
         */
+       if (!rg)
+               rg = head->prev;
        if (last_accounted_offset < t)
                add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
                                            t, h, h_cg, regions_needed);
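
The two hunks above stop using the list_for_each_entry_safe() iterator once the loop has finished (at which point it no longer points at a real file_region) and instead record a plain struct list_head * insertion point from inside the loop, falling back to the list tail when the loop runs to completion. A minimal stand-alone userspace sketch of the same pattern (plain C with a hand-rolled circular list; none of these names exist in the kernel):

#include <stdio.h>
#include <stdlib.h>

struct node {
	long from;
	struct node *prev, *next;	/* circular list with a dummy head */
};

/* Insert "new" right after "pos", mirroring the kernel's list_add(). */
static void node_add(struct node *new, struct node *pos)
{
	new->prev = pos;
	new->next = pos->next;
	pos->next->prev = new;
	pos->next = new;
}

int main(void)
{
	struct node head = { .from = -1, .prev = &head, .next = &head };
	struct node a = { .from = 10 }, b = { .from = 30 };
	struct node *iter, *pos = NULL, *new;
	long t = 20;

	node_add(&a, &head);			/* list: 10 */
	node_add(&b, &a);			/* list: 10, 30 */

	/* Record the insertion point while still inside the loop ... */
	for (iter = head.next; iter != &head; iter = iter->next) {
		if (iter->from >= t) {
			pos = iter->prev;
			break;
		}
	}
	/* ... and fall back to the tail if the loop ran to completion. */
	if (!pos)
		pos = head.prev;

	new = malloc(sizeof(*new));
	if (!new)
		return 1;
	new->from = t;
	node_add(new, pos);

	for (iter = head.next; iter != &head; iter = iter->next)
		printf("%ld\n", iter->from);	/* prints 10, 20, 30 */
	free(new);
	return 0;
}
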
@@ -1535,7 +1540,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
 
-       if (alloc_huge_page_vmemmap(h, page)) {
+       if (hugetlb_vmemmap_alloc(h, page)) {
                spin_lock_irq(&hugetlb_lock);
                /*
                 * If we cannot allocate vmemmap pages, just refuse to free the
@@ -1612,7 +1617,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
 
 static inline void flush_free_hpage_work(struct hstate *h)
 {
-       if (free_vmemmap_pages_per_hpage(h))
+       if (hugetlb_optimize_vmemmap_pages(h))
                flush_work(&free_hpage_work);
 }
 
@@ -1672,6 +1677,8 @@ void free_huge_page(struct page *page)
        VM_BUG_ON_PAGE(page_mapcount(page), page);
 
        hugetlb_set_page_subpool(page, NULL);
+       if (PageAnon(page))
+               __ClearPageAnonExclusive(page);
        page->mapping = NULL;
        restore_reserve = HPageRestoreReserve(page);
        ClearHPageRestoreReserve(page);
@@ -1732,7 +1739,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
 
 static void __prep_new_huge_page(struct hstate *h, struct page *page)
 {
-       free_huge_page_vmemmap(h, page);
+       hugetlb_vmemmap_free(h, page);
        INIT_LIST_HEAD(&page->lru);
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        hugetlb_set_page_subpool(page, NULL);
@@ -2105,7 +2112,7 @@ retry:
                 * Attempt to allocate vmemmap here so that we can take
                 * appropriate action on failure.
                 */
-               rc = alloc_huge_page_vmemmap(h, head);
+               rc = hugetlb_vmemmap_alloc(h, head);
                if (!rc) {
                        /*
                         * Move PageHWPoison flag from head page to the raw
@@ -2979,8 +2986,6 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
        struct huge_bootmem_page *m = NULL; /* initialize for clang */
        int nr_nodes, node;
 
-       if (nid != NUMA_NO_NODE && nid >= nr_online_nodes)
-               return 0;
        /* do node specific alloc */
        if (nid != NUMA_NO_NODE) {
                m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
@@ -3088,7 +3093,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
        }
 
        /* do node specific alloc */
-       for (i = 0; i < nr_online_nodes; i++) {
+       for_each_online_node(i) {
                if (h->max_huge_pages_node[i] > 0) {
                        hugetlb_hstate_alloc_pages_onenode(h, i);
                        node_specific_alloc = true;
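
The loops converted to for_each_online_node() here and in later hunks rely on online node IDs not being contiguous: counting up to nr_online_nodes can visit offline IDs and skip online ones. A small userspace illustration of the difference (the mask and names are invented; MAX_NODES stands in for MAX_NUMNODES):

#include <stdio.h>

#define MAX_NODES 8

int main(void)
{
	/* Pretend nodes 0, 2 and 3 are online, so the online count is 3. */
	int online[MAX_NODES] = { [0] = 1, [2] = 1, [3] = 1 };
	int nr_online = 3, i;

	printf("by count:");
	for (i = 0; i < nr_online; i++)
		printf(" %d", i);		/* 0 1 2 -- wrong set */

	printf("\nby mask: ");
	for (i = 0; i < MAX_NODES; i++)		/* what for_each_online_node() does */
		if (online[i])
			printf(" %d", i);	/* 0 2 3 -- correct */
	printf("\n");
	return 0;
}
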
@@ -3420,7 +3425,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
        remove_hugetlb_page_for_demote(h, page, false);
        spin_unlock_irq(&hugetlb_lock);
 
-       rc = alloc_huge_page_vmemmap(h, page);
+       rc = hugetlb_vmemmap_alloc(h, page);
        if (rc) {
                /* Allocation of vmemmap failed, we cannot demote the page */
                spin_lock_irq(&hugetlb_lock);
@@ -4052,7 +4057,7 @@ static int __init hugetlb_init(void)
                        default_hstate.max_huge_pages =
                                default_hstate_max_huge_pages;
 
-                       for (i = 0; i < nr_online_nodes; i++)
+                       for_each_online_node(i)
                                default_hstate.max_huge_pages_node[i] =
                                        default_hugepages_in_node[i];
                }
@@ -4119,6 +4124,20 @@ bool __init __weak hugetlb_node_alloc_supported(void)
 {
        return true;
 }
+
+static void __init hugepages_clear_pages_in_node(void)
+{
+       if (!hugetlb_max_hstate) {
+               default_hstate_max_huge_pages = 0;
+               memset(default_hugepages_in_node, 0,
+                       MAX_NUMNODES * sizeof(unsigned int));
+       } else {
+               parsed_hstate->max_huge_pages = 0;
+               memset(parsed_hstate->max_huge_pages_node, 0,
+                       MAX_NUMNODES * sizeof(unsigned int));
+       }
+}
+
 /*
  * hugepages command line processing
  * hugepages normally follows a valid hugepagesz or default_hugepagesz
@@ -4138,7 +4157,7 @@ static int __init hugepages_setup(char *s)
        if (!parsed_valid_hugepagesz) {
                pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
                parsed_valid_hugepagesz = true;
-               return 0;
+               return 1;
        }
 
        /*
@@ -4154,7 +4173,7 @@ static int __init hugepages_setup(char *s)
 
        if (mhp == last_mhp) {
                pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
-               return 0;
+               return 1;
        }
 
        while (*p) {
@@ -4165,11 +4184,11 @@ static int __init hugepages_setup(char *s)
                if (p[count] == ':') {
                        if (!hugetlb_node_alloc_supported()) {
                                pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
-                               return 0;
+                               return 1;
                        }
-                       if (tmp >= nr_online_nodes)
+                       if (tmp >= MAX_NUMNODES || !node_online(tmp))
                                goto invalid;
-                       node = array_index_nospec(tmp, nr_online_nodes);
+                       node = array_index_nospec(tmp, MAX_NUMNODES);
                        p += count + 1;
                        /* Parse hugepages */
                        if (sscanf(p, "%lu%n", &tmp, &count) != 1)
@@ -4206,7 +4225,8 @@ static int __init hugepages_setup(char *s)
 
 invalid:
        pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
-       return 0;
+       hugepages_clear_pages_in_node();
+       return 1;
 }
 __setup("hugepages=", hugepages_setup);
 
@@ -4227,7 +4247,7 @@ static int __init hugepagesz_setup(char *s)
 
        if (!arch_hugetlb_valid_size(size)) {
                pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
-               return 0;
+               return 1;
        }
 
        h = size_to_hstate(size);
@@ -4242,7 +4262,7 @@ static int __init hugepagesz_setup(char *s)
                if (!parsed_default_hugepagesz ||  h != &default_hstate ||
                    default_hstate.max_huge_pages) {
                        pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
-                       return 0;
+                       return 1;
                }
 
                /*
@@ -4273,14 +4293,14 @@ static int __init default_hugepagesz_setup(char *s)
        parsed_valid_hugepagesz = false;
        if (parsed_default_hugepagesz) {
                pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
-               return 0;
+               return 1;
        }
 
        size = (unsigned long)memparse(s, NULL);
 
        if (!arch_hugetlb_valid_size(size)) {
                pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
-               return 0;
+               return 1;
        }
 
        hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
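
The return-value changes in these __setup() handlers (0 becomes 1) follow the usual convention that a non-zero return marks the boot parameter as consumed, even when its value is only warned about and ignored, whereas returning 0 leaves it to be reported as an unknown parameter and passed on to init. A kernel-style sketch of that convention, not runnable on its own (example_setup and example_opt are invented names):

static int __init example_setup(char *s)
{
	unsigned long val;

	if (kstrtoul(s, 0, &val)) {
		pr_warn("example_opt: invalid value '%s', ignoring\n", s);
		return 1;	/* still consumed, just ignored */
	}
	/* ... record val somewhere ... */
	return 1;		/* handled */
}
__setup("example_opt=", example_setup);
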
@@ -4297,7 +4317,7 @@ static int __init default_hugepagesz_setup(char *s)
         */
        if (default_hstate_max_huge_pages) {
                default_hstate.max_huge_pages = default_hstate_max_huge_pages;
-               for (i = 0; i < nr_online_nodes; i++)
+               for_each_online_node(i)
                        default_hstate.max_huge_pages_node[i] =
                                default_hugepages_in_node[i];
                if (hstate_is_gigantic(&default_hstate))
@@ -4699,24 +4719,27 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
 }
 
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-                           struct vm_area_struct *vma)
+                           struct vm_area_struct *dst_vma,
+                           struct vm_area_struct *src_vma)
 {
        pte_t *src_pte, *dst_pte, entry, dst_entry;
        struct page *ptepage;
        unsigned long addr;
-       bool cow = is_cow_mapping(vma->vm_flags);
-       struct hstate *h = hstate_vma(vma);
+       bool cow = is_cow_mapping(src_vma->vm_flags);
+       struct hstate *h = hstate_vma(src_vma);
        unsigned long sz = huge_page_size(h);
        unsigned long npages = pages_per_huge_page(h);
-       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct address_space *mapping = src_vma->vm_file->f_mapping;
        struct mmu_notifier_range range;
        int ret = 0;
 
        if (cow) {
-               mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
-                                       vma->vm_start,
-                                       vma->vm_end);
+               mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
+                                       src_vma->vm_start,
+                                       src_vma->vm_end);
                mmu_notifier_invalidate_range_start(&range);
+               mmap_assert_write_locked(src);
+               raw_write_seqcount_begin(&src->write_protect_seq);
        } else {
                /*
                 * For shared mappings i_mmap_rwsem must be held to call
@@ -4727,12 +4750,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                i_mmap_lock_read(mapping);
        }
 
-       for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+       for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
                spinlock_t *src_ptl, *dst_ptl;
                src_pte = huge_pte_offset(src, addr, sz);
                if (!src_pte)
                        continue;
-               dst_pte = huge_pte_alloc(dst, vma, addr, sz);
+               dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
                        break;
@@ -4767,8 +4790,9 @@ again:
                } else if (unlikely(is_hugetlb_entry_migration(entry) ||
                                    is_hugetlb_entry_hwpoisoned(entry))) {
                        swp_entry_t swp_entry = pte_to_swp_entry(entry);
+                       bool uffd_wp = huge_pte_uffd_wp(entry);
 
-                       if (is_writable_migration_entry(swp_entry) && cow) {
+                       if (!is_readable_migration_entry(swp_entry) && cow) {
                                /*
                                 * COW mappings require pages in both
                                 * parent and child to be set to read.
@@ -4776,38 +4800,53 @@ again:
                                swp_entry = make_readable_migration_entry(
                                                        swp_offset(swp_entry));
                                entry = swp_entry_to_pte(swp_entry);
+                               if (userfaultfd_wp(src_vma) && uffd_wp)
+                                       entry = huge_pte_mkuffd_wp(entry);
                                set_huge_swap_pte_at(src, addr, src_pte,
                                                     entry, sz);
                        }
+                       if (!userfaultfd_wp(dst_vma) && uffd_wp)
+                               entry = huge_pte_clear_uffd_wp(entry);
                        set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+               } else if (unlikely(is_pte_marker(entry))) {
+                       /*
+                        * We copy the pte marker only if the dst vma has
+                        * uffd-wp enabled.
+                        */
+                       if (userfaultfd_wp(dst_vma))
+                               set_huge_pte_at(dst, addr, dst_pte, entry);
                } else {
                        entry = huge_ptep_get(src_pte);
                        ptepage = pte_page(entry);
                        get_page(ptepage);
 
                        /*
-                        * This is a rare case where we see pinned hugetlb
-                        * pages while they're prone to COW.  We need to do the
-                        * COW earlier during fork.
+                        * Failing to duplicate the anon rmap is a rare case
+                        * where we see pinned hugetlb pages while they're
+                        * prone to COW. We need to do the COW earlier during
+                        * fork.
                         *
                         * When pre-allocating the page or copying data, we
                         * need to be without the pgtable locks since we could
                         * sleep during the process.
                         */
-                       if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
+                       if (!PageAnon(ptepage)) {
+                               page_dup_file_rmap(ptepage, true);
+                       } else if (page_try_dup_anon_rmap(ptepage, true,
+                                                         src_vma)) {
                                pte_t src_pte_old = entry;
                                struct page *new;
 
                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                /* Do not use reserve as it's private owned */
-                               new = alloc_huge_page(vma, addr, 1);
+                               new = alloc_huge_page(dst_vma, addr, 1);
                                if (IS_ERR(new)) {
                                        put_page(ptepage);
                                        ret = PTR_ERR(new);
                                        break;
                                }
-                               copy_user_huge_page(new, ptepage, addr, vma,
+                               copy_user_huge_page(new, ptepage, addr, dst_vma,
                                                    npages);
                                put_page(ptepage);
 
@@ -4817,13 +4856,13 @@ again:
                                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                                entry = huge_ptep_get(src_pte);
                                if (!pte_same(src_pte_old, entry)) {
-                                       restore_reserve_on_error(h, vma, addr,
+                                       restore_reserve_on_error(h, dst_vma, addr,
                                                                new);
                                        put_page(new);
                                        /* dst_entry won't change as in child */
                                        goto again;
                                }
-                               hugetlb_install_page(vma, dst_pte, addr, new);
+                               hugetlb_install_page(dst_vma, dst_pte, addr, new);
                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                continue;
@@ -4841,7 +4880,6 @@ again:
                                entry = huge_pte_wrprotect(entry);
                        }
 
-                       page_dup_rmap(ptepage, true);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                        hugetlb_count_add(npages, dst);
                }
@@ -4849,10 +4887,12 @@ again:
                spin_unlock(dst_ptl);
        }
 
-       if (cow)
+       if (cow) {
+               raw_write_seqcount_end(&src->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
-       else
+       } else {
                i_mmap_unlock_read(mapping);
+       }
 
        return ret;
 }
@@ -4896,10 +4936,17 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
        unsigned long old_addr_copy;
        pte_t *src_pte, *dst_pte;
        struct mmu_notifier_range range;
+       bool shared_pmd = false;
 
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
                                old_end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+       /*
+        * In case of shared PMDs, we should cover the maximum possible
+        * range.
+        */
+       flush_cache_range(vma, range.start, range.end);
+
        mmu_notifier_invalidate_range_start(&range);
        /* Prevent race with file truncation */
        i_mmap_lock_write(mapping);
@@ -4916,8 +4963,10 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
                 */
                old_addr_copy = old_addr;
 
-               if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
+               if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
+                       shared_pmd = true;
                        continue;
+               }
 
                dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
                if (!dst_pte)
@@ -4925,7 +4974,11 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 
                move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
        }
-       flush_tlb_range(vma, old_end - len, old_end);
+
+       if (shared_pmd)
+               flush_tlb_range(vma, range.start, range.end);
+       else
+               flush_tlb_range(vma, old_end - len, old_end);
        mmu_notifier_invalidate_range_end(&range);
        i_mmap_unlock_write(mapping);
 
@@ -4934,7 +4987,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 
 static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                                   unsigned long start, unsigned long end,
-                                  struct page *ref_page)
+                                  struct page *ref_page, zap_flags_t zap_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
@@ -4990,7 +5043,18 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
                 * unmapped and its refcount is dropped, so just clear pte here.
                 */
                if (unlikely(!pte_present(pte))) {
-                       huge_pte_clear(mm, address, ptep, sz);
+                       /*
+                        * If the pte was wr-protected by uffd-wp in any of its
+                        * swap forms, and the caller does not want to drop the
+                        * uffd-wp bit in this zap, replace the pte with a
+                        * marker.
+                        */
+                       if (pte_swp_uffd_wp_any(pte) &&
+                           !(zap_flags & ZAP_FLAG_DROP_MARKER))
+                               set_huge_pte_at(mm, address, ptep,
+                                               make_pte_marker(PTE_MARKER_UFFD_WP));
+                       else
+                               huge_pte_clear(mm, address, ptep, sz);
                        spin_unlock(ptl);
                        continue;
                }
@@ -5018,7 +5082,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
                tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);
-
+               /* Leave a uffd-wp pte marker if needed */
+               if (huge_pte_uffd_wp(pte) &&
+                   !(zap_flags & ZAP_FLAG_DROP_MARKER))
+                       set_huge_pte_at(mm, address, ptep,
+                                       make_pte_marker(PTE_MARKER_UFFD_WP));
                hugetlb_count_sub(pages_per_huge_page(h), mm);
                page_remove_rmap(page, vma, true);
 
@@ -5052,9 +5120,10 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                          struct vm_area_struct *vma, unsigned long start,
-                         unsigned long end, struct page *ref_page)
+                         unsigned long end, struct page *ref_page,
+                         zap_flags_t zap_flags)
 {
-       __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+       __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
 
        /*
         * Clear this flag so that x86's huge_pmd_share page_table_shareable
@@ -5070,12 +5139,13 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-                         unsigned long end, struct page *ref_page)
+                         unsigned long end, struct page *ref_page,
+                         zap_flags_t zap_flags)
 {
        struct mmu_gather tlb;
 
        tlb_gather_mmu(&tlb, vma->vm_mm);
-       __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+       __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
        tlb_finish_mmu(&tlb);
 }
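
With the new zap_flags argument, the caller decides whether uffd-wp markers survive the zap. A sketch of the intended usage, assuming the ZAP_FLAG_DROP_MARKER semantics described in the comments above (illustrative call sites, not taken from this patch):

	/* Tear the mapping down for good: drop any uffd-wp markers as well. */
	unmap_hugepage_range(vma, start, end, NULL, ZAP_FLAG_DROP_MARKER);

	/* Ordinary zap: keep the wr-protect state so later faults still trap. */
	unmap_hugepage_range(vma, start, end, NULL, 0);
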
 
@@ -5130,21 +5200,22 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
                        unmap_hugepage_range(iter_vma, address,
-                                            address + huge_page_size(h), page);
+                                            address + huge_page_size(h), page, 0);
        }
        i_mmap_unlock_write(mapping);
 }
 
 /*
- * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * hugetlb_wp() should be called with page lock of the original hugepage held.
  * Called with hugetlb_fault_mutex_table held and pte_page locked so we
  * cannot race with other handlers or page migration.
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
-static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-                      unsigned long address, pte_t *ptep,
+static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
+                      unsigned long address, pte_t *ptep, unsigned int flags,
                       struct page *pagecache_page, spinlock_t *ptl)
 {
+       const bool unshare = flags & FAULT_FLAG_UNSHARE;
        pte_t pte;
        struct hstate *h = hstate_vma(vma);
        struct page *old_page, *new_page;
@@ -5153,17 +5224,26 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long haddr = address & huge_page_mask(h);
        struct mmu_notifier_range range;
 
+       VM_BUG_ON(unshare && (flags & FOLL_WRITE));
+       VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
+
        pte = huge_ptep_get(ptep);
        old_page = pte_page(pte);
 
 retry_avoidcopy:
-       /* If no-one else is actually using this page, avoid the copy
-        * and just make the page writable */
+       /*
+        * If no-one else is actually using this page, we're the exclusive
+        * owner and can reuse this page.
+        */
        if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
-               page_move_anon_rmap(old_page, vma);
-               set_huge_ptep_writable(vma, haddr, ptep);
+               if (!PageAnonExclusive(old_page))
+                       page_move_anon_rmap(old_page, vma);
+               if (likely(!unshare))
+                       set_huge_ptep_writable(vma, haddr, ptep);
                return 0;
        }
+       VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
+                      old_page);
 
        /*
         * If the process that created a MAP_PRIVATE mapping is about to
@@ -5262,13 +5342,13 @@ retry_avoidcopy:
        if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
                ClearHPageRestoreReserve(new_page);
 
-               /* Break COW */
+               /* Break COW or unshare */
                huge_ptep_clear_flush(vma, haddr, ptep);
                mmu_notifier_invalidate_range(mm, range.start, range.end);
                page_remove_rmap(old_page, vma, true);
                hugepage_add_new_anon_rmap(new_page, vma, haddr);
                set_huge_pte_at(mm, haddr, ptep,
-                               make_huge_pte(vma, new_page, 1));
+                               make_huge_pte(vma, new_page, !unshare));
                SetHPageMigratable(new_page);
                /* Make the old page be freed below */
                new_page = old_page;
@@ -5276,7 +5356,10 @@ retry_avoidcopy:
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(&range);
 out_release_all:
-       /* No restore in case of successful pagetable update (Break COW) */
+       /*
+        * No restore in case of successful pagetable update (Break COW or
+        * unshare)
+        */
        if (new_page != old_page)
                restore_reserve_on_error(h, vma, haddr, new_page);
        put_page(new_page);
@@ -5386,7 +5469,8 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                        struct vm_area_struct *vma,
                        struct address_space *mapping, pgoff_t idx,
-                       unsigned long address, pte_t *ptep, unsigned int flags)
+                       unsigned long address, pte_t *ptep,
+                       pte_t old_pte, unsigned int flags)
 {
        struct hstate *h = hstate_vma(vma);
        vm_fault_t ret = VM_FAULT_SIGBUS;
@@ -5401,7 +5485,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
        /*
         * Currently, we are forced to kill the process in the event the
         * original mapper has unmapped pages from the child due to a failed
-        * COW. Warn that such a situation has occurred as it may not be obvious
+        * COW/unsharing. Warn that such a situation has occurred as it may not
+        * be obvious.
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
                pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
@@ -5512,22 +5597,29 @@ retry:
 
        ptl = huge_pte_lock(h, mm, ptep);
        ret = 0;
-       if (!huge_pte_none(huge_ptep_get(ptep)))
+       /* If pte changed from under us, retry */
+       if (!pte_same(huge_ptep_get(ptep), old_pte))
                goto backout;
 
        if (anon_rmap) {
                ClearHPageRestoreReserve(page);
                hugepage_add_new_anon_rmap(page, vma, haddr);
        } else
-               page_dup_rmap(page, true);
+               page_dup_file_rmap(page, true);
        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
+       /*
+        * If this pte was previously wr-protected, keep it wr-protected even
+        * if populated.
+        */
+       if (unlikely(pte_marker_uffd_wp(old_pte)))
+               new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
        set_huge_pte_at(mm, haddr, ptep, new_pte);
 
        hugetlb_count_add(pages_per_huge_page(h), mm);
        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
-               ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
+               ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
        }
 
        spin_unlock(ptl);
@@ -5639,8 +5731,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
        entry = huge_ptep_get(ptep);
-       if (huge_pte_none(entry)) {
-               ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
+       /* PTE markers should be handled the same way as none pte */
+       if (huge_pte_none_mostly(entry)) {
+               ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
+                                     entry, flags);
                goto out_mutex;
        }
 
@@ -5657,14 +5751,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                goto out_mutex;
 
        /*
-        * If we are going to COW the mapping later, we examine the pending
-        * reservations for this page now. This will ensure that any
+        * If we are going to COW/unshare the mapping later, we examine the
+        * pending reservations for this page now. This will ensure that any
         * allocations necessary to record that reservation occur outside the
         * spinlock. For private mappings, we also lookup the pagecache
         * page now as it is used to determine if a reservation has been
         * consumed.
         */
-       if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+       if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+           !huge_pte_write(entry)) {
                if (vma_needs_reservation(h, vma, haddr) < 0) {
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
@@ -5679,12 +5774,32 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
        ptl = huge_pte_lock(h, mm, ptep);
 
-       /* Check for a racing update before calling hugetlb_cow */
+       /* Check for a racing update before calling hugetlb_wp() */
        if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
                goto out_ptl;
 
+       /* Handle userfault-wp first, before trying to lock more pages */
+       if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
+           (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+               struct vm_fault vmf = {
+                       .vma = vma,
+                       .address = haddr,
+                       .real_address = address,
+                       .flags = flags,
+               };
+
+               spin_unlock(ptl);
+               if (pagecache_page) {
+                       unlock_page(pagecache_page);
+                       put_page(pagecache_page);
+               }
+               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+               i_mmap_unlock_read(mapping);
+               return handle_userfault(&vmf, VM_UFFD_WP);
+       }
+
        /*
-        * hugetlb_cow() requires page locks of pte_page(entry) and
+        * hugetlb_wp() requires page locks of pte_page(entry) and
         * pagecache_page, so here we need take the former one
         * when page != pagecache_page or !pagecache_page.
         */
@@ -5697,13 +5812,14 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
        get_page(page);
 
-       if (flags & FAULT_FLAG_WRITE) {
+       if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
                if (!huge_pte_write(entry)) {
-                       ret = hugetlb_cow(mm, vma, address, ptep,
-                                         pagecache_page, ptl);
+                       ret = hugetlb_wp(mm, vma, address, ptep, flags,
+                                        pagecache_page, ptl);
                        goto out_put_page;
+               } else if (likely(flags & FAULT_FLAG_WRITE)) {
+                       entry = huge_pte_mkdirty(entry);
                }
-               entry = huge_pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
        if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
@@ -5746,7 +5862,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                            unsigned long dst_addr,
                            unsigned long src_addr,
                            enum mcopy_atomic_mode mode,
-                           struct page **pagep)
+                           struct page **pagep,
+                           bool wp_copy)
 {
        bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
        struct hstate *h = hstate_vma(dst_vma);
@@ -5876,27 +5993,43 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                goto out_release_unlock;
 
        ret = -EEXIST;
-       if (!huge_pte_none(huge_ptep_get(dst_pte)))
+       /*
+        * We allow overwriting a pte marker: consider the case where both
+        * MISSING and WP are registered, we first wr-protect a none pte that
+        * has no page cache page backing it, and then access the page.
+        */
+       if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
                goto out_release_unlock;
 
        if (vm_shared) {
-               page_dup_rmap(page, true);
+               page_dup_file_rmap(page, true);
        } else {
                ClearHPageRestoreReserve(page);
                hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
        }
 
-       /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
-       if (is_continue && !vm_shared)
+       /*
+        * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
+        * with wp flag set, don't set pte write bit.
+        */
+       if (wp_copy || (is_continue && !vm_shared))
                writable = 0;
        else
                writable = dst_vma->vm_flags & VM_WRITE;
 
        _dst_pte = make_huge_pte(dst_vma, page, writable);
-       if (writable)
-               _dst_pte = huge_pte_mkdirty(_dst_pte);
+       /*
+        * Always mark UFFDIO_COPY page dirty; note that this may not be
+        * extremely important for hugetlbfs for now since swapping is not
+        * supported, but we should still be clear that this page cannot be
+        * thrown away at will, even if the write bit is not set.
+        */
+       _dst_pte = huge_pte_mkdirty(_dst_pte);
        _dst_pte = pte_mkyoung(_dst_pte);
 
+       if (wp_copy)
+               _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
+
        set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
        (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
@@ -5940,6 +6073,25 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
        }
 }
 
+static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
+                                              bool *unshare)
+{
+       pte_t pteval = huge_ptep_get(pte);
+
+       *unshare = false;
+       if (is_swap_pte(pteval))
+               return true;
+       if (huge_pte_write(pteval))
+               return false;
+       if (flags & FOLL_WRITE)
+               return true;
+       if (gup_must_unshare(flags, pte_page(pteval))) {
+               *unshare = true;
+               return true;
+       }
+       return false;
+}
+
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         struct page **pages, struct vm_area_struct **vmas,
                         unsigned long *position, unsigned long *nr_pages,
@@ -5954,6 +6106,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;
                spinlock_t *ptl = NULL;
+               bool unshare = false;
                int absent;
                struct page *page;
 
@@ -6004,9 +6157,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 * both cases, and because we can't follow correct pages
                 * directly from any kind of swap entries.
                 */
-               if (absent || is_swap_pte(huge_ptep_get(pte)) ||
-                   ((flags & FOLL_WRITE) &&
-                     !huge_pte_write(huge_ptep_get(pte)))) {
+               if (absent ||
+                   __follow_hugetlb_must_fault(flags, pte, &unshare)) {
                        vm_fault_t ret;
                        unsigned int fault_flags = 0;
 
@@ -6014,6 +6166,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                spin_unlock(ptl);
                        if (flags & FOLL_WRITE)
                                fault_flags |= FAULT_FLAG_WRITE;
+                       else if (unshare)
+                               fault_flags |= FAULT_FLAG_UNSHARE;
                        if (locked)
                                fault_flags |= FAULT_FLAG_ALLOW_RETRY |
                                        FAULT_FLAG_KILLABLE;
@@ -6055,6 +6209,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
                page = pte_page(huge_ptep_get(pte));
 
+               VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+                              !PageAnonExclusive(page), page);
+
                /*
                 * If subpage information not requested, update counters
                 * and skip the same_page loop below.
@@ -6117,16 +6274,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
-               unsigned long address, unsigned long end, pgprot_t newprot)
+               unsigned long address, unsigned long end,
+               pgprot_t newprot, unsigned long cp_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long start = address;
        pte_t *ptep;
        pte_t pte;
        struct hstate *h = hstate_vma(vma);
-       unsigned long pages = 0;
+       unsigned long pages = 0, psize = huge_page_size(h);
        bool shared_pmd = false;
        struct mmu_notifier_range range;
+       bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+       bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
        /*
         * In the case of shared PMDs, the area to flush could be beyond
@@ -6142,13 +6302,19 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 
        mmu_notifier_invalidate_range_start(&range);
        i_mmap_lock_write(vma->vm_file->f_mapping);
-       for (; address < end; address += huge_page_size(h)) {
+       for (; address < end; address += psize) {
                spinlock_t *ptl;
-               ptep = huge_pte_offset(mm, address, huge_page_size(h));
+               ptep = huge_pte_offset(mm, address, psize);
                if (!ptep)
                        continue;
                ptl = huge_pte_lock(h, mm, ptep);
                if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+                       /*
+                        * When uffd-wp is enabled on the vma, unshare
+                        * shouldn't happen at all.  Warn about it if it
+                        * happens for any reason.
+                        */
+                       WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
                        pages++;
                        spin_unlock(ptl);
                        shared_pmd = true;
@@ -6161,20 +6327,37 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                }
                if (unlikely(is_hugetlb_entry_migration(pte))) {
                        swp_entry_t entry = pte_to_swp_entry(pte);
+                       struct page *page = pfn_swap_entry_to_page(entry);
 
-                       if (is_writable_migration_entry(entry)) {
+                       if (!is_readable_migration_entry(entry)) {
                                pte_t newpte;
 
-                               entry = make_readable_migration_entry(
-                                                       swp_offset(entry));
+                               if (PageAnon(page))
+                                       entry = make_readable_exclusive_migration_entry(
+                                                               swp_offset(entry));
+                               else
+                                       entry = make_readable_migration_entry(
+                                                               swp_offset(entry));
                                newpte = swp_entry_to_pte(entry);
+                               if (uffd_wp)
+                                       newpte = pte_swp_mkuffd_wp(newpte);
+                               else if (uffd_wp_resolve)
+                                       newpte = pte_swp_clear_uffd_wp(newpte);
                                set_huge_swap_pte_at(mm, address, ptep,
-                                                    newpte, huge_page_size(h));
+                                                    newpte, psize);
                                pages++;
                        }
                        spin_unlock(ptl);
                        continue;
                }
+               if (unlikely(pte_marker_uffd_wp(pte))) {
+                       /*
+                        * This is changing a non-present pte into a none pte,
+                        * no need for huge_ptep_modify_prot_start/commit().
+                        */
+                       if (uffd_wp_resolve)
+                               huge_pte_clear(mm, address, ptep, psize);
+               }
                if (!huge_pte_none(pte)) {
                        pte_t old_pte;
                        unsigned int shift = huge_page_shift(hstate_vma(vma));
@@ -6182,8 +6365,18 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                        old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
                        pte = huge_pte_modify(old_pte, newprot);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
+                       if (uffd_wp)
+                               pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
+                       else if (uffd_wp_resolve)
+                               pte = huge_pte_clear_uffd_wp(pte);
                        huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
                        pages++;
+               } else {
+                       /* None pte */
+                       if (unlikely(uffd_wp))
+                               /* Safe to modify directly (none->non-present). */
+                               set_huge_pte_at(mm, address, ptep,
+                                               make_pte_marker(PTE_MARKER_UFFD_WP));
                }
                spin_unlock(ptl);
        }
@@ -6693,9 +6886,11 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
        spinlock_t *ptl;
        pte_t pte;
 
-       /* FOLL_GET and FOLL_PIN are mutually exclusive. */
-       if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
-                        (FOLL_PIN | FOLL_GET)))
+       /*
+        * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+        * follow_hugetlb_page().
+        */
+       if (WARN_ON_ONCE(flags & FOLL_PIN))
                return NULL;
 
 retry:
@@ -6783,7 +6978,9 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
        spin_lock_irq(&hugetlb_lock);
        if (PageHeadHuge(page)) {
                *hugetlb = true;
-               if (HPageFreed(page) || HPageMigratable(page))
+               if (HPageFreed(page))
+                       ret = 0;
+               else if (HPageMigratable(page))
                        ret = get_page_unless_zero(page);
                else
                        ret = -EBUSY;
@@ -6873,6 +7070,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
        if (start >= end)
                return;
 
+       flush_cache_range(vma, start, end);
        /*
         * No need to call adjust_range_if_pmd_sharing_possible(), because
         * we have already done the PUD_SIZE alignment.
@@ -6958,7 +7156,7 @@ void __init hugetlb_cma_reserve(int order)
                if (hugetlb_cma_size_in_node[nid] == 0)
                        continue;
 
-               if (!node_state(nid, N_ONLINE)) {
+               if (!node_online(nid)) {
                        pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
                        hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
                        hugetlb_cma_size_in_node[nid] = 0;
@@ -6997,7 +7195,7 @@ void __init hugetlb_cma_reserve(int order)
        }
 
        reserved = 0;
-       for_each_node_state(nid, N_ONLINE) {
+       for_each_online_node(nid) {
                int res;
                char name[CMA_MAX_NAME];