Merge tag 'mm-hotfixes-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kerne...
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 27 May 2022 18:29:35 +0000 (11:29 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 27 May 2022 18:29:35 +0000 (11:29 -0700)
Pull hotfixes from Andrew Morton:
 "Six hotfixes.

  The page_table_check one from Miaohe Lin is considered a minor thing
  so it isn't marked for -stable. The remainder address pre-5.19 issues
  and are cc:stable"

* tag 'mm-hotfixes-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
  mm/page_table_check: fix accessing unmapped ptep
  kexec_file: drop weak attribute from arch_kexec_apply_relocations[_add]
  mm/page_alloc: always attempt to allocate at least one page during bulk allocation
  hugetlb: fix huge_pmd_unshare address update
  zsmalloc: fix races between asynchronous zspage free and page migration
  Revert "mm/cma.c: remove redundant cma_mutex lock"

mm/hugetlb.c
mm/page_alloc.c
mm/page_table_check.c

diff --combined mm/hugetlb.c
@@@ -370,7 -370,7 +370,7 @@@ static void coalesce_file_region(struc
  }
  
  static inline long
 -hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
 +hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
                     long to, struct hstate *h, struct hugetlb_cgroup *cg,
                     long *regions_needed)
  {
        if (!regions_needed) {
                nrg = get_file_region_entry_from_cache(map, from, to);
                record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
 -              list_add(&nrg->link, rg->link.prev);
 +              list_add(&nrg->link, rg);
                coalesce_file_region(map, nrg);
        } else
                *regions_needed += 1;
@@@ -402,52 -402,47 +402,52 @@@ static long add_reservation_in_range(st
        long add = 0;
        struct list_head *head = &resv->regions;
        long last_accounted_offset = f;
 -      struct file_region *rg = NULL, *trg = NULL;
 +      struct file_region *iter, *trg = NULL;
 +      struct list_head *rg = NULL;
  
        if (regions_needed)
                *regions_needed = 0;
  
        /* In this loop, we essentially handle an entry for the range
 -       * [last_accounted_offset, rg->from), at every iteration, with some
 +       * [last_accounted_offset, iter->from), at every iteration, with some
         * bounds checking.
         */
 -      list_for_each_entry_safe(rg, trg, head, link) {
 +      list_for_each_entry_safe(iter, trg, head, link) {
                /* Skip irrelevant regions that start before our range. */
 -              if (rg->from < f) {
 +              if (iter->from < f) {
                        /* If this region ends after the last accounted offset,
                         * then we need to update last_accounted_offset.
                         */
 -                      if (rg->to > last_accounted_offset)
 -                              last_accounted_offset = rg->to;
 +                      if (iter->to > last_accounted_offset)
 +                              last_accounted_offset = iter->to;
                        continue;
                }
  
                /* When we find a region that starts beyond our range, we've
                 * finished.
                 */
 -              if (rg->from >= t)
 +              if (iter->from >= t) {
 +                      rg = iter->link.prev;
                        break;
 +              }
  
 -              /* Add an entry for last_accounted_offset -> rg->from, and
 +              /* Add an entry for last_accounted_offset -> iter->from, and
                 * update last_accounted_offset.
                 */
 -              if (rg->from > last_accounted_offset)
 -                      add += hugetlb_resv_map_add(resv, rg,
 +              if (iter->from > last_accounted_offset)
 +                      add += hugetlb_resv_map_add(resv, iter->link.prev,
                                                    last_accounted_offset,
 -                                                  rg->from, h, h_cg,
 +                                                  iter->from, h, h_cg,
                                                    regions_needed);
  
 -              last_accounted_offset = rg->to;
 +              last_accounted_offset = iter->to;
        }
  
        /* Handle the case where our range extends beyond
         * last_accounted_offset.
         */
 +      if (!rg)
 +              rg = head->prev;
        if (last_accounted_offset < t)
                add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
                                            t, h, h_cg, regions_needed);
@@@ -1540,7 -1535,7 +1540,7 @@@ static void __update_and_free_page(stru
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
  
 -      if (alloc_huge_page_vmemmap(h, page)) {
 +      if (hugetlb_vmemmap_alloc(h, page)) {
                spin_lock_irq(&hugetlb_lock);
                /*
                 * If we cannot allocate vmemmap pages, just refuse to free the
@@@ -1617,7 -1612,7 +1617,7 @@@ static DECLARE_WORK(free_hpage_work, fr
  
  static inline void flush_free_hpage_work(struct hstate *h)
  {
 -      if (free_vmemmap_pages_per_hpage(h))
 +      if (hugetlb_optimize_vmemmap_pages(h))
                flush_work(&free_hpage_work);
  }
  
@@@ -1677,8 -1672,6 +1677,8 @@@ void free_huge_page(struct page *page
        VM_BUG_ON_PAGE(page_mapcount(page), page);
  
        hugetlb_set_page_subpool(page, NULL);
 +      if (PageAnon(page))
 +              __ClearPageAnonExclusive(page);
        page->mapping = NULL;
        restore_reserve = HPageRestoreReserve(page);
        ClearHPageRestoreReserve(page);
@@@ -1739,7 -1732,7 +1739,7 @@@ static void __prep_account_new_huge_pag
  
  static void __prep_new_huge_page(struct hstate *h, struct page *page)
  {
 -      free_huge_page_vmemmap(h, page);
 +      hugetlb_vmemmap_free(h, page);
        INIT_LIST_HEAD(&page->lru);
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        hugetlb_set_page_subpool(page, NULL);
@@@ -2112,7 -2105,7 +2112,7 @@@ retry
                 * Attempt to allocate vmemmmap here so that we can take
                 * appropriate action on failure.
                 */
 -              rc = alloc_huge_page_vmemmap(h, head);
 +              rc = hugetlb_vmemmap_alloc(h, head);
                if (!rc) {
                        /*
                         * Move PageHWPoison flag from head page to the raw
@@@ -2986,6 -2979,8 +2986,6 @@@ int __alloc_bootmem_huge_page(struct hs
        struct huge_bootmem_page *m = NULL; /* initialize for clang */
        int nr_nodes, node;
  
 -      if (nid != NUMA_NO_NODE && nid >= nr_online_nodes)
 -              return 0;
        /* do node specific alloc */
        if (nid != NUMA_NO_NODE) {
                m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
@@@ -3093,7 -3088,7 +3093,7 @@@ static void __init hugetlb_hstate_alloc
        }
  
        /* do node specific alloc */
 -      for (i = 0; i < nr_online_nodes; i++) {
 +      for_each_online_node(i) {
                if (h->max_huge_pages_node[i] > 0) {
                        hugetlb_hstate_alloc_pages_onenode(h, i);
                        node_specific_alloc = true;
@@@ -3425,7 -3420,7 +3425,7 @@@ static int demote_free_huge_page(struc
        remove_hugetlb_page_for_demote(h, page, false);
        spin_unlock_irq(&hugetlb_lock);
  
 -      rc = alloc_huge_page_vmemmap(h, page);
 +      rc = hugetlb_vmemmap_alloc(h, page);
        if (rc) {
                /* Allocation of vmemmmap failed, we can not demote page */
                spin_lock_irq(&hugetlb_lock);
@@@ -4057,7 -4052,7 +4057,7 @@@ static int __init hugetlb_init(void
                        default_hstate.max_huge_pages =
                                default_hstate_max_huge_pages;
  
 -                      for (i = 0; i < nr_online_nodes; i++)
 +                      for_each_online_node(i)
                                default_hstate.max_huge_pages_node[i] =
                                        default_hugepages_in_node[i];
                }
@@@ -4124,20 -4119,6 +4124,20 @@@ bool __init __weak hugetlb_node_alloc_s
  {
        return true;
  }
 +
 +static void __init hugepages_clear_pages_in_node(void)
 +{
 +      if (!hugetlb_max_hstate) {
 +              default_hstate_max_huge_pages = 0;
 +              memset(default_hugepages_in_node, 0,
 +                      MAX_NUMNODES * sizeof(unsigned int));
 +      } else {
 +              parsed_hstate->max_huge_pages = 0;
 +              memset(parsed_hstate->max_huge_pages_node, 0,
 +                      MAX_NUMNODES * sizeof(unsigned int));
 +      }
 +}
 +
  /*
   * hugepages command line processing
   * hugepages normally follows a valid hugepagsz or default_hugepagsz
@@@ -4157,7 -4138,7 +4157,7 @@@ static int __init hugepages_setup(char 
        if (!parsed_valid_hugepagesz) {
                pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
                parsed_valid_hugepagesz = true;
 -              return 0;
 +              return 1;
        }
  
        /*
  
        if (mhp == last_mhp) {
                pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
 -              return 0;
 +              return 1;
        }
  
        while (*p) {
                if (p[count] == ':') {
                        if (!hugetlb_node_alloc_supported()) {
                                pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
 -                              return 0;
 +                              return 1;
                        }
 -                      if (tmp >= nr_online_nodes)
 +                      if (tmp >= MAX_NUMNODES || !node_online(tmp))
                                goto invalid;
 -                      node = array_index_nospec(tmp, nr_online_nodes);
 +                      node = array_index_nospec(tmp, MAX_NUMNODES);
                        p += count + 1;
                        /* Parse hugepages */
                        if (sscanf(p, "%lu%n", &tmp, &count) != 1)
  
  invalid:
        pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
 -      return 0;
 +      hugepages_clear_pages_in_node();
 +      return 1;
  }
  __setup("hugepages=", hugepages_setup);
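
/*
 * Illustrative aside, not part of this diff: the return-value changes in
 * hugepages_setup() and friends (0 -> 1) follow the __setup() convention
 * that a handler returns 1 once it has consumed the option -- even a
 * malformed one it only warned about -- so early parameter parsing does
 * not report "hugepages=" as unknown or pass it on to init. A minimal,
 * hypothetical handler written to that convention (example_setup and
 * example_pages are invented names):
 */
static unsigned long example_pages __initdata;

static int __init example_setup(char *s)
{
	unsigned long val;

	if (kstrtoul(s, 0, &val)) {
		pr_warn("example=%s is invalid, ignoring\n", s);
		return 1;	/* handled (and rejected) here */
	}
	example_pages = val;
	return 1;		/* consumed; nothing left for other handlers */
}
__setup("example=", example_setup);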
  
@@@ -4247,7 -4227,7 +4247,7 @@@ static int __init hugepagesz_setup(cha
  
        if (!arch_hugetlb_valid_size(size)) {
                pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
 -              return 0;
 +              return 1;
        }
  
        h = size_to_hstate(size);
                if (!parsed_default_hugepagesz ||  h != &default_hstate ||
                    default_hstate.max_huge_pages) {
                        pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
 -                      return 0;
 +                      return 1;
                }
  
                /*
@@@ -4293,14 -4273,14 +4293,14 @@@ static int __init default_hugepagesz_se
        parsed_valid_hugepagesz = false;
        if (parsed_default_hugepagesz) {
                pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
 -              return 0;
 +              return 1;
        }
  
        size = (unsigned long)memparse(s, NULL);
  
        if (!arch_hugetlb_valid_size(size)) {
                pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
 -              return 0;
 +              return 1;
        }
  
        hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
         */
        if (default_hstate_max_huge_pages) {
                default_hstate.max_huge_pages = default_hstate_max_huge_pages;
 -              for (i = 0; i < nr_online_nodes; i++)
 +              for_each_online_node(i)
                        default_hstate.max_huge_pages_node[i] =
                                default_hugepages_in_node[i];
                if (hstate_is_gigantic(&default_hstate))
@@@ -4719,27 -4699,24 +4719,27 @@@ hugetlb_install_page(struct vm_area_str
  }
  
  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 -                          struct vm_area_struct *vma)
 +                          struct vm_area_struct *dst_vma,
 +                          struct vm_area_struct *src_vma)
  {
        pte_t *src_pte, *dst_pte, entry, dst_entry;
        struct page *ptepage;
        unsigned long addr;
 -      bool cow = is_cow_mapping(vma->vm_flags);
 -      struct hstate *h = hstate_vma(vma);
 +      bool cow = is_cow_mapping(src_vma->vm_flags);
 +      struct hstate *h = hstate_vma(src_vma);
        unsigned long sz = huge_page_size(h);
        unsigned long npages = pages_per_huge_page(h);
 -      struct address_space *mapping = vma->vm_file->f_mapping;
 +      struct address_space *mapping = src_vma->vm_file->f_mapping;
        struct mmu_notifier_range range;
        int ret = 0;
  
        if (cow) {
 -              mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
 -                                      vma->vm_start,
 -                                      vma->vm_end);
 +              mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
 +                                      src_vma->vm_start,
 +                                      src_vma->vm_end);
                mmu_notifier_invalidate_range_start(&range);
 +              mmap_assert_write_locked(src);
 +              raw_write_seqcount_begin(&src->write_protect_seq);
        } else {
                /*
                 * For shared mappings i_mmap_rwsem must be held to call
                i_mmap_lock_read(mapping);
        }
  
 -      for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 +      for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
                spinlock_t *src_ptl, *dst_ptl;
                src_pte = huge_pte_offset(src, addr, sz);
                if (!src_pte)
                        continue;
 -              dst_pte = huge_pte_alloc(dst, vma, addr, sz);
 +              dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
                        break;
@@@ -4790,9 -4767,8 +4790,9 @@@ again
                } else if (unlikely(is_hugetlb_entry_migration(entry) ||
                                    is_hugetlb_entry_hwpoisoned(entry))) {
                        swp_entry_t swp_entry = pte_to_swp_entry(entry);
 +                      bool uffd_wp = huge_pte_uffd_wp(entry);
  
 -                      if (is_writable_migration_entry(swp_entry) && cow) {
 +                      if (!is_readable_migration_entry(swp_entry) && cow) {
                                /*
                                 * COW mappings require pages in both
                                 * parent and child to be set to read.
                                swp_entry = make_readable_migration_entry(
                                                        swp_offset(swp_entry));
                                entry = swp_entry_to_pte(swp_entry);
 +                              if (userfaultfd_wp(src_vma) && uffd_wp)
 +                                      entry = huge_pte_mkuffd_wp(entry);
                                set_huge_swap_pte_at(src, addr, src_pte,
                                                     entry, sz);
                        }
 +                      if (!userfaultfd_wp(dst_vma) && uffd_wp)
 +                              entry = huge_pte_clear_uffd_wp(entry);
                        set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
 +              } else if (unlikely(is_pte_marker(entry))) {
 +                      /*
 +                       * We copy the pte marker only if the dst vma has
 +                       * uffd-wp enabled.
 +                       */
 +                      if (userfaultfd_wp(dst_vma))
 +                              set_huge_pte_at(dst, addr, dst_pte, entry);
                } else {
                        entry = huge_ptep_get(src_pte);
                        ptepage = pte_page(entry);
                        get_page(ptepage);
  
                        /*
 -                       * This is a rare case where we see pinned hugetlb
 -                       * pages while they're prone to COW.  We need to do the
 -                       * COW earlier during fork.
 +                       * Failing to duplicate the anon rmap is a rare case
 +                       * where we see pinned hugetlb pages while they're
 +                       * prone to COW. We need to do the COW earlier during
 +                       * fork.
                         *
                         * When pre-allocating the page or copying data, we
                         * need to be without the pgtable locks since we could
                         * sleep during the process.
                         */
 -                      if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
 +                      if (!PageAnon(ptepage)) {
 +                              page_dup_file_rmap(ptepage, true);
 +                      } else if (page_try_dup_anon_rmap(ptepage, true,
 +                                                        src_vma)) {
                                pte_t src_pte_old = entry;
                                struct page *new;
  
                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                /* Do not use reserve as it's private owned */
 -                              new = alloc_huge_page(vma, addr, 1);
 +                              new = alloc_huge_page(dst_vma, addr, 1);
                                if (IS_ERR(new)) {
                                        put_page(ptepage);
                                        ret = PTR_ERR(new);
                                        break;
                                }
 -                              copy_user_huge_page(new, ptepage, addr, vma,
 +                              copy_user_huge_page(new, ptepage, addr, dst_vma,
                                                    npages);
                                put_page(ptepage);
  
                                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                                entry = huge_ptep_get(src_pte);
                                if (!pte_same(src_pte_old, entry)) {
 -                                      restore_reserve_on_error(h, vma, addr,
 +                                      restore_reserve_on_error(h, dst_vma, addr,
                                                                new);
                                        put_page(new);
                                        /* dst_entry won't change as in child */
                                        goto again;
                                }
 -                              hugetlb_install_page(vma, dst_pte, addr, new);
 +                              hugetlb_install_page(dst_vma, dst_pte, addr, new);
                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                continue;
                                entry = huge_pte_wrprotect(entry);
                        }
  
 -                      page_dup_rmap(ptepage, true);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                        hugetlb_count_add(npages, dst);
                }
                spin_unlock(dst_ptl);
        }
  
 -      if (cow)
 +      if (cow) {
 +              raw_write_seqcount_end(&src->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
 -      else
 +      } else {
                i_mmap_unlock_read(mapping);
 +      }
  
        return ret;
  }
@@@ -4936,17 -4896,10 +4936,17 @@@ int move_hugetlb_page_tables(struct vm_
        unsigned long old_addr_copy;
        pte_t *src_pte, *dst_pte;
        struct mmu_notifier_range range;
 +      bool shared_pmd = false;
  
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
                                old_end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 +      /*
 +       * In case of shared PMDs, we should cover the maximum possible
 +       * range.
 +       */
 +      flush_cache_range(vma, range.start, range.end);
 +
        mmu_notifier_invalidate_range_start(&range);
        /* Prevent race with file truncation */
        i_mmap_lock_write(mapping);
                 */
                old_addr_copy = old_addr;
  
 -              if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
 +              if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
 +                      shared_pmd = true;
                        continue;
 +              }
  
                dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
                if (!dst_pte)
  
                move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
        }
 -      flush_tlb_range(vma, old_end - len, old_end);
 +
 +      if (shared_pmd)
 +              flush_tlb_range(vma, range.start, range.end);
 +      else
 +              flush_tlb_range(vma, old_end - len, old_end);
        mmu_notifier_invalidate_range_end(&range);
        i_mmap_unlock_write(mapping);
  
  
  static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                                   unsigned long start, unsigned long end,
 -                                 struct page *ref_page)
 +                                 struct page *ref_page, zap_flags_t zap_flags)
  {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
                 * unmapped and its refcount is dropped, so just clear pte here.
                 */
                if (unlikely(!pte_present(pte))) {
 -                      huge_pte_clear(mm, address, ptep, sz);
 +                      /*
 +                       * If the pte was wr-protected by uffd-wp in any of the
 +                       * swap forms, meanwhile the caller does not want to
 +                       * drop the uffd-wp bit in this zap, then replace the
 +                       * pte with a marker.
 +                       */
 +                      if (pte_swp_uffd_wp_any(pte) &&
 +                          !(zap_flags & ZAP_FLAG_DROP_MARKER))
 +                              set_huge_pte_at(mm, address, ptep,
 +                                              make_pte_marker(PTE_MARKER_UFFD_WP));
 +                      else
 +                              huge_pte_clear(mm, address, ptep, sz);
                        spin_unlock(ptl);
                        continue;
                }
                tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);
 -
 +              /* Leave a uffd-wp pte marker if needed */
 +              if (huge_pte_uffd_wp(pte) &&
 +                  !(zap_flags & ZAP_FLAG_DROP_MARKER))
 +                      set_huge_pte_at(mm, address, ptep,
 +                                      make_pte_marker(PTE_MARKER_UFFD_WP));
                hugetlb_count_sub(pages_per_huge_page(h), mm);
                page_remove_rmap(page, vma, true);
  
  
  void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                          struct vm_area_struct *vma, unsigned long start,
 -                        unsigned long end, struct page *ref_page)
 +                        unsigned long end, struct page *ref_page,
 +                        zap_flags_t zap_flags)
  {
 -      __unmap_hugepage_range(tlb, vma, start, end, ref_page);
 +      __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
  
        /*
         * Clear this flag so that x86's huge_pmd_share page_table_shareable
  }
  
  void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 -                        unsigned long end, struct page *ref_page)
 +                        unsigned long end, struct page *ref_page,
 +                        zap_flags_t zap_flags)
  {
        struct mmu_gather tlb;
  
        tlb_gather_mmu(&tlb, vma->vm_mm);
 -      __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
 +      __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
        tlb_finish_mmu(&tlb);
  }
  
@@@ -5200,22 -5130,21 +5200,22 @@@ static void unmap_ref_private(struct mm
                 */
                if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
                        unmap_hugepage_range(iter_vma, address,
 -                                           address + huge_page_size(h), page);
 +                                           address + huge_page_size(h), page, 0);
        }
        i_mmap_unlock_write(mapping);
  }
  
  /*
 - * Hugetlb_cow() should be called with page lock of the original hugepage held.
 + * hugetlb_wp() should be called with page lock of the original hugepage held.
   * Called with hugetlb_fault_mutex_table held and pte_page locked so we
   * cannot race with other handlers or page migration.
   * Keep the pte_same checks anyway to make transition from the mutex easier.
   */
 -static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 -                     unsigned long address, pte_t *ptep,
 +static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 +                     unsigned long address, pte_t *ptep, unsigned int flags,
                       struct page *pagecache_page, spinlock_t *ptl)
  {
 +      const bool unshare = flags & FAULT_FLAG_UNSHARE;
        pte_t pte;
        struct hstate *h = hstate_vma(vma);
        struct page *old_page, *new_page;
        unsigned long haddr = address & huge_page_mask(h);
        struct mmu_notifier_range range;
  
 +      VM_BUG_ON(unshare && (flags & FOLL_WRITE));
 +      VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
 +
        pte = huge_ptep_get(ptep);
        old_page = pte_page(pte);
  
  retry_avoidcopy:
 -      /* If no-one else is actually using this page, avoid the copy
 -       * and just make the page writable */
 +      /*
 +       * If no-one else is actually using this page, we're the exclusive
 +       * owner and can reuse this page.
 +       */
        if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
 -              page_move_anon_rmap(old_page, vma);
 -              set_huge_ptep_writable(vma, haddr, ptep);
 +              if (!PageAnonExclusive(old_page))
 +                      page_move_anon_rmap(old_page, vma);
 +              if (likely(!unshare))
 +                      set_huge_ptep_writable(vma, haddr, ptep);
                return 0;
        }
 +      VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
 +                     old_page);
  
        /*
         * If the process that created a MAP_PRIVATE mapping is about to
        if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
                ClearHPageRestoreReserve(new_page);
  
 -              /* Break COW */
 +              /* Break COW or unshare */
                huge_ptep_clear_flush(vma, haddr, ptep);
                mmu_notifier_invalidate_range(mm, range.start, range.end);
                page_remove_rmap(old_page, vma, true);
                hugepage_add_new_anon_rmap(new_page, vma, haddr);
                set_huge_pte_at(mm, haddr, ptep,
 -                              make_huge_pte(vma, new_page, 1));
 +                              make_huge_pte(vma, new_page, !unshare));
                SetHPageMigratable(new_page);
                /* Make the old page be freed below */
                new_page = old_page;
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(&range);
  out_release_all:
 -      /* No restore in case of successful pagetable update (Break COW) */
 +      /*
 +       * No restore in case of successful pagetable update (Break COW or
 +       * unshare)
 +       */
        if (new_page != old_page)
                restore_reserve_on_error(h, vma, haddr, new_page);
        put_page(new_page);
@@@ -5469,8 -5386,7 +5469,8 @@@ static inline vm_fault_t hugetlb_handle
  static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                        struct vm_area_struct *vma,
                        struct address_space *mapping, pgoff_t idx,
 -                      unsigned long address, pte_t *ptep, unsigned int flags)
 +                      unsigned long address, pte_t *ptep,
 +                      pte_t old_pte, unsigned int flags)
  {
        struct hstate *h = hstate_vma(vma);
        vm_fault_t ret = VM_FAULT_SIGBUS;
        /*
         * Currently, we are forced to kill the process in the event the
         * original mapper has unmapped pages from the child due to a failed
 -       * COW. Warn that such a situation has occurred as it may not be obvious
 +       * COW/unsharing. Warn that such a situation has occurred as it may not
 +       * be obvious.
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
                pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
@@@ -5597,29 -5512,22 +5597,29 @@@ retry
  
        ptl = huge_pte_lock(h, mm, ptep);
        ret = 0;
 -      if (!huge_pte_none(huge_ptep_get(ptep)))
 +      /* If pte changed from under us, retry */
 +      if (!pte_same(huge_ptep_get(ptep), old_pte))
                goto backout;
  
        if (anon_rmap) {
                ClearHPageRestoreReserve(page);
                hugepage_add_new_anon_rmap(page, vma, haddr);
        } else
 -              page_dup_rmap(page, true);
 +              page_dup_file_rmap(page, true);
        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
 +      /*
 +       * If this pte was previously wr-protected, keep it wr-protected even
 +       * if populated.
 +       */
 +      if (unlikely(pte_marker_uffd_wp(old_pte)))
 +              new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
        set_huge_pte_at(mm, haddr, ptep, new_pte);
  
        hugetlb_count_add(pages_per_huge_page(h), mm);
        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
 -              ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
 +              ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
        }
  
        spin_unlock(ptl);
@@@ -5731,10 -5639,8 +5731,10 @@@ vm_fault_t hugetlb_fault(struct mm_stru
        mutex_lock(&hugetlb_fault_mutex_table[hash]);
  
        entry = huge_ptep_get(ptep);
 -      if (huge_pte_none(entry)) {
 -              ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
 +      /* PTE markers should be handled the same way as none pte */
 +      if (huge_pte_none_mostly(entry)) {
 +              ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
 +                                    entry, flags);
                goto out_mutex;
        }
  
                goto out_mutex;
  
        /*
 -       * If we are going to COW the mapping later, we examine the pending
 -       * reservations for this page now. This will ensure that any
 +       * If we are going to COW/unshare the mapping later, we examine the
 +       * pending reservations for this page now. This will ensure that any
         * allocations necessary to record that reservation occur outside the
         * spinlock. For private mappings, we also lookup the pagecache
         * page now as it is used to determine if a reservation has been
         * consumed.
         */
 -      if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
 +      if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
 +          !huge_pte_write(entry)) {
                if (vma_needs_reservation(h, vma, haddr) < 0) {
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
  
        ptl = huge_pte_lock(h, mm, ptep);
  
 -      /* Check for a racing update before calling hugetlb_cow */
 +      /* Check for a racing update before calling hugetlb_wp() */
        if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
                goto out_ptl;
  
 +      /* Handle userfault-wp first, before trying to lock more pages */
 +      if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
 +          (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
 +              struct vm_fault vmf = {
 +                      .vma = vma,
 +                      .address = haddr,
 +                      .real_address = address,
 +                      .flags = flags,
 +              };
 +
 +              spin_unlock(ptl);
 +              if (pagecache_page) {
 +                      unlock_page(pagecache_page);
 +                      put_page(pagecache_page);
 +              }
 +              mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 +              i_mmap_unlock_read(mapping);
 +              return handle_userfault(&vmf, VM_UFFD_WP);
 +      }
 +
        /*
 -       * hugetlb_cow() requires page locks of pte_page(entry) and
 +       * hugetlb_wp() requires page locks of pte_page(entry) and
         * pagecache_page, so here we need take the former one
         * when page != pagecache_page or !pagecache_page.
         */
  
        get_page(page);
  
 -      if (flags & FAULT_FLAG_WRITE) {
 +      if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
                if (!huge_pte_write(entry)) {
 -                      ret = hugetlb_cow(mm, vma, address, ptep,
 -                                        pagecache_page, ptl);
 +                      ret = hugetlb_wp(mm, vma, address, ptep, flags,
 +                                       pagecache_page, ptl);
                        goto out_put_page;
 +              } else if (likely(flags & FAULT_FLAG_WRITE)) {
 +                      entry = huge_pte_mkdirty(entry);
                }
 -              entry = huge_pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
        if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
@@@ -5862,8 -5746,7 +5862,8 @@@ int hugetlb_mcopy_atomic_pte(struct mm_
                            unsigned long dst_addr,
                            unsigned long src_addr,
                            enum mcopy_atomic_mode mode,
 -                          struct page **pagep)
 +                          struct page **pagep,
 +                          bool wp_copy)
  {
        bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
        struct hstate *h = hstate_vma(dst_vma);
                goto out_release_unlock;
  
        ret = -EEXIST;
 -      if (!huge_pte_none(huge_ptep_get(dst_pte)))
 +      /*
 +       * We allow to overwrite a pte marker: consider when both MISSING|WP
 +       * registered, we firstly wr-protect a none pte which has no page cache
 +       * page backing it, then access the page.
 +       */
 +      if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
                goto out_release_unlock;
  
        if (vm_shared) {
 -              page_dup_rmap(page, true);
 +              page_dup_file_rmap(page, true);
        } else {
                ClearHPageRestoreReserve(page);
                hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
        }
  
 -      /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
 -      if (is_continue && !vm_shared)
 +      /*
 +       * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
 +       * with wp flag set, don't set pte write bit.
 +       */
 +      if (wp_copy || (is_continue && !vm_shared))
                writable = 0;
        else
                writable = dst_vma->vm_flags & VM_WRITE;
  
        _dst_pte = make_huge_pte(dst_vma, page, writable);
 -      if (writable)
 -              _dst_pte = huge_pte_mkdirty(_dst_pte);
 +      /*
 +       * Always mark UFFDIO_COPY page dirty; note that this may not be
 +       * extremely important for hugetlbfs for now since swapping is not
 +       * supported, but we should still be clear in that this page cannot be
 +       * thrown away at will, even if write bit not set.
 +       */
 +      _dst_pte = huge_pte_mkdirty(_dst_pte);
        _dst_pte = pte_mkyoung(_dst_pte);
  
 +      if (wp_copy)
 +              _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
 +
        set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
  
        (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
@@@ -6073,25 -5940,6 +6073,25 @@@ static void record_subpages_vmas(struc
        }
  }
  
 +static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
 +                                             bool *unshare)
 +{
 +      pte_t pteval = huge_ptep_get(pte);
 +
 +      *unshare = false;
 +      if (is_swap_pte(pteval))
 +              return true;
 +      if (huge_pte_write(pteval))
 +              return false;
 +      if (flags & FOLL_WRITE)
 +              return true;
 +      if (gup_must_unshare(flags, pte_page(pteval))) {
 +              *unshare = true;
 +              return true;
 +      }
 +      return false;
 +}
 +
  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         struct page **pages, struct vm_area_struct **vmas,
                         unsigned long *position, unsigned long *nr_pages,
        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;
                spinlock_t *ptl = NULL;
 +              bool unshare = false;
                int absent;
                struct page *page;
  
                 * both cases, and because we can't follow correct pages
                 * directly from any kind of swap entries.
                 */
 -              if (absent || is_swap_pte(huge_ptep_get(pte)) ||
 -                  ((flags & FOLL_WRITE) &&
 -                    !huge_pte_write(huge_ptep_get(pte)))) {
 +              if (absent ||
 +                  __follow_hugetlb_must_fault(flags, pte, &unshare)) {
                        vm_fault_t ret;
                        unsigned int fault_flags = 0;
  
                                spin_unlock(ptl);
                        if (flags & FOLL_WRITE)
                                fault_flags |= FAULT_FLAG_WRITE;
 +                      else if (unshare)
 +                              fault_flags |= FAULT_FLAG_UNSHARE;
                        if (locked)
                                fault_flags |= FAULT_FLAG_ALLOW_RETRY |
                                        FAULT_FLAG_KILLABLE;
                pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
                page = pte_page(huge_ptep_get(pte));
  
 +              VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
 +                             !PageAnonExclusive(page), page);
 +
                /*
                 * If subpage information not requested, update counters
                 * and skip the same_page loop below.
  }
  
  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 -              unsigned long address, unsigned long end, pgprot_t newprot)
 +              unsigned long address, unsigned long end,
 +              pgprot_t newprot, unsigned long cp_flags)
  {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long start = address;
        pte_t *ptep;
        pte_t pte;
        struct hstate *h = hstate_vma(vma);
 -      unsigned long pages = 0;
 +      unsigned long pages = 0, psize = huge_page_size(h);
        bool shared_pmd = false;
        struct mmu_notifier_range range;
 +      bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 +      bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
  
        /*
         * In the case of shared PMDs, the area to flush could be beyond
  
        mmu_notifier_invalidate_range_start(&range);
        i_mmap_lock_write(vma->vm_file->f_mapping);
 -      for (; address < end; address += huge_page_size(h)) {
 +      for (; address < end; address += psize) {
                spinlock_t *ptl;
 -              ptep = huge_pte_offset(mm, address, huge_page_size(h));
 +              ptep = huge_pte_offset(mm, address, psize);
                if (!ptep)
                        continue;
                ptl = huge_pte_lock(h, mm, ptep);
                if (huge_pmd_unshare(mm, vma, &address, ptep)) {
 +                      /*
 +                       * When uffd-wp is enabled on the vma, unshare
 +                       * shouldn't happen at all.  Warn about it if it
 +                       * happened due to some reason.
 +                       */
 +                      WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
                        pages++;
                        spin_unlock(ptl);
                        shared_pmd = true;
                }
                if (unlikely(is_hugetlb_entry_migration(pte))) {
                        swp_entry_t entry = pte_to_swp_entry(pte);
 +                      struct page *page = pfn_swap_entry_to_page(entry);
  
 -                      if (is_writable_migration_entry(entry)) {
 +                      if (!is_readable_migration_entry(entry)) {
                                pte_t newpte;
  
 -                              entry = make_readable_migration_entry(
 -                                                      swp_offset(entry));
 +                              if (PageAnon(page))
 +                                      entry = make_readable_exclusive_migration_entry(
 +                                                              swp_offset(entry));
 +                              else
 +                                      entry = make_readable_migration_entry(
 +                                                              swp_offset(entry));
                                newpte = swp_entry_to_pte(entry);
 +                              if (uffd_wp)
 +                                      newpte = pte_swp_mkuffd_wp(newpte);
 +                              else if (uffd_wp_resolve)
 +                                      newpte = pte_swp_clear_uffd_wp(newpte);
                                set_huge_swap_pte_at(mm, address, ptep,
 -                                                   newpte, huge_page_size(h));
 +                                                   newpte, psize);
                                pages++;
                        }
                        spin_unlock(ptl);
                        continue;
                }
 +              if (unlikely(pte_marker_uffd_wp(pte))) {
 +                      /*
 +                       * This is changing a non-present pte into a none pte,
 +                       * no need for huge_ptep_modify_prot_start/commit().
 +                       */
 +                      if (uffd_wp_resolve)
 +                              huge_pte_clear(mm, address, ptep, psize);
 +              }
                if (!huge_pte_none(pte)) {
                        pte_t old_pte;
                        unsigned int shift = huge_page_shift(hstate_vma(vma));
                        old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
                        pte = huge_pte_modify(old_pte, newprot);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
 +                      if (uffd_wp)
 +                              pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
 +                      else if (uffd_wp_resolve)
 +                              pte = huge_pte_clear_uffd_wp(pte);
                        huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
                        pages++;
 +              } else {
 +                      /* None pte */
 +                      if (unlikely(uffd_wp))
 +                              /* Safe to modify directly (none->non-present). */
 +                              set_huge_pte_at(mm, address, ptep,
 +                                              make_pte_marker(PTE_MARKER_UFFD_WP));
                }
                spin_unlock(ptl);
        }
@@@ -6755,7 -6562,14 +6755,14 @@@ int huge_pmd_unshare(struct mm_struct *
        pud_clear(pud);
        put_page(virt_to_page(ptep));
        mm_dec_nr_pmds(mm);
-       *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+       /*
+        * This update of passed address optimizes loops sequentially
+        * processing addresses in increments of huge page size (PMD_SIZE
+        * in this case).  By clearing the pud, a PUD_SIZE area is unmapped.
+        * Update address to the 'last page' in the cleared area so that
+        * calling loop can move to first page past this area.
+        */
+       *addr |= PUD_SIZE - PMD_SIZE;
        return 1;
  }
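
/*
 * Worked example of the new address update, assuming x86-64 sizes
 * (PMD_SIZE = 2 MiB, PUD_SIZE = HPAGE_SIZE * PTRS_PER_PTE = 1 GiB) and a
 * caller that steps by huge_page_size(h) == PMD_SIZE; the addresses are
 * illustrative only:
 *
 *   *addr = 0x40000000                        (already PUD aligned)
 *
 *   old:  ALIGN(0x40000000, 1G) - 0x200000 = 0x3fe00000
 *         -> moves backwards to the preceding huge page, so the caller
 *            re-walks addresses it has already dealt with
 *
 *   new:  0x40000000 | (1G - 2M)           = 0x7fe00000
 *         -> the last PMD-sized page inside the cleared PUD; the caller's
 *            addr += PMD_SIZE then continues at 0x80000000, the first
 *            page past the unmapped area
 */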
  
@@@ -6879,11 -6693,9 +6886,11 @@@ follow_huge_pmd(struct mm_struct *mm, u
        spinlock_t *ptl;
        pte_t pte;
  
 -      /* FOLL_GET and FOLL_PIN are mutually exclusive. */
 -      if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
 -                       (FOLL_PIN | FOLL_GET)))
 +      /*
 +       * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
 +       * follow_hugetlb_page().
 +       */
 +      if (WARN_ON_ONCE(flags & FOLL_PIN))
                return NULL;
  
  retry:
@@@ -6971,9 -6783,7 +6978,9 @@@ int get_hwpoison_huge_page(struct page 
        spin_lock_irq(&hugetlb_lock);
        if (PageHeadHuge(page)) {
                *hugetlb = true;
 -              if (HPageFreed(page) || HPageMigratable(page))
 +              if (HPageFreed(page))
 +                      ret = 0;
 +              else if (HPageMigratable(page))
                        ret = get_page_unless_zero(page);
                else
                        ret = -EBUSY;
@@@ -7063,7 -6873,6 +7070,7 @@@ void hugetlb_unshare_all_pmds(struct vm
        if (start >= end)
                return;
  
 +      flush_cache_range(vma, start, end);
        /*
         * No need to call adjust_range_if_pmd_sharing_possible(), because
         * we have already done the PUD_SIZE alignment.
@@@ -7149,7 -6958,7 +7156,7 @@@ void __init hugetlb_cma_reserve(int ord
                if (hugetlb_cma_size_in_node[nid] == 0)
                        continue;
  
 -              if (!node_state(nid, N_ONLINE)) {
 +              if (!node_online(nid)) {
                        pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
                        hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
                        hugetlb_cma_size_in_node[nid] = 0;
        }
  
        reserved = 0;
 -      for_each_node_state(nid, N_ONLINE) {
 +      for_each_online_node(nid) {
                int res;
                char name[CMA_MAX_NAME];
  
diff --combined mm/page_alloc.c
@@@ -81,7 -81,6 +81,7 @@@
  #include "internal.h"
  #include "shuffle.h"
  #include "page_reporting.h"
 +#include "swap.h"
  
  /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
  typedef int __bitwise fpi_t;
@@@ -868,6 -867,40 +868,6 @@@ static inline void set_buddy_order(stru
        __SetPageBuddy(page);
  }
  
 -/*
 - * This function checks whether a page is free && is the buddy
 - * we can coalesce a page and its buddy if
 - * (a) the buddy is not in a hole (check before calling!) &&
 - * (b) the buddy is in the buddy system &&
 - * (c) a page and its buddy have the same order &&
 - * (d) a page and its buddy are in the same zone.
 - *
 - * For recording whether a page is in the buddy system, we set PageBuddy.
 - * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
 - *
 - * For recording page's order, we use page_private(page).
 - */
 -static inline bool page_is_buddy(struct page *page, struct page *buddy,
 -                                                      unsigned int order)
 -{
 -      if (!page_is_guard(buddy) && !PageBuddy(buddy))
 -              return false;
 -
 -      if (buddy_order(buddy) != order)
 -              return false;
 -
 -      /*
 -       * zone check is done late to avoid uselessly calculating
 -       * zone/node ids for pages that could never merge.
 -       */
 -      if (page_zone_id(page) != page_zone_id(buddy))
 -              return false;
 -
 -      VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
 -
 -      return true;
 -}
 -
  #ifdef CONFIG_COMPACTION
  static inline struct capture_control *task_capc(struct zone *zone)
  {
@@@ -976,17 -1009,18 +976,17 @@@ static inline boo
  buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
                   struct page *page, unsigned int order)
  {
 -      struct page *higher_page, *higher_buddy;
 -      unsigned long combined_pfn;
 +      unsigned long higher_page_pfn;
 +      struct page *higher_page;
  
        if (order >= MAX_ORDER - 2)
                return false;
  
 -      combined_pfn = buddy_pfn & pfn;
 -      higher_page = page + (combined_pfn - pfn);
 -      buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
 -      higher_buddy = higher_page + (buddy_pfn - combined_pfn);
 +      higher_page_pfn = buddy_pfn & pfn;
 +      higher_page = page + (higher_page_pfn - pfn);
  
 -      return page_is_buddy(higher_page, higher_buddy, order + 1);
 +      return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
 +                      NULL) != NULL;
  }
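
/*
 * The pfn relationships that find_buddy_page_pfn() packages up, shown on
 * a hypothetical order-3 page (the pfn values are only for illustration):
 *
 *   pfn        = 0x1230
 *   buddy_pfn  = pfn ^ (1 << 3)  = 0x1238   (differs only in bit 3)
 *   parent_pfn = buddy_pfn & pfn = 0x1230   (start of the order-4 block)
 *
 * buddy_merge_likely() then asks whether that order-4 page at parent_pfn
 * in turn has a free buddy (0x1230 ^ (1 << 4) = 0x1220), i.e. whether
 * freeing here is likely to keep merging upward.
 */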
  
  /*
@@@ -1019,6 -1053,7 +1019,6 @@@ static inline void __free_one_page(stru
                int migratetype, fpi_t fpi_flags)
  {
        struct capture_control *capc = task_capc(zone);
 -      unsigned int max_order = pageblock_order;
        unsigned long buddy_pfn;
        unsigned long combined_pfn;
        struct page *buddy;
        VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
        VM_BUG_ON_PAGE(bad_range(zone, page), page);
  
 -continue_merging:
 -      while (order < max_order) {
 +      while (order < MAX_ORDER - 1) {
                if (compaction_capture(capc, page, order, migratetype)) {
                        __mod_zone_freepage_state(zone, -(1 << order),
                                                                migratetype);
                        return;
                }
 -              buddy_pfn = __find_buddy_pfn(pfn, order);
 -              buddy = page + (buddy_pfn - pfn);
  
 -              if (!page_is_buddy(page, buddy, order))
 +              buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
 +              if (!buddy)
                        goto done_merging;
 +
 +              if (unlikely(order >= pageblock_order)) {
 +                      /*
 +                       * We want to prevent merge between freepages on pageblock
 +                       * without fallbacks and normal pageblock. Without this,
 +                       * pageblock isolation could cause incorrect freepage or CMA
 +                       * accounting or HIGHATOMIC accounting.
 +                       */
 +                      int buddy_mt = get_pageblock_migratetype(buddy);
 +
 +                      if (migratetype != buddy_mt
 +                                      && (!migratetype_is_mergeable(migratetype) ||
 +                                              !migratetype_is_mergeable(buddy_mt)))
 +                              goto done_merging;
 +              }
 +
                /*
                 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
                 * merge with it and move up one order.
                pfn = combined_pfn;
                order++;
        }
 -      if (order < MAX_ORDER - 1) {
 -              /* If we are here, it means order is >= pageblock_order.
 -               * We want to prevent merge between freepages on pageblock
 -               * without fallbacks and normal pageblock. Without this,
 -               * pageblock isolation could cause incorrect freepage or CMA
 -               * accounting or HIGHATOMIC accounting.
 -               *
 -               * We don't want to hit this code for the more frequent
 -               * low-order merging.
 -               */
 -              int buddy_mt;
 -
 -              buddy_pfn = __find_buddy_pfn(pfn, order);
 -              buddy = page + (buddy_pfn - pfn);
 -
 -              if (!page_is_buddy(page, buddy, order))
 -                      goto done_merging;
 -              buddy_mt = get_pageblock_migratetype(buddy);
 -
 -              if (migratetype != buddy_mt
 -                              && (!migratetype_is_mergeable(migratetype) ||
 -                                      !migratetype_is_mergeable(buddy_mt)))
 -                      goto done_merging;
 -              max_order = order + 1;
 -              goto continue_merging;
 -      }
  
  done_merging:
        set_buddy_order(page, order);
                page_reporting_notify_free(order);
  }
  
 +/**
 + * split_free_page() -- split a free page at split_pfn_offset
 + * @free_page:                the original free page
 + * @order:            the order of the page
 + * @split_pfn_offset: split offset within the page
 + *
 + * It is used when the free page crosses two pageblocks with different migratetypes
 + * at split_pfn_offset within the page. The split free page will be put into
 + * separate migratetype lists afterwards. Otherwise, the function achieves
 + * nothing.
 + */
 +void split_free_page(struct page *free_page,
 +                              int order, unsigned long split_pfn_offset)
 +{
 +      struct zone *zone = page_zone(free_page);
 +      unsigned long free_page_pfn = page_to_pfn(free_page);
 +      unsigned long pfn;
 +      unsigned long flags;
 +      int free_page_order;
 +
 +      if (split_pfn_offset == 0)
 +              return;
 +
 +      spin_lock_irqsave(&zone->lock, flags);
 +      del_page_from_free_list(free_page, zone, order);
 +      for (pfn = free_page_pfn;
 +           pfn < free_page_pfn + (1UL << order);) {
 +              int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
 +
 +              free_page_order = min_t(int,
 +                                      pfn ? __ffs(pfn) : order,
 +                                      __fls(split_pfn_offset));
 +              __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
 +                              mt, FPI_NONE);
 +              pfn += 1UL << free_page_order;
 +              split_pfn_offset -= (1UL << free_page_order);
 +              /* we have done the first part, now switch to second part */
 +              if (split_pfn_offset == 0)
 +                      split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
 +      }
 +      spin_unlock_irqrestore(&zone->lock, flags);
 +}
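
/*
 * A sketch of how the loop carves up one hypothetical case; the pfn and
 * the pageblock/MAX_ORDER values assume x86-64 defaults (pageblock_order
 * = 9, MAX_ORDER - 1 = 10), so treat the numbers as illustrative only:
 *
 *   free_page at pfn 0x10000, order = 10 (spans two pageblocks),
 *   split_pfn_offset = 512 (the pageblock boundary inside the page)
 *
 *   pfn 0x10000: free_page_order = min(__ffs(0x10000) = 16,
 *                                      __fls(512)     =  9) = 9
 *                -> __free_one_page() an order-9 chunk with pageblock 0's
 *                   migratetype; split_pfn_offset hits 0 and is reset to
 *                   the remaining 512 pages
 *   pfn 0x10200: free_page_order = min(__ffs(0x10200) = 9,
 *                                      __fls(512)     = 9) = 9
 *                -> an order-9 chunk with pageblock 1's migratetype
 *
 * Each half lands on the free list of its own pageblock, which is the
 * accounting the function exists to preserve.
 */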
  /*
   * A bad page could be due to a number of fields. Instead of multiple branches,
   * try and check multiple fields with one check. The caller must do a detailed
@@@ -2471,9 -2476,6 +2471,9 @@@ struct page *__rmqueue_smallest(struct 
                del_page_from_free_list(page, zone, current_order);
                expand(zone, page, order, current_order, migratetype);
                set_pcppage_migratetype(page, migratetype);
 +              trace_mm_page_alloc_zone_locked(page, order, migratetype,
 +                              pcp_allowed_order(order) &&
 +                              migratetype < MIGRATE_PCPTYPES);
                return page;
        }
  
@@@ -2997,7 -2999,7 +2997,7 @@@ __rmqueue(struct zone *zone, unsigned i
                    zone_page_state(zone, NR_FREE_PAGES) / 2) {
                        page = __rmqueue_cma_fallback(zone, order);
                        if (page)
 -                              goto out;
 +                              return page;
                }
        }
  retry:
                                                                alloc_flags))
                        goto retry;
        }
 -out:
 -      if (page)
 -              trace_mm_page_alloc_zone_locked(page, order, migratetype);
        return page;
  }
  
@@@ -3728,8 -3733,11 +3728,8 @@@ struct page *rmqueue(struct zone *prefe
                 * reserved for high-order atomic allocation, so order-0
                 * request should skip it.
                 */
 -              if (order > 0 && alloc_flags & ALLOC_HARDER) {
 +              if (order > 0 && alloc_flags & ALLOC_HARDER)
                        page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
 -                      if (page)
 -                              trace_mm_page_alloc_zone_locked(page, order, migratetype);
 -              }
                if (!page) {
                        page = __rmqueue(zone, order, migratetype, alloc_flags);
                        if (!page)
@@@ -3791,9 -3799,6 +3791,9 @@@ static bool __should_fail_alloc_page(gf
                        (gfp_mask & __GFP_DIRECT_RECLAIM))
                return false;
  
 +      if (gfp_mask & __GFP_NOWARN)
 +              fail_page_alloc.attr.no_warn = true;
 +
        return should_fail(&fail_page_alloc.attr, 1 << order);
  }
  
@@@ -4063,8 -4068,7 +4063,8 @@@ get_page_from_freelist(gfp_t gfp_mask, 
  {
        struct zoneref *z;
        struct zone *zone;
 -      struct pglist_data *last_pgdat_dirty_limit = NULL;
 +      struct pglist_data *last_pgdat = NULL;
 +      bool last_pgdat_dirty_ok = false;
        bool no_fallback;
  
  retry:
                 * dirty-throttling and the flusher threads.
                 */
                if (ac->spread_dirty_pages) {
 -                      if (last_pgdat_dirty_limit == zone->zone_pgdat)
 -                              continue;
 +                      if (last_pgdat != zone->zone_pgdat) {
 +                              last_pgdat = zone->zone_pgdat;
 +                              last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
 +                      }
  
 -                      if (!node_dirty_ok(zone->zone_pgdat)) {
 -                              last_pgdat_dirty_limit = zone->zone_pgdat;
 +                      if (!last_pgdat_dirty_ok)
                                continue;
 -                      }
                }
  
                if (no_fallback && nr_online_nodes > 1 &&
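
The hunk above changes what gets cached while walking the zonelist: instead of remembering only the last node that failed node_dirty_ok(), it remembers the last node looked at together with its verdict, so consecutive zones of the same node cost a single evaluation whether the answer was "ok" or "over the dirty limit". A rough user-space sketch of that caching pattern (the struct, names and the fake predicate are mine):

#include <stdbool.h>
#include <stdio.h>

struct dirty_cache {
	int last_node;		/* -1: nothing cached yet */
	bool last_ok;
};

/* stands in for the expensive node_dirty_ok(); pretend node 1 is over limit */
static bool dirty_ok_expensive(int node)
{
	printf("evaluating node %d\n", node);
	return node != 1;
}

static bool dirty_ok_cached(struct dirty_cache *c, int node)
{
	if (c->last_node != node) {
		c->last_node = node;
		c->last_ok = dirty_ok_expensive(node);	/* once per distinct node */
	}
	return c->last_ok;
}

int main(void)
{
	int zone_to_node[] = { 0, 0, 0, 1, 1, 2 };	/* zones grouped by node */
	struct dirty_cache cache = { .last_node = -1, .last_ok = false };

	for (unsigned int i = 0; i < sizeof(zone_to_node) / sizeof(*zone_to_node); i++) {
		if (!dirty_ok_cached(&cache, zone_to_node[i]))
			continue;	/* skip zones on a node over its dirty limit */
		/* ...otherwise try to allocate from this zone... */
	}
	return 0;
}

With the old scheme a node whose check succeeded was re-evaluated for every one of its zones; here the predicate runs three times for six zones.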
@@@ -4342,8 -4346,7 +4342,8 @@@ __alloc_pages_may_oom(gfp_t gfp_mask, u
         */
  
        /* Exhausted what can be done so it's blame time */
 -      if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
 +      if (out_of_memory(&oc) ||
 +          WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
                *did_some_progress = 1;
  
                /*
@@@ -4674,12 -4677,9 +4674,12 @@@ static void wake_all_kswapds(unsigned i
  
        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
                                        ac->nodemask) {
 -              if (last_pgdat != zone->zone_pgdat)
 +              if (!managed_zone(zone))
 +                      continue;
 +              if (last_pgdat != zone->zone_pgdat) {
                        wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
 -              last_pgdat = zone->zone_pgdat;
 +                      last_pgdat = zone->zone_pgdat;
 +              }
        }
  }
  
@@@ -5117,7 -5117,7 +5117,7 @@@ nopage
                 * All existing users of __GFP_NOFAIL are blockable, so warn
                 * about any new users that actually require GFP_NOWAIT
                 */
 -              if (WARN_ON_ONCE(!can_direct_reclaim))
 +              if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
                        goto fail;
  
                /*
                 * because we cannot reclaim anything and can only loop, waiting
                 * for somebody to do the work for us
                 */
 -              WARN_ON_ONCE(current->flags & PF_MEMALLOC);
 +              WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
  
                /*
                 * non-failing costly orders are a hard requirement which we are
                 * not really prepared for, so warn about these users so that we
                 * can identify them and convert them to something else.
                 */
 -              WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
 +              WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask);
  
                /*
                 * Help non-failing allocations by giving them access to memory
@@@ -5324,8 -5324,8 +5324,8 @@@ unsigned long __alloc_pages_bulk(gfp_t 
                page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
                                                                pcp, pcp_list);
                if (unlikely(!page)) {
-                       /* Try and get at least one page */
-                       if (!nr_populated)
+                       /* Try and allocate at least one page */
+                       if (!nr_account)
                                goto failed_irq;
                        break;
                }
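
The one-word change above matters because nr_populated also counts array slots the caller had already filled, while nr_account counts only pages allocated by this call; keying the "give the caller at least one page" fallback off nr_populated meant a pre-filled slot could hide a first-allocation failure entirely. A toy user-space model of the two counters diverging (entirely hypothetical code, not the kernel loop):

#include <stdio.h>

/* returns the number of newly allocated "pages", or -1 meaning "fall back
 * to the single-page allocation path" */
static int bulk_fill(int prefilled, int want, int pcp_available)
{
	int nr_populated = prefilled;	/* includes the caller's pre-filled slots */
	int nr_account = 0;		/* pages allocated by this call only */

	while (nr_populated < want) {
		if (pcp_available == 0) {
			if (!nr_account)	/* the fixed check */
				return -1;	/* let the caller try one page */
			break;			/* a partial bulk result is fine */
		}
		pcp_available--;
		nr_account++;
		nr_populated++;
	}
	return nr_account;
}

int main(void)
{
	/* per-CPU list empty and one slot pre-filled: the old "!nr_populated"
	 * check broke out here with zero new pages instead of falling back */
	printf("%d\n", bulk_fill(1, 8, 0));	/* -1: fall back */
	printf("%d\n", bulk_fill(0, 8, 3));	/*  3: partial bulk result */
	return 0;
}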
@@@ -5379,8 -5379,10 +5379,8 @@@ struct page *__alloc_pages(gfp_t gfp, u
         * There are several places where we assume that the order value is sane
         * so bail out early if the request is out of bounds.
         */
 -      if (unlikely(order >= MAX_ORDER)) {
 -              WARN_ON_ONCE(!(gfp & __GFP_NOWARN));
 +      if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp))
                return NULL;
 -      }
  
        gfp &= gfp_allowed_mask;
        /*
@@@ -6169,6 -6171,7 +6169,6 @@@ int numa_zonelist_order_handler(struct 
  }
  
  
 -#define MAX_NODE_LOAD (nr_online_nodes)
  static int node_load[MAX_NUMNODES];
  
  /**
@@@ -6215,7 -6218,7 +6215,7 @@@ int find_next_best_node(int node, nodem
                        val += PENALTY_FOR_NODE_WITH_CPUS;
  
                /* Slight preference for less loaded node */
 -              val *= (MAX_NODE_LOAD*MAX_NUMNODES);
 +              val *= MAX_NUMNODES;
                val += node_load[n];
  
                if (val < min_val) {
@@@ -6281,12 -6284,13 +6281,12 @@@ static void build_thisnode_zonelists(pg
  static void build_zonelists(pg_data_t *pgdat)
  {
        static int node_order[MAX_NUMNODES];
 -      int node, load, nr_nodes = 0;
 +      int node, nr_nodes = 0;
        nodemask_t used_mask = NODE_MASK_NONE;
        int local_node, prev_node;
  
        /* NUMA-aware ordering of nodes */
        local_node = pgdat->node_id;
 -      load = nr_online_nodes;
        prev_node = local_node;
  
        memset(node_order, 0, sizeof(node_order));
                 */
                if (node_distance(local_node, node) !=
                    node_distance(local_node, prev_node))
 -                      node_load[node] += load;
 +                      node_load[node] += 1;
  
                node_order[nr_nodes++] = node;
                prev_node = node;
 -              load--;
        }
  
        build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
@@@ -6640,21 -6645,6 +6640,21 @@@ static void __ref __init_zone_device_pa
        }
  }
  
 +/*
 + * With compound page geometry, and when struct pages are stored in RAM, most
 + * tail pages are reused. Consequently, the number of unique struct pages to
 + * initialize is a lot smaller than the total number of struct pages being
 + * mapped. This is a mild layering violation paired with explicit knowledge of
 + * how the sparse vmemmap internals handle compound pages in the absence
 + * of an altmap. See vmemmap_populate_compound_pages().
 + */
 +static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
 +                                            unsigned long nr_pages)
 +{
 +      return is_power_of_2(sizeof(struct page)) &&
 +              !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
 +}
 +
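
In the common case the helper above boils down to a constant: when sizeof(struct page) is a power of two and there is no altmap, only the first two vmemmap pages of a compound devmap appear to be backed by unique memory (as I read vmemmap_populate_compound_pages()), so 2 * (PAGE_SIZE / sizeof(struct page)) struct pages are initialized per compound page regardless of its size. A quick user-space check of that arithmetic, assuming the typical 4 KiB PAGE_SIZE and 64-byte struct page (both figures are assumptions, not taken from this patch):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	unsigned long struct_page_size = 64;	/* assumed sizeof(struct page) */
	unsigned long unique = 2 * (page_size / struct_page_size);

	printf("unique struct pages initialised per compound page: %lu\n",
	       unique);				/* 128 */
	printf("struct pages covered by a 2 MiB compound page: %lu\n",
	       (2UL << 20) / page_size);	/* 512 */
	printf("struct pages covered by a 1 GiB compound page: %lu\n",
	       (1UL << 30) / page_size);	/* 262144 */
	return 0;
}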
  static void __ref memmap_init_compound(struct page *head,
                                       unsigned long head_pfn,
                                       unsigned long zone_idx, int nid,
@@@ -6719,7 -6709,7 +6719,7 @@@ void __ref memmap_init_zone_device(stru
                        continue;
  
                memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
 -                                   pfns_per_compound);
 +                                   compound_nr_pages(altmap, pfns_per_compound));
        }
  
        pr_info("%s initialised %lu pages in %ums\n", __func__,
@@@ -7880,7 -7870,7 +7880,7 @@@ static void __init find_zone_movable_pf
  
                        usable_startpfn = memblock_region_memory_base_pfn(r);
  
 -                      if (usable_startpfn < 0x100000) {
 +                      if (usable_startpfn < PHYS_PFN(SZ_4G)) {
                                mem_below_4gb_not_mirrored = true;
                                continue;
                        }
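
The symbolic form replaces a magic PFN: assuming the usual PAGE_SHIFT of 12, PHYS_PFN(SZ_4G) is exactly the 0x100000 literal it stands in for. A one-line compile-time check of that equivalence (user-space, with the 4 KiB page assumption stated above):

int main(void)
{
	_Static_assert(((4ULL << 30) >> 12) == 0x100000,
		       "4 GiB expressed in 4 KiB page frames");
	return 0;
}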
@@@ -8959,7 -8949,136 +8959,7 @@@ void *__init alloc_large_system_hash(co
        return table;
  }
  
 -/*
 - * This function checks whether pageblock includes unmovable pages or not.
 - *
 - * PageLRU check without isolation or lru_lock could race so that
 - * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
 - * check without lock_page also may miss some movable non-lru pages at
 - * race condition. So you can't expect this function should be exact.
 - *
 - * Returns a page without holding a reference. If the caller wants to
 - * dereference that page (e.g., dumping), it has to make sure that it
 - * cannot get removed (e.g., via memory unplug) concurrently.
 - *
 - */
 -struct page *has_unmovable_pages(struct zone *zone, struct page *page,
 -                               int migratetype, int flags)
 -{
 -      unsigned long iter = 0;
 -      unsigned long pfn = page_to_pfn(page);
 -      unsigned long offset = pfn % pageblock_nr_pages;
 -
 -      if (is_migrate_cma_page(page)) {
 -              /*
 -               * CMA allocations (alloc_contig_range) really need to mark
 -               * isolate CMA pageblocks even when they are not movable in fact
 -               * so consider them movable here.
 -               */
 -              if (is_migrate_cma(migratetype))
 -                      return NULL;
 -
 -              return page;
 -      }
 -
 -      for (; iter < pageblock_nr_pages - offset; iter++) {
 -              page = pfn_to_page(pfn + iter);
 -
 -              /*
 -               * Both, bootmem allocations and memory holes are marked
 -               * PG_reserved and are unmovable. We can even have unmovable
 -               * allocations inside ZONE_MOVABLE, for example when
 -               * specifying "movablecore".
 -               */
 -              if (PageReserved(page))
 -                      return page;
 -
 -              /*
 -               * If the zone is movable and we have ruled out all reserved
 -               * pages then it should be reasonably safe to assume the rest
 -               * is movable.
 -               */
 -              if (zone_idx(zone) == ZONE_MOVABLE)
 -                      continue;
 -
 -              /*
 -               * Hugepages are not in LRU lists, but they're movable.
 -               * THPs are on the LRU, but need to be counted as #small pages.
 -               * We need not scan over tail pages because we don't
 -               * handle each tail page individually in migration.
 -               */
 -              if (PageHuge(page) || PageTransCompound(page)) {
 -                      struct page *head = compound_head(page);
 -                      unsigned int skip_pages;
 -
 -                      if (PageHuge(page)) {
 -                              if (!hugepage_migration_supported(page_hstate(head)))
 -                                      return page;
 -                      } else if (!PageLRU(head) && !__PageMovable(head)) {
 -                              return page;
 -                      }
 -
 -                      skip_pages = compound_nr(head) - (page - head);
 -                      iter += skip_pages - 1;
 -                      continue;
 -              }
 -
 -              /*
 -               * We can't use page_count without pin a page
 -               * because another CPU can free compound page.
 -               * This check already skips compound tails of THP
 -               * because their page->_refcount is zero at all time.
 -               */
 -              if (!page_ref_count(page)) {
 -                      if (PageBuddy(page))
 -                              iter += (1 << buddy_order(page)) - 1;
 -                      continue;
 -              }
 -
 -              /*
 -               * The HWPoisoned page may be not in buddy system, and
 -               * page_count() is not 0.
 -               */
 -              if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
 -                      continue;
 -
 -              /*
 -               * We treat all PageOffline() pages as movable when offlining
 -               * to give drivers a chance to decrement their reference count
 -               * in MEM_GOING_OFFLINE in order to indicate that these pages
 -               * can be offlined as there are no direct references anymore.
 -               * For actually unmovable PageOffline() where the driver does
 -               * not support this, we will fail later when trying to actually
 -               * move these pages that still have a reference count > 0.
 -               * (false negatives in this function only)
 -               */
 -              if ((flags & MEMORY_OFFLINE) && PageOffline(page))
 -                      continue;
 -
 -              if (__PageMovable(page) || PageLRU(page))
 -                      continue;
 -
 -              /*
 -               * If there are RECLAIMABLE pages, we need to check
 -               * it.  But now, memory offline itself doesn't call
 -               * shrink_node_slabs() and it still to be fixed.
 -               */
 -              return page;
 -      }
 -      return NULL;
 -}
 -
  #ifdef CONFIG_CONTIG_ALLOC
 -static unsigned long pfn_max_align_down(unsigned long pfn)
 -{
 -      return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
 -}
 -
 -static unsigned long pfn_max_align_up(unsigned long pfn)
 -{
 -      return ALIGN(pfn, MAX_ORDER_NR_PAGES);
 -}
 -
  #if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
  /* Usage: See admin-guide/dynamic-debug-howto.rst */
@@@ -8982,7 -9101,7 +8982,7 @@@ static inline void alloc_contig_dump_pa
  #endif
  
  /* [start, end) must belong to a single zone. */
 -static int __alloc_contig_migrate_range(struct compact_control *cc,
 +int __alloc_contig_migrate_range(struct compact_control *cc,
                                        unsigned long start, unsigned long end)
  {
        /* This function is based on compact_zone() from compaction.c. */
  
        lru_cache_enable();
        if (ret < 0) {
 -              if (ret == -EBUSY)
 +              if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
                        alloc_contig_dump_pages(&cc->migratepages);
                putback_movable_pages(&cc->migratepages);
                return ret;
   *                    be either of the two.
   * @gfp_mask: GFP mask to use during compaction
   *
 - * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
 - * aligned.  The PFN range must belong to a single zone.
 + * The PFN range does not have to be pageblock aligned. The PFN range must
 + * belong to a single zone.
   *
   * The first thing this routine does is attempt to MIGRATE_ISOLATE all
   * pageblocks in the range.  Once isolated, the pageblocks should not
@@@ -9065,7 -9184,7 +9065,7 @@@ int alloc_contig_range(unsigned long st
                       unsigned migratetype, gfp_t gfp_mask)
  {
        unsigned long outer_start, outer_end;
 -      unsigned int order;
 +      int order;
        int ret = 0;
  
        struct compact_control cc = {
         * What we do here is we mark all pageblocks in range as
         * MIGRATE_ISOLATE.  Because pageblock and max order pages may
         * have different sizes, and due to the way page allocator
 -       * work, we align the range to biggest of the two pages so
 -       * that page allocator won't try to merge buddies from
 -       * different pageblocks and change MIGRATE_ISOLATE to some
 -       * other migration type.
 +       * works, start_isolate_page_range() has special handling for this.
         *
         * Once the pageblocks are marked as MIGRATE_ISOLATE, we
         * migrate the pages from an unaligned range (ie. pages that
 -       * we are interested in).  This will put all the pages in
 +       * we are interested in). This will put all the pages in
         * range back to page allocator as MIGRATE_ISOLATE.
         *
         * When this is done, we take the pages in range from page
         * put back to page allocator so that buddy can use them.
         */
  
 -      ret = start_isolate_page_range(pfn_max_align_down(start),
 -                                     pfn_max_align_up(end), migratetype, 0);
 +      ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
        if (ret)
 -              return ret;
 +              goto done;
  
        drain_all_pages(cc.zone);
  
        ret = 0;
  
        /*
 -       * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
 +       * Pages from [start, end) are within a pageblock_nr_pages
         * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
         * more, all pages in [start, end) are free in page allocator.
         * What we are going to do is to allocate all pages from
                free_contig_range(end, outer_end - end);
  
  done:
 -      undo_isolate_page_range(pfn_max_align_down(start),
 -                              pfn_max_align_up(end), migratetype);
 +      undo_isolate_page_range(start, end, migratetype);
        return ret;
  }
  EXPORT_SYMBOL(alloc_contig_range);
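
With the pfn_max_align_down()/pfn_max_align_up() helpers gone, the range handed to start_isolate_page_range()/undo_isolate_page_range() is the caller's own [start, end); as I understand the series, the pageblock-level alignment (and the splitting of any free page straddling a boundary via split_free_page()) now happens inside the isolation code. A small user-space comparison of how much extra area the two alignments would cover, assuming MAX_ORDER_NR_PAGES == 1024 and pageblock_nr_pages == 512 as on a typical x86_64 configuration (both values are assumptions here):

#include <stdio.h>

#define ALIGN_DOWN(x, a)	((x) & ~((unsigned long)(a) - 1))
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long start = 1000, end = 3000;	/* arbitrary pfn range */
	unsigned long max_order_nr = 1024, pageblock_nr = 512;

	printf("old (MAX_ORDER aligned): [%lu, %lu)\n",
	       ALIGN_DOWN(start, max_order_nr), ALIGN_UP(end, max_order_nr));
	printf("new (pageblock aligned): [%lu, %lu)\n",
	       ALIGN_DOWN(start, pageblock_nr), ALIGN_UP(end, pageblock_nr));
	return 0;
}

The old rounding isolates [0, 3072) for this sample range, the pageblock rounding only [512, 3072), which is why the change helps CMA and contiguous allocations succeed with less collateral isolation.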
@@@ -9501,6 -9625,7 +9501,6 @@@ bool put_page_back_buddy(struct page *p
                ClearPageHWPoisonTakenOff(page);
                __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
                if (TestClearPageHWPoison(page)) {
 -                      num_poisoned_pages_dec();
                        ret = true;
                }
        }
diff --combined mm/page_table_check.c
@@@ -52,6 -52,23 +52,6 @@@ static struct page_table_check *get_pag
        return (void *)(page_ext) + page_table_check_ops.offset;
  }
  
 -static inline bool pte_user_accessible_page(pte_t pte)
 -{
 -      return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER);
 -}
 -
 -static inline bool pmd_user_accessible_page(pmd_t pmd)
 -{
 -      return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) &&
 -              (pmd_val(pmd) & _PAGE_USER);
 -}
 -
 -static inline bool pud_user_accessible_page(pud_t pud)
 -{
 -      return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) &&
 -              (pud_val(pud) & _PAGE_USER);
 -}
 -
  /*
   * An entry is removed from the page table: decrement the counters for that
   * page and verify that it is of the correct type and that the counters do
   * not become negative.
@@@ -160,7 -177,7 +160,7 @@@ void __page_table_check_pmd_clear(struc
  
        if (pmd_user_accessible_page(pmd)) {
                page_table_check_clear(mm, addr, pmd_pfn(pmd),
 -                                     PMD_PAGE_SIZE >> PAGE_SHIFT);
 +                                     PMD_SIZE >> PAGE_SHIFT);
        }
  }
  EXPORT_SYMBOL(__page_table_check_pmd_clear);
@@@ -173,7 -190,7 +173,7 @@@ void __page_table_check_pud_clear(struc
  
        if (pud_user_accessible_page(pud)) {
                page_table_check_clear(mm, addr, pud_pfn(pud),
 -                                     PUD_PAGE_SIZE >> PAGE_SHIFT);
 +                                     PUD_SIZE >> PAGE_SHIFT);
        }
  }
  EXPORT_SYMBOL(__page_table_check_pud_clear);
@@@ -202,7 -219,7 +202,7 @@@ void __page_table_check_pmd_set(struct 
        __page_table_check_pmd_clear(mm, addr, *pmdp);
        if (pmd_user_accessible_page(pmd)) {
                page_table_check_set(mm, addr, pmd_pfn(pmd),
 -                                   PMD_PAGE_SIZE >> PAGE_SHIFT,
 +                                   PMD_SIZE >> PAGE_SHIFT,
                                     pmd_write(pmd));
        }
  }
@@@ -217,7 -234,7 +217,7 @@@ void __page_table_check_pud_set(struct 
        __page_table_check_pud_clear(mm, addr, *pudp);
        if (pud_user_accessible_page(pud)) {
                page_table_check_set(mm, addr, pud_pfn(pud),
 -                                   PUD_PAGE_SIZE >> PAGE_SHIFT,
 +                                   PUD_SIZE >> PAGE_SHIFT,
                                     pud_write(pud));
        }
  }
@@@ -234,11 -251,11 +234,11 @@@ void __page_table_check_pte_clear_range
                pte_t *ptep = pte_offset_map(&pmd, addr);
                unsigned long i;
  
-               pte_unmap(ptep);
                for (i = 0; i < PTRS_PER_PTE; i++) {
                        __page_table_check_pte_clear(mm, addr, *ptep);
                        addr += PAGE_SIZE;
                        ptep++;
                }
+               pte_unmap(ptep - PTRS_PER_PTE);
        }
  }
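
The fix above is purely about ordering: pte_offset_map() can create a temporary kernel mapping for the PTE page (for example with CONFIG_HIGHPTE), so every *ptep dereference has to happen before pte_unmap(), and the unmap must be given the pointer that was originally mapped. Restated as a minimal kernel-style pattern (a sketch mirroring the corrected loop, not standalone buildable code):

pte_t *ptep = pte_offset_map(&pmd, addr);	/* may kmap the PTE page */
unsigned long i;

for (i = 0; i < PTRS_PER_PTE; i++) {
	__page_table_check_pte_clear(mm, addr, *ptep);	/* needs the mapping */
	addr += PAGE_SIZE;
	ptep++;
}
pte_unmap(ptep - PTRS_PER_PTE);	/* unmap the pointer pte_offset_map() returned */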