Merge tag 'mm-hotfixes-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kerne...
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 27 May 2022 18:29:35 +0000 (11:29 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 27 May 2022 18:29:35 +0000 (11:29 -0700)
Pull hotfixes from Andrew Morton:
 "Six hotfixes.

  The page_table_check one from Miaohe Lin is considered a minor thing
  so it isn't marked for -stable. The remainder address pre-5.19 issues
  and are cc:stable"

* tag 'mm-hotfixes-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
  mm/page_table_check: fix accessing unmapped ptep
  kexec_file: drop weak attribute from arch_kexec_apply_relocations[_add]
  mm/page_alloc: always attempt to allocate at least one page during bulk allocation
  hugetlb: fix huge_pmd_unshare address update
  zsmalloc: fix races between asynchronous zspage free and page migration
  Revert "mm/cma.c: remove redundant cma_mutex lock"

mm/hugetlb.c
mm/page_alloc.c
mm/page_table_check.c

diff --combined mm/hugetlb.c
@@@ -370,7 -370,7 +370,7 @@@ static void coalesce_file_region(struc
  }
  
  static inline long
 -hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
 +hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
                     long to, struct hstate *h, struct hugetlb_cgroup *cg,
                     long *regions_needed)
  {
        if (!regions_needed) {
                nrg = get_file_region_entry_from_cache(map, from, to);
                record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
 -              list_add(&nrg->link, rg->link.prev);
 +              list_add(&nrg->link, rg);
                coalesce_file_region(map, nrg);
        } else
                *regions_needed += 1;
@@@ -402,52 -402,47 +402,52 @@@ static long add_reservation_in_range(st
        long add = 0;
        struct list_head *head = &resv->regions;
        long last_accounted_offset = f;
 -      struct file_region *rg = NULL, *trg = NULL;
 +      struct file_region *iter, *trg = NULL;
 +      struct list_head *rg = NULL;
  
        if (regions_needed)
                *regions_needed = 0;
  
        /* In this loop, we essentially handle an entry for the range
 -       * [last_accounted_offset, rg->from), at every iteration, with some
 +       * [last_accounted_offset, iter->from), at every iteration, with some
         * bounds checking.
         */
 -      list_for_each_entry_safe(rg, trg, head, link) {
 +      list_for_each_entry_safe(iter, trg, head, link) {
                /* Skip irrelevant regions that start before our range. */
 -              if (rg->from < f) {
 +              if (iter->from < f) {
                        /* If this region ends after the last accounted offset,
                         * then we need to update last_accounted_offset.
                         */
 -                      if (rg->to > last_accounted_offset)
 -                              last_accounted_offset = rg->to;
 +                      if (iter->to > last_accounted_offset)
 +                              last_accounted_offset = iter->to;
                        continue;
                }
  
                /* When we find a region that starts beyond our range, we've
                 * finished.
                 */
 -              if (rg->from >= t)
 +              if (iter->from >= t) {
 +                      rg = iter->link.prev;
                        break;
 +              }
  
 -              /* Add an entry for last_accounted_offset -> rg->from, and
 +              /* Add an entry for last_accounted_offset -> iter->from, and
                 * update last_accounted_offset.
                 */
 -              if (rg->from > last_accounted_offset)
 -                      add += hugetlb_resv_map_add(resv, rg,
 +              if (iter->from > last_accounted_offset)
 +                      add += hugetlb_resv_map_add(resv, iter->link.prev,
                                                    last_accounted_offset,
 -                                                  rg->from, h, h_cg,
 +                                                  iter->from, h, h_cg,
                                                    regions_needed);
  
 -              last_accounted_offset = rg->to;
 +              last_accounted_offset = iter->to;
        }
  
        /* Handle the case where our range extends beyond
         * last_accounted_offset.
         */
 +      if (!rg)
 +              rg = head->prev;
        if (last_accounted_offset < t)
                add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
                                            t, h, h_cg, regions_needed);
@@@ -1540,7 -1535,7 +1540,7 @@@ static void __update_and_free_page(stru
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
  
 -      if (alloc_huge_page_vmemmap(h, page)) {
 +      if (hugetlb_vmemmap_alloc(h, page)) {
                spin_lock_irq(&hugetlb_lock);
                /*
                 * If we cannot allocate vmemmap pages, just refuse to free the
@@@ -1617,7 -1612,7 +1617,7 @@@ static DECLARE_WORK(free_hpage_work, fr
  
  static inline void flush_free_hpage_work(struct hstate *h)
  {
 -      if (free_vmemmap_pages_per_hpage(h))
 +      if (hugetlb_optimize_vmemmap_pages(h))
                flush_work(&free_hpage_work);
  }
  
@@@ -1677,8 -1672,6 +1677,8 @@@ void free_huge_page(struct page *page
        VM_BUG_ON_PAGE(page_mapcount(page), page);
  
        hugetlb_set_page_subpool(page, NULL);
 +      if (PageAnon(page))
 +              __ClearPageAnonExclusive(page);
        page->mapping = NULL;
        restore_reserve = HPageRestoreReserve(page);
        ClearHPageRestoreReserve(page);
@@@ -1739,7 -1732,7 +1739,7 @@@ static void __prep_account_new_huge_pag
  
  static void __prep_new_huge_page(struct hstate *h, struct page *page)
  {
 -      free_huge_page_vmemmap(h, page);
 +      hugetlb_vmemmap_free(h, page);
        INIT_LIST_HEAD(&page->lru);
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        hugetlb_set_page_subpool(page, NULL);
@@@ -2112,7 -2105,7 +2112,7 @@@ retry
                 * Attempt to allocate vmemmmap here so that we can take
                 * appropriate action on failure.
                 */
 -              rc = alloc_huge_page_vmemmap(h, head);
 +              rc = hugetlb_vmemmap_alloc(h, head);
                if (!rc) {
                        /*
                         * Move PageHWPoison flag from head page to the raw
@@@ -2986,6 -2979,8 +2986,6 @@@ int __alloc_bootmem_huge_page(struct hs
        struct huge_bootmem_page *m = NULL; /* initialize for clang */
        int nr_nodes, node;
  
 -      if (nid != NUMA_NO_NODE && nid >= nr_online_nodes)
 -              return 0;
        /* do node specific alloc */
        if (nid != NUMA_NO_NODE) {
                m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
@@@ -3093,7 -3088,7 +3093,7 @@@ static void __init hugetlb_hstate_alloc
        }
  
        /* do node specific alloc */
 -      for (i = 0; i < nr_online_nodes; i++) {
 +      for_each_online_node(i) {
                if (h->max_huge_pages_node[i] > 0) {
                        hugetlb_hstate_alloc_pages_onenode(h, i);
                        node_specific_alloc = true;
@@@ -3425,7 -3420,7 +3425,7 @@@ static int demote_free_huge_page(struc
        remove_hugetlb_page_for_demote(h, page, false);
        spin_unlock_irq(&hugetlb_lock);
  
 -      rc = alloc_huge_page_vmemmap(h, page);
 +      rc = hugetlb_vmemmap_alloc(h, page);
        if (rc) {
                /* Allocation of vmemmmap failed, we can not demote page */
                spin_lock_irq(&hugetlb_lock);
@@@ -4057,7 -4052,7 +4057,7 @@@ static int __init hugetlb_init(void
                        default_hstate.max_huge_pages =
                                default_hstate_max_huge_pages;
  
 -                      for (i = 0; i < nr_online_nodes; i++)
 +                      for_each_online_node(i)
                                default_hstate.max_huge_pages_node[i] =
                                        default_hugepages_in_node[i];
                }
@@@ -4124,20 -4119,6 +4124,20 @@@ bool __init __weak hugetlb_node_alloc_s
  {
        return true;
  }
 +
 +static void __init hugepages_clear_pages_in_node(void)
 +{
 +      if (!hugetlb_max_hstate) {
 +              default_hstate_max_huge_pages = 0;
 +              memset(default_hugepages_in_node, 0,
 +                      MAX_NUMNODES * sizeof(unsigned int));
 +      } else {
 +              parsed_hstate->max_huge_pages = 0;
 +              memset(parsed_hstate->max_huge_pages_node, 0,
 +                      MAX_NUMNODES * sizeof(unsigned int));
 +      }
 +}
 +
  /*
   * hugepages command line processing
   * hugepages normally follows a valid hugepagsz or default_hugepagsz
@@@ -4157,7 -4138,7 +4157,7 @@@ static int __init hugepages_setup(char 
        if (!parsed_valid_hugepagesz) {
                pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
                parsed_valid_hugepagesz = true;
 -              return 0;
 +              return 1;
        }
  
        /*
  
        if (mhp == last_mhp) {
                pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
 -              return 0;
 +              return 1;
        }
  
        while (*p) {
                if (p[count] == ':') {
                        if (!hugetlb_node_alloc_supported()) {
                                pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
 -                              return 0;
 +                              return 1;
                        }
 -                      if (tmp >= nr_online_nodes)
 +                      if (tmp >= MAX_NUMNODES || !node_online(tmp))
                                goto invalid;
 -                      node = array_index_nospec(tmp, nr_online_nodes);
 +                      node = array_index_nospec(tmp, MAX_NUMNODES);
                        p += count + 1;
                        /* Parse hugepages */
                        if (sscanf(p, "%lu%n", &tmp, &count) != 1)
  
  invalid:
        pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
 -      return 0;
 +      hugepages_clear_pages_in_node();
 +      return 1;
  }
  __setup("hugepages=", hugepages_setup);
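
/*
 * Illustrative aside, not part of this diff: the return-value changes in
 * hugepages_setup() and friends (0 -> 1) follow the __setup() convention
 * that a handler returns 1 once it has consumed the option -- even a
 * malformed one it only warned about -- so early parameter parsing does
 * not report "hugepages=" as unknown or pass it on to init. A minimal,
 * hypothetical handler written to that convention (example_setup and
 * example_pages are invented names):
 */
static unsigned long example_pages __initdata;

static int __init example_setup(char *s)
{
	unsigned long val;

	if (kstrtoul(s, 0, &val)) {
		pr_warn("example=%s is invalid, ignoring\n", s);
		return 1;	/* handled (and rejected) here */
	}
	example_pages = val;
	return 1;		/* consumed; nothing left for other handlers */
}
__setup("example=", example_setup);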
  
@@@ -4247,7 -4227,7 +4247,7 @@@ static int __init hugepagesz_setup(cha
  
        if (!arch_hugetlb_valid_size(size)) {
                pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
 -              return 0;
 +              return 1;
        }
  
        h = size_to_hstate(size);
                if (!parsed_default_hugepagesz ||  h != &default_hstate ||
                    default_hstate.max_huge_pages) {
                        pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
 -                      return 0;
 +                      return 1;
                }
  
                /*
@@@ -4293,14 -4273,14 +4293,14 @@@ static int __init default_hugepagesz_se
        parsed_valid_hugepagesz = false;
        if (parsed_default_hugepagesz) {
                pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
 -              return 0;
 +              return 1;
        }
  
        size = (unsigned long)memparse(s, NULL);
  
        if (!arch_hugetlb_valid_size(size)) {
                pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
 -              return 0;
 +              return 1;
        }
  
        hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
         */
        if (default_hstate_max_huge_pages) {
                default_hstate.max_huge_pages = default_hstate_max_huge_pages;
 -              for (i = 0; i < nr_online_nodes; i++)
 +              for_each_online_node(i)
                        default_hstate.max_huge_pages_node[i] =
                                default_hugepages_in_node[i];
                if (hstate_is_gigantic(&default_hstate))
@@@ -4719,27 -4699,24 +4719,27 @@@ hugetlb_install_page(struct vm_area_str
  }
  
  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 -                          struct vm_area_struct *vma)
 +                          struct vm_area_struct *dst_vma,
 +                          struct vm_area_struct *src_vma)
  {
        pte_t *src_pte, *dst_pte, entry, dst_entry;
        struct page *ptepage;
        unsigned long addr;
 -      bool cow = is_cow_mapping(vma->vm_flags);
 -      struct hstate *h = hstate_vma(vma);
 +      bool cow = is_cow_mapping(src_vma->vm_flags);
 +      struct hstate *h = hstate_vma(src_vma);
        unsigned long sz = huge_page_size(h);
        unsigned long npages = pages_per_huge_page(h);
 -      struct address_space *mapping = vma->vm_file->f_mapping;
 +      struct address_space *mapping = src_vma->vm_file->f_mapping;
        struct mmu_notifier_range range;
        int ret = 0;
  
        if (cow) {
 -              mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
 -                                      vma->vm_start,
 -                                      vma->vm_end);
 +              mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
 +                                      src_vma->vm_start,
 +                                      src_vma->vm_end);
                mmu_notifier_invalidate_range_start(&range);
 +              mmap_assert_write_locked(src);
 +              raw_write_seqcount_begin(&src->write_protect_seq);
        } else {
                /*
                 * For shared mappings i_mmap_rwsem must be held to call
                i_mmap_lock_read(mapping);
        }
  
 -      for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 +      for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
                spinlock_t *src_ptl, *dst_ptl;
                src_pte = huge_pte_offset(src, addr, sz);
                if (!src_pte)
                        continue;
 -              dst_pte = huge_pte_alloc(dst, vma, addr, sz);
 +              dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
                        break;
@@@ -4790,9 -4767,8 +4790,9 @@@ again
                } else if (unlikely(is_hugetlb_entry_migration(entry) ||
                                    is_hugetlb_entry_hwpoisoned(entry))) {
                        swp_entry_t swp_entry = pte_to_swp_entry(entry);
 +                      bool uffd_wp = huge_pte_uffd_wp(entry);
  
 -                      if (is_writable_migration_entry(swp_entry) && cow) {
 +                      if (!is_readable_migration_entry(swp_entry) && cow) {
                                /*
                                 * COW mappings require pages in both
                                 * parent and child to be set to read.
                                swp_entry = make_readable_migration_entry(
                                                        swp_offset(swp_entry));
                                entry = swp_entry_to_pte(swp_entry);
 +                              if (userfaultfd_wp(src_vma) && uffd_wp)
 +                                      entry = huge_pte_mkuffd_wp(entry);
                                set_huge_swap_pte_at(src, addr, src_pte,
                                                     entry, sz);
                        }
 +                      if (!userfaultfd_wp(dst_vma) && uffd_wp)
 +                              entry = huge_pte_clear_uffd_wp(entry);
                        set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
 +              } else if (unlikely(is_pte_marker(entry))) {
 +                      /*
 +                       * We copy the pte marker only if the dst vma has
 +                       * uffd-wp enabled.
 +                       */
 +                      if (userfaultfd_wp(dst_vma))
 +                              set_huge_pte_at(dst, addr, dst_pte, entry);
                } else {
                        entry = huge_ptep_get(src_pte);
                        ptepage = pte_page(entry);
                        get_page(ptepage);
  
                        /*
 -                       * This is a rare case where we see pinned hugetlb
 -                       * pages while they're prone to COW.  We need to do the
 -                       * COW earlier during fork.
 +                       * Failing to duplicate the anon rmap is a rare case
 +                       * where we see pinned hugetlb pages while they're
 +                       * prone to COW. We need to do the COW earlier during
 +                       * fork.
                         *
                         * When pre-allocating the page or copying data, we
                         * need to be without the pgtable locks since we could
                         * sleep during the process.
                         */
 -                      if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
 +                      if (!PageAnon(ptepage)) {
 +                              page_dup_file_rmap(ptepage, true);
 +                      } else if (page_try_dup_anon_rmap(ptepage, true,
 +                                                        src_vma)) {
                                pte_t src_pte_old = entry;
                                struct page *new;
  
                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                /* Do not use reserve as it's private owned */
 -                              new = alloc_huge_page(vma, addr, 1);
 +                              new = alloc_huge_page(dst_vma, addr, 1);
                                if (IS_ERR(new)) {
                                        put_page(ptepage);
                                        ret = PTR_ERR(new);
                                        break;
                                }
 -                              copy_user_huge_page(new, ptepage, addr, vma,
 +                              copy_user_huge_page(new, ptepage, addr, dst_vma,
                                                    npages);
                                put_page(ptepage);
  
                                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                                entry = huge_ptep_get(src_pte);
                                if (!pte_same(src_pte_old, entry)) {
 -                                      restore_reserve_on_error(h, vma, addr,
 +                                      restore_reserve_on_error(h, dst_vma, addr,
                                                                new);
                                        put_page(new);
                                        /* dst_entry won't change as in child */
                                        goto again;
                                }
 -                              hugetlb_install_page(vma, dst_pte, addr, new);
 +                              hugetlb_install_page(dst_vma, dst_pte, addr, new);
                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                continue;
                                entry = huge_pte_wrprotect(entry);
                        }
  
 -                      page_dup_rmap(ptepage, true);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                        hugetlb_count_add(npages, dst);
                }
                spin_unlock(dst_ptl);
        }
  
 -      if (cow)
 +      if (cow) {
 +              raw_write_seqcount_end(&src->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
 -      else
 +      } else {
                i_mmap_unlock_read(mapping);
 +      }
  
        return ret;
  }
@@@ -4936,17 -4896,10 +4936,17 @@@ int move_hugetlb_page_tables(struct vm_
        unsigned long old_addr_copy;
        pte_t *src_pte, *dst_pte;
        struct mmu_notifier_range range;
 +      bool shared_pmd = false;
  
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
                                old_end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 +      /*
 +       * In case of shared PMDs, we should cover the maximum possible
 +       * range.
 +       */
 +      flush_cache_range(vma, range.start, range.end);
 +
        mmu_notifier_invalidate_range_start(&range);
        /* Prevent race with file truncation */
        i_mmap_lock_write(mapping);
                 */
                old_addr_copy = old_addr;
  
 -              if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
 +              if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
 +                      shared_pmd = true;
                        continue;
 +              }
  
                dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
                if (!dst_pte)
  
                move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
        }
 -      flush_tlb_range(vma, old_end - len, old_end);
 +
 +      if (shared_pmd)
 +              flush_tlb_range(vma, range.start, range.end);
 +      else
 +              flush_tlb_range(vma, old_end - len, old_end);
        mmu_notifier_invalidate_range_end(&range);
        i_mmap_unlock_write(mapping);
  
  
  static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                                   unsigned long start, unsigned long end,
 -                                 struct page *ref_page)
 +                                 struct page *ref_page, zap_flags_t zap_flags)
  {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
                 * unmapped and its refcount is dropped, so just clear pte here.
                 */
                if (unlikely(!pte_present(pte))) {
 -                      huge_pte_clear(mm, address, ptep, sz);
 +                      /*
 +                       * If the pte was wr-protected by uffd-wp in any of the
 +                       * swap forms, meanwhile the caller does not want to
 +                       * drop the uffd-wp bit in this zap, then replace the
 +                       * pte with a marker.
 +                       */
 +                      if (pte_swp_uffd_wp_any(pte) &&
 +                          !(zap_flags & ZAP_FLAG_DROP_MARKER))
 +                              set_huge_pte_at(mm, address, ptep,
 +                                              make_pte_marker(PTE_MARKER_UFFD_WP));
 +                      else
 +                              huge_pte_clear(mm, address, ptep, sz);
                        spin_unlock(ptl);
                        continue;
                }
                tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);
 -
 +              /* Leave a uffd-wp pte marker if needed */
 +              if (huge_pte_uffd_wp(pte) &&
 +                  !(zap_flags & ZAP_FLAG_DROP_MARKER))
 +                      set_huge_pte_at(mm, address, ptep,
 +                                      make_pte_marker(PTE_MARKER_UFFD_WP));
                hugetlb_count_sub(pages_per_huge_page(h), mm);
                page_remove_rmap(page, vma, true);
  
  
  void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                          struct vm_area_struct *vma, unsigned long start,
 -                        unsigned long end, struct page *ref_page)
 +                        unsigned long end, struct page *ref_page,
 +                        zap_flags_t zap_flags)
  {
 -      __unmap_hugepage_range(tlb, vma, start, end, ref_page);
 +      __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
  
        /*
         * Clear this flag so that x86's huge_pmd_share page_table_shareable
  }
  
  void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 -                        unsigned long end, struct page *ref_page)
 +                        unsigned long end, struct page *ref_page,
 +                        zap_flags_t zap_flags)
  {
        struct mmu_gather tlb;
  
        tlb_gather_mmu(&tlb, vma->vm_mm);
 -      __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
 +      __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
        tlb_finish_mmu(&tlb);
  }
  
@@@ -5200,22 -5130,21 +5200,22 @@@ static void unmap_ref_private(struct mm
                 */
                if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
                        unmap_hugepage_range(iter_vma, address,
 -                                           address + huge_page_size(h), page);
 +                                           address + huge_page_size(h), page, 0);
        }
        i_mmap_unlock_write(mapping);
  }
  
  /*
 - * Hugetlb_cow() should be called with page lock of the original hugepage held.
 + * hugetlb_wp() should be called with page lock of the original hugepage held.
   * Called with hugetlb_fault_mutex_table held and pte_page locked so we
   * cannot race with other handlers or page migration.
   * Keep the pte_same checks anyway to make transition from the mutex easier.
   */
 -static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 -                     unsigned long address, pte_t *ptep,
 +static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 +                     unsigned long address, pte_t *ptep, unsigned int flags,
                       struct page *pagecache_page, spinlock_t *ptl)
  {
 +      const bool unshare = flags & FAULT_FLAG_UNSHARE;
        pte_t pte;
        struct hstate *h = hstate_vma(vma);
        struct page *old_page, *new_page;
        unsigned long haddr = address & huge_page_mask(h);
        struct mmu_notifier_range range;
  
 +      VM_BUG_ON(unshare && (flags & FOLL_WRITE));
 +      VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
 +
        pte = huge_ptep_get(ptep);
        old_page = pte_page(pte);
  
  retry_avoidcopy:
 -      /* If no-one else is actually using this page, avoid the copy
 -       * and just make the page writable */
 +      /*
 +       * If no-one else is actually using this page, we're the exclusive
 +       * owner and can reuse this page.
 +       */
        if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
 -              page_move_anon_rmap(old_page, vma);
 -              set_huge_ptep_writable(vma, haddr, ptep);
 +              if (!PageAnonExclusive(old_page))
 +                      page_move_anon_rmap(old_page, vma);
 +              if (likely(!unshare))
 +                      set_huge_ptep_writable(vma, haddr, ptep);
                return 0;
        }
 +      VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
 +                     old_page);
  
        /*
         * If the process that created a MAP_PRIVATE mapping is about to
        if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
                ClearHPageRestoreReserve(new_page);
  
 -              /* Break COW */
 +              /* Break COW or unshare */
                huge_ptep_clear_flush(vma, haddr, ptep);
                mmu_notifier_invalidate_range(mm, range.start, range.end);
                page_remove_rmap(old_page, vma, true);
                hugepage_add_new_anon_rmap(new_page, vma, haddr);
                set_huge_pte_at(mm, haddr, ptep,
 -                              make_huge_pte(vma, new_page, 1));
 +                              make_huge_pte(vma, new_page, !unshare));
                SetHPageMigratable(new_page);
                /* Make the old page be freed below */
                new_page = old_page;
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(&range);
  out_release_all:
 -      /* No restore in case of successful pagetable update (Break COW) */
 +      /*
 +       * No restore in case of successful pagetable update (Break COW or
 +       * unshare)
 +       */
        if (new_page != old_page)
                restore_reserve_on_error(h, vma, haddr, new_page);
        put_page(new_page);
@@@ -5469,8 -5386,7 +5469,8 @@@ static inline vm_fault_t hugetlb_handle
  static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                        struct vm_area_struct *vma,
                        struct address_space *mapping, pgoff_t idx,
 -                      unsigned long address, pte_t *ptep, unsigned int flags)
 +                      unsigned long address, pte_t *ptep,
 +                      pte_t old_pte, unsigned int flags)
  {
        struct hstate *h = hstate_vma(vma);
        vm_fault_t ret = VM_FAULT_SIGBUS;
        /*
         * Currently, we are forced to kill the process in the event the
         * original mapper has unmapped pages from the child due to a failed
 -       * COW. Warn that such a situation has occurred as it may not be obvious
 +       * COW/unsharing. Warn that such a situation has occurred as it may not
 +       * be obvious.
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
                pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
@@@ -5597,29 -5512,22 +5597,29 @@@ retry
  
        ptl = huge_pte_lock(h, mm, ptep);
        ret = 0;
 -      if (!huge_pte_none(huge_ptep_get(ptep)))
 +      /* If pte changed from under us, retry */
 +      if (!pte_same(huge_ptep_get(ptep), old_pte))
                goto backout;
  
        if (anon_rmap) {
                ClearHPageRestoreReserve(page);
                hugepage_add_new_anon_rmap(page, vma, haddr);
        } else
 -              page_dup_rmap(page, true);
 +              page_dup_file_rmap(page, true);
        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
 +      /*
 +       * If this pte was previously wr-protected, keep it wr-protected even
 +       * if populated.
 +       */
 +      if (unlikely(pte_marker_uffd_wp(old_pte)))
 +              new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
        set_huge_pte_at(mm, haddr, ptep, new_pte);
  
        hugetlb_count_add(pages_per_huge_page(h), mm);
        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
 -              ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
 +              ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
        }
  
        spin_unlock(ptl);
@@@ -5731,10 -5639,8 +5731,10 @@@ vm_fault_t hugetlb_fault(struct mm_stru
        mutex_lock(&hugetlb_fault_mutex_table[hash]);
  
        entry = huge_ptep_get(ptep);
 -      if (huge_pte_none(entry)) {
 -              ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
 +      /* PTE markers should be handled the same way as none pte */
 +      if (huge_pte_none_mostly(entry)) {
 +              ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
 +                                    entry, flags);
                goto out_mutex;
        }
  
                goto out_mutex;
  
        /*
 -       * If we are going to COW the mapping later, we examine the pending
 -       * reservations for this page now. This will ensure that any
 +       * If we are going to COW/unshare the mapping later, we examine the
 +       * pending reservations for this page now. This will ensure that any
         * allocations necessary to record that reservation occur outside the
         * spinlock. For private mappings, we also lookup the pagecache
         * page now as it is used to determine if a reservation has been
         * consumed.
         */
 -      if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
 +      if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
 +          !huge_pte_write(entry)) {
                if (vma_needs_reservation(h, vma, haddr) < 0) {
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
  
        ptl = huge_pte_lock(h, mm, ptep);
  
 -      /* Check for a racing update before calling hugetlb_cow */
 +      /* Check for a racing update before calling hugetlb_wp() */
        if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
                goto out_ptl;
  
 +      /* Handle userfault-wp first, before trying to lock more pages */
 +      if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
 +          (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
 +              struct vm_fault vmf = {
 +                      .vma = vma,
 +                      .address = haddr,
 +                      .real_address = address,
 +                      .flags = flags,
 +              };
 +
 +              spin_unlock(ptl);
 +              if (pagecache_page) {
 +                      unlock_page(pagecache_page);
 +                      put_page(pagecache_page);
 +              }
 +              mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 +              i_mmap_unlock_read(mapping);
 +              return handle_userfault(&vmf, VM_UFFD_WP);
 +      }
 +
        /*
 -       * hugetlb_cow() requires page locks of pte_page(entry) and
 +       * hugetlb_wp() requires page locks of pte_page(entry) and
         * pagecache_page, so here we need take the former one
         * when page != pagecache_page or !pagecache_page.
         */
  
        get_page(page);
  
 -      if (flags & FAULT_FLAG_WRITE) {
 +      if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
                if (!huge_pte_write(entry)) {
 -                      ret = hugetlb_cow(mm, vma, address, ptep,
 -                                        pagecache_page, ptl);
 +                      ret = hugetlb_wp(mm, vma, address, ptep, flags,
 +                                       pagecache_page, ptl);
                        goto out_put_page;
 +              } else if (likely(flags & FAULT_FLAG_WRITE)) {
 +                      entry = huge_pte_mkdirty(entry);
                }
 -              entry = huge_pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
        if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
@@@ -5862,8 -5746,7 +5862,8 @@@ int hugetlb_mcopy_atomic_pte(struct mm_
                            unsigned long dst_addr,
                            unsigned long src_addr,
                            enum mcopy_atomic_mode mode,
 -                          struct page **pagep)
 +                          struct page **pagep,
 +                          bool wp_copy)
  {
        bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
        struct hstate *h = hstate_vma(dst_vma);
                goto out_release_unlock;
  
        ret = -EEXIST;
 -      if (!huge_pte_none(huge_ptep_get(dst_pte)))
 +      /*
 +       * We allow to overwrite a pte marker: consider when both MISSING|WP
 +       * registered, we firstly wr-protect a none pte which has no page cache
 +       * page backing it, then access the page.
 +       */
 +      if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
                goto out_release_unlock;
  
        if (vm_shared) {
 -              page_dup_rmap(page, true);
 +              page_dup_file_rmap(page, true);
        } else {
                ClearHPageRestoreReserve(page);
                hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
        }
  
 -      /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
 -      if (is_continue && !vm_shared)
 +      /*
 +       * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
 +       * with wp flag set, don't set pte write bit.
 +       */
 +      if (wp_copy || (is_continue && !vm_shared))
                writable = 0;
        else
                writable = dst_vma->vm_flags & VM_WRITE;
  
        _dst_pte = make_huge_pte(dst_vma, page, writable);
 -      if (writable)
 -              _dst_pte = huge_pte_mkdirty(_dst_pte);
 +      /*
 +       * Always mark UFFDIO_COPY page dirty; note that this may not be
 +       * extremely important for hugetlbfs for now since swapping is not
 +       * supported, but we should still be clear in that this page cannot be
 +       * thrown away at will, even if write bit not set.
 +       */
 +      _dst_pte = huge_pte_mkdirty(_dst_pte);
        _dst_pte = pte_mkyoung(_dst_pte);
  
 +      if (wp_copy)
 +              _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
 +
        set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
  
        (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
@@@ -6073,25 -5940,6 +6073,25 @@@ static void record_subpages_vmas(struc
        }
  }
  
 +static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
 +                                             bool *unshare)
 +{
 +      pte_t pteval = huge_ptep_get(pte);
 +
 +      *unshare = false;
 +      if (is_swap_pte(pteval))
 +              return true;
 +      if (huge_pte_write(pteval))
 +              return false;
 +      if (flags & FOLL_WRITE)
 +              return true;
 +      if (gup_must_unshare(flags, pte_page(pteval))) {
 +              *unshare = true;
 +              return true;
 +      }
 +      return false;
 +}
 +
  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         struct page **pages, struct vm_area_struct **vmas,
                         unsigned long *position, unsigned long *nr_pages,
        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;
                spinlock_t *ptl = NULL;
 +              bool unshare = false;
                int absent;
                struct page *page;
  
                 * both cases, and because we can't follow correct pages
                 * directly from any kind of swap entries.
                 */
 -              if (absent || is_swap_pte(huge_ptep_get(pte)) ||
 -                  ((flags & FOLL_WRITE) &&
 -                    !huge_pte_write(huge_ptep_get(pte)))) {
 +              if (absent ||
 +                  __follow_hugetlb_must_fault(flags, pte, &unshare)) {
                        vm_fault_t ret;
                        unsigned int fault_flags = 0;
  
                                spin_unlock(ptl);
                        if (flags & FOLL_WRITE)
                                fault_flags |= FAULT_FLAG_WRITE;
 +                      else if (unshare)
 +                              fault_flags |= FAULT_FLAG_UNSHARE;
                        if (locked)
                                fault_flags |= FAULT_FLAG_ALLOW_RETRY |
                                        FAULT_FLAG_KILLABLE;
                pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
                page = pte_page(huge_ptep_get(pte));
  
 +              VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
 +                             !PageAnonExclusive(page), page);
 +
                /*
                 * If subpage information not requested, update counters
                 * and skip the same_page loop below.
  }
  
  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 -              unsigned long address, unsigned long end, pgprot_t newprot)
 +              unsigned long address, unsigned long end,
 +              pgprot_t newprot, unsigned long cp_flags)
  {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long start = address;
        pte_t *ptep;
        pte_t pte;
        struct hstate *h = hstate_vma(vma);
 -      unsigned long pages = 0;
 +      unsigned long pages = 0, psize = huge_page_size(h);
        bool shared_pmd = false;
        struct mmu_notifier_range range;
 +      bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 +      bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
  
        /*
         * In the case of shared PMDs, the area to flush could be beyond
  
        mmu_notifier_invalidate_range_start(&range);
        i_mmap_lock_write(vma->vm_file->f_mapping);
 -      for (; address < end; address += huge_page_size(h)) {
 +      for (; address < end; address += psize) {
                spinlock_t *ptl;
 -              ptep = huge_pte_offset(mm, address, huge_page_size(h));
 +              ptep = huge_pte_offset(mm, address, psize);
                if (!ptep)
                        continue;
                ptl = huge_pte_lock(h, mm, ptep);
                if (huge_pmd_unshare(mm, vma, &address, ptep)) {
 +                      /*
 +                       * When uffd-wp is enabled on the vma, unshare
 +                       * shouldn't happen at all.  Warn about it if it
 +                       * happened due to some reason.
 +                       */
 +                      WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
                        pages++;
                        spin_unlock(ptl);
                        shared_pmd = true;
                }
                if (unlikely(is_hugetlb_entry_migration(pte))) {
                        swp_entry_t entry = pte_to_swp_entry(pte);
 +                      struct page *page = pfn_swap_entry_to_page(entry);
  
 -                      if (is_writable_migration_entry(entry)) {
 +                      if (!is_readable_migration_entry(entry)) {
                                pte_t newpte;
  
 -                              entry = make_readable_migration_entry(
 -                                                      swp_offset(entry));
 +                              if (PageAnon(page))
 +                                      entry = make_readable_exclusive_migration_entry(
 +                                                              swp_offset(entry));
 +                              else
 +                                      entry = make_readable_migration_entry(
 +                                                              swp_offset(entry));
                                newpte = swp_entry_to_pte(entry);
 +                              if (uffd_wp)
 +                                      newpte = pte_swp_mkuffd_wp(newpte);
 +                              else if (uffd_wp_resolve)
 +                                      newpte = pte_swp_clear_uffd_wp(newpte);
                                set_huge_swap_pte_at(mm, address, ptep,
 -                                                   newpte, huge_page_size(h));
 +                                                   newpte, psize);
                                pages++;
                        }
                        spin_unlock(ptl);
                        continue;
                }
 +              if (unlikely(pte_marker_uffd_wp(pte))) {
 +                      /*
 +                       * This is changing a non-present pte into a none pte,
 +                       * no need for huge_ptep_modify_prot_start/commit().
 +                       */
 +                      if (uffd_wp_resolve)
 +                              huge_pte_clear(mm, address, ptep, psize);
 +              }
                if (!huge_pte_none(pte)) {
                        pte_t old_pte;
                        unsigned int shift = huge_page_shift(hstate_vma(vma));
                        old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
                        pte = huge_pte_modify(old_pte, newprot);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
 +                      if (uffd_wp)
 +                              pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
 +                      else if (uffd_wp_resolve)
 +                              pte = huge_pte_clear_uffd_wp(pte);
                        huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
                        pages++;
 +              } else {
 +                      /* None pte */
 +                      if (unlikely(uffd_wp))
 +                              /* Safe to modify directly (none->non-present). */
 +                              set_huge_pte_at(mm, address, ptep,
 +                                              make_pte_marker(PTE_MARKER_UFFD_WP));
                }
                spin_unlock(ptl);
        }
@@@ -6755,7 -6562,14 +6755,14 @@@ int huge_pmd_unshare(struct mm_struct *
        pud_clear(pud);
        put_page(virt_to_page(ptep));
        mm_dec_nr_pmds(mm);
-       *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+       /*
+        * This update of passed address optimizes loops sequentially
+        * processing addresses in increments of huge page size (PMD_SIZE
+        * in this case).  By clearing the pud, a PUD_SIZE area is unmapped.
+        * Update address to the 'last page' in the cleared area so that
+        * calling loop can move to first page past this area.
+        */
+       *addr |= PUD_SIZE - PMD_SIZE;
        return 1;
  }
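
/*
 * Worked example of the new address update, assuming x86-64 sizes
 * (PMD_SIZE = 2 MiB, PUD_SIZE = HPAGE_SIZE * PTRS_PER_PTE = 1 GiB) and a
 * caller that steps by huge_page_size(h) == PMD_SIZE; the addresses are
 * illustrative only:
 *
 *   *addr = 0x40000000                        (already PUD aligned)
 *
 *   old:  ALIGN(0x40000000, 1G) - 0x200000 = 0x3fe00000
 *         -> moves backwards to the preceding huge page, so the caller
 *            re-walks addresses it has already dealt with
 *
 *   new:  0x40000000 | (1G - 2M)           = 0x7fe00000
 *         -> the last PMD-sized page inside the cleared PUD; the caller's
 *            addr += PMD_SIZE then continues at 0x80000000, the first
 *            page past the unmapped area
 */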
  
@@@ -6879,11 -6693,9 +6886,11 @@@ follow_huge_pmd(struct mm_struct *mm, u
        spinlock_t *ptl;
        pte_t pte;
  
 -      /* FOLL_GET and FOLL_PIN are mutually exclusive. */
 -      if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
 -                       (FOLL_PIN | FOLL_GET)))
 +      /*
 +       * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
 +       * follow_hugetlb_page().
 +       */
 +      if (WARN_ON_ONCE(flags & FOLL_PIN))
                return NULL;
  
  retry:
@@@ -6971,9 -6783,7 +6978,9 @@@ int get_hwpoison_huge_page(struct page 
        spin_lock_irq(&hugetlb_lock);
        if (PageHeadHuge(page)) {
                *hugetlb = true;
 -              if (HPageFreed(page) || HPageMigratable(page))
 +              if (HPageFreed(page))
 +                      ret = 0;
 +              else if (HPageMigratable(page))
                        ret = get_page_unless_zero(page);
                else
                        ret = -EBUSY;
@@@ -7063,7 -6873,6 +7070,7 @@@ void hugetlb_unshare_all_pmds(struct vm
        if (start >= end)
                return;
  
 +      flush_cache_range(vma, start, end);
        /*
         * No need to call adjust_range_if_pmd_sharing_possible(), because
         * we have already done the PUD_SIZE alignment.
@@@ -7149,7 -6958,7 +7156,7 @@@ void __init hugetlb_cma_reserve(int ord
                if (hugetlb_cma_size_in_node[nid] == 0)
                        continue;
  
 -              if (!node_state(nid, N_ONLINE)) {
 +              if (!node_online(nid)) {
                        pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
                        hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
                        hugetlb_cma_size_in_node[nid] = 0;
        }
  
        reserved = 0;
 -      for_each_node_state(nid, N_ONLINE) {
 +      for_each_online_node(nid) {
                int res;
                char name[CMA_MAX_NAME];
  
diff --combined mm/page_alloc.c
@@@ -81,7 -81,6 +81,7 @@@
  #include "internal.h"
  #include "shuffle.h"
  #include "page_reporting.h"
 +#include "swap.h"
  
  /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
  typedef int __bitwise fpi_t;
@@@ -868,6 -867,40 +868,6 @@@ static inline void set_buddy_order(stru
        __SetPageBuddy(page);
  }
  
 -/*
 - * This function checks whether a page is free && is the buddy
 - * we can coalesce a page and its buddy if
 - * (a) the buddy is not in a hole (check before calling!) &&
 - * (b) the buddy is in the buddy system &&
 - * (c) a page and its buddy have the same order &&
 - * (d) a page and its buddy are in the same zone.
 - *
 - * For recording whether a page is in the buddy system, we set PageBuddy.
 - * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
 - *
 - * For recording page's order, we use page_private(page).
 - */
 -static inline bool page_is_buddy(struct page *page, struct page *buddy,
 -                                                      unsigned int order)
 -{
 -      if (!page_is_guard(buddy) && !PageBuddy(buddy))
 -              return false;
 -
 -      if (buddy_order(buddy) != order)
 -              return false;
 -
 -      /*
 -       * zone check is done late to avoid uselessly calculating
 -       * zone/node ids for pages that could never merge.
 -       */
 -      if (page_zone_id(page) != page_zone_id(buddy))
 -              return false;
 -
 -      VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
 -
 -      return true;
 -}
 -
  #ifdef CONFIG_COMPACTION
  static inline struct capture_control *task_capc(struct zone *zone)
  {
@@@ -976,17 -1009,18 +976,17 @@@ static inline boo
  buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
                   struct page *page, unsigned int order)
  {
 -      struct page *higher_page, *higher_buddy;
 -      unsigned long combined_pfn;
 +      unsigned long higher_page_pfn;
 +      struct page *higher_page;
  
        if (order >= MAX_ORDER - 2)
                return false;
  
 -      combined_pfn = buddy_pfn & pfn;
 -      higher_page = page + (combined_pfn - pfn);
 -      buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
 -      higher_buddy = higher_page + (buddy_pfn - combined_pfn);
 +      higher_page_pfn = buddy_pfn & pfn;
 +      higher_page = page + (higher_page_pfn - pfn);
  
 -      return page_is_buddy(higher_page, higher_buddy, order + 1);
 +      return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
 +                      NULL) != NULL;
  }
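
/*
 * The pfn relationships that find_buddy_page_pfn() packages up, shown on
 * a hypothetical order-3 page (the pfn values are only for illustration):
 *
 *   pfn        = 0x1230
 *   buddy_pfn  = pfn ^ (1 << 3)  = 0x1238   (differs only in bit 3)
 *   parent_pfn = buddy_pfn & pfn = 0x1230   (start of the order-4 block)
 *
 * buddy_merge_likely() then asks whether that order-4 page at parent_pfn
 * in turn has a free buddy (0x1230 ^ (1 << 4) = 0x1220), i.e. whether
 * freeing here is likely to keep merging upward.
 */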
  
  /*
@@@ -1019,6 -1053,7 +1019,6 @@@ static inline void __free_one_page(stru
                int migratetype, fpi_t fpi_flags)
  {
        struct capture_control *capc = task_capc(zone);
 -      unsigned int max_order = pageblock_order;
        unsigned long buddy_pfn;
        unsigned long combined_pfn;
        struct page *buddy;
        VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
        VM_BUG_ON_PAGE(bad_range(zone, page), page);
  
 -continue_merging:
 -      while (order < max_order) {
 +      while (order < MAX_ORDER - 1) {
                if (compaction_capture(capc, page, order, migratetype)) {
                        __mod_zone_freepage_state(zone, -(1 << order),
                                                                migratetype);
                        return;
                }
 -              buddy_pfn = __find_buddy_pfn(pfn, order);
 -              buddy = page + (buddy_pfn - pfn);
  
 -              if (!page_is_buddy(page, buddy, order))
 +              buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
 +              if (!buddy)
                        goto done_merging;
 +
 +              if (unlikely(order >= pageblock_order)) {
 +                      /*
 +                       * We want to prevent merge between freepages on pageblock
 +                       * without fallbacks and normal pageblock. Without this,
 +                       * pageblock isolation could cause incorrect freepage or CMA
 +                       * accounting or HIGHATOMIC accounting.
 +                       */
 +                      int buddy_mt = get_pageblock_migratetype(buddy);
 +
 +                      if (migratetype != buddy_mt
 +                                      && (!migratetype_is_mergeable(migratetype) ||
 +                                              !migratetype_is_mergeable(buddy_mt)))
 +                              goto done_merging;
 +              }
 +
                /*
                 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
                 * merge with it and move up one order.
                pfn = combined_pfn;
                order++;
        }
 -      if (order < MAX_ORDER - 1) {
 -              /* If we are here, it means order is >= pageblock_order.
 -               * We want to prevent merge between freepages on pageblock
 -               * without fallbacks and normal pageblock. Without this,
 -               * pageblock isolation could cause incorrect freepage or CMA
 -               * accounting or HIGHATOMIC accounting.
 -               *
 -               * We don't want to hit this code for the more frequent
 -               * low-order merging.
 -               */
 -              int buddy_mt;
 -
 -              buddy_pfn = __find_buddy_pfn(pfn, order);
 -              buddy = page + (buddy_pfn - pfn);
 -
 -              if (!page_is_buddy(page, buddy, order))
 -                      goto done_merging;
 -              buddy_mt = get_pageblock_migratetype(buddy);
 -
 -              if (migratetype != buddy_mt
 -                              && (!migratetype_is_mergeable(migratetype) ||
 -                                      !migratetype_is_mergeable(buddy_mt)))
 -                      goto done_merging;
 -              max_order = order + 1;
 -              goto continue_merging;
 -      }
  
  done_merging:
        set_buddy_order(page, order);
                page_reporting_notify_free(order);
  }
  
 +/**
 + * split_free_page() -- split a free page at split_pfn_offset
 + * @free_page:                the original free page
 + * @order:            the order of the page
 + * @split_pfn_offset: split offset within the page
 + *
 + * It is used when the free page crosses two pageblocks with different migratetypes
 + * at split_pfn_offset within the page. The split free page will be put into
 + * separate migratetype lists afterwards. Otherwise, the function achieves
 + * nothing.
 + */
 +void split_free_page(struct page *free_page,
 +                              int order, unsigned long split_pfn_offset)
 +{
 +      struct zone *zone = page_zone(free_page);
 +      unsigned long free_page_pfn = page_to_pfn(free_page);
 +      unsigned long pfn;
 +      unsigned long flags;
 +      int free_page_order;
 +
 +      if (split_pfn_offset == 0)
 +              return;
 +
 +      spin_lock_irqsave(&zone->lock, flags);
 +      del_page_from_free_list(free_page, zone, order);
 +      for (pfn = free_page_pfn;
 +           pfn < free_page_pfn + (1UL << order);) {
 +              int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
 +
 +              free_page_order = min_t(int,
 +                                      pfn ? __ffs(pfn) : order,
 +                                      __fls(split_pfn_offset));
 +              __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
 +                              mt, FPI_NONE);
 +              pfn += 1UL << free_page_order;
 +              split_pfn_offset -= (1UL << free_page_order);
 +              /* we have done the first part, now switch to second part */
 +              if (split_pfn_offset == 0)
 +                      split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
 +      }
 +      spin_unlock_irqrestore(&zone->lock, flags);
 +}
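
/*
 * A sketch of how the loop carves up one hypothetical case; the pfn and
 * the pageblock/MAX_ORDER values assume x86-64 defaults (pageblock_order
 * = 9, MAX_ORDER - 1 = 10), so treat the numbers as illustrative only:
 *
 *   free_page at pfn 0x10000, order = 10 (spans two pageblocks),
 *   split_pfn_offset = 512 (the pageblock boundary inside the page)
 *
 *   pfn 0x10000: free_page_order = min(__ffs(0x10000) = 16,
 *                                      __fls(512)     =  9) = 9
 *                -> __free_one_page() an order-9 chunk with pageblock 0's
 *                   migratetype; split_pfn_offset hits 0 and is reset to
 *                   the remaining 512 pages
 *   pfn 0x10200: free_page_order = min(__ffs(0x10200) = 9,
 *                                      __fls(512)     = 9) = 9
 *                -> an order-9 chunk with pageblock 1's migratetype
 *
 * Each half lands on the free list of its own pageblock, which is the
 * accounting the function exists to preserve.
 */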
  /*
   * A bad page could be due to a number of fields. Instead of multiple branches,
   * try and check multiple fields with one check. The caller must do a detailed
@@@ -2471,9 -2476,6 +2471,9 @@@ struct page *__rmqueue_smallest(struct 
                del_page_from_free_list(page, zone, current_order);
                expand(zone, page, order, current_order, migratetype);
                set_pcppage_migratetype(page, migratetype);
 +              trace_mm_page_alloc_zone_locked(page, order, migratetype,
 +                              pcp_allowed_order(order) &&
 +                              migratetype < MIGRATE_PCPTYPES);
                return page;
        }
  
@@@ -2997,7 -2999,7 +2997,7 @@@ __rmqueue(struct zone *zone, unsigned i
                    zone_page_state(zone, NR_FREE_PAGES) / 2) {
                        page = __rmqueue_cma_fallback(zone, order);
                        if (page)
 -                              goto out;
 +                              return page;
                }
        }
  retry:
                                                                alloc_flags))
                        goto retry;
        }
 -out:
 -      if (page)
 -              trace_mm_page_alloc_zone_locked(page, order, migratetype);
        return page;
  }
  
@@@ -3728,8 -3733,11 +3728,8 @@@ struct page *rmqueue(struct zone *prefe
                 * reserved for high-order atomic allocation, so order-0
                 * request should skip it.
                 */
 -              if (order > 0 && alloc_flags & ALLOC_HARDER) {
 +              if (order > 0 && alloc_flags & ALLOC_HARDER)
                        page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
 -                      if (page)
 -                              trace_mm_page_alloc_zone_locked(page, order, migratetype);
 -              }
                if (!page) {
                        page = __rmqueue(zone, order, migratetype, alloc_flags);
                        if (!page)
@@@ -3791,9 -3799,6 +3791,9 @@@ static bool __should_fail_alloc_page(gf
                        (gfp_mask & __GFP_DIRECT_RECLAIM))
                return false;
  
 +      if (gfp_mask & __GFP_NOWARN)
 +              fail_page_alloc.attr.no_warn = true;
 +
        return should_fail(&fail_page_alloc.attr, 1 << order);
  }
  
@@@ -4063,8 -4068,7 +4063,8 @@@ get_page_from_freelist(gfp_t gfp_mask, 
  {
        struct zoneref *z;
        struct zone *zone;
 -      struct pglist_data *last_pgdat_dirty_limit = NULL;
 +      struct pglist_data *last_pgdat = NULL;
 +      bool last_pgdat_dirty_ok = false;
        bool no_fallback;
  
  retry:
                 * dirty-throttling and the flusher threads.
                 */
                if (ac->spread_dirty_pages) {
 -                      if (last_pgdat_dirty_limit == zone->zone_pgdat)
 -                              continue;
 +                      if (last_pgdat != zone->zone_pgdat) {
 +                              last_pgdat = zone->zone_pgdat;
 +                              last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
 +                      }
  
 -                      if (!node_dirty_ok(zone->zone_pgdat)) {
 -                              last_pgdat_dirty_limit = zone->zone_pgdat;
 +                      if (!last_pgdat_dirty_ok)
                                continue;
 -                      }
                }
  
                if (no_fallback && nr_online_nodes > 1 &&
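
The hunk above changes what gets cached while walking the zonelist: instead of remembering only the last node that failed node_dirty_ok(), it remembers the last node looked at together with its verdict, so consecutive zones of the same node cost a single evaluation whether the answer was "ok" or "over the dirty limit". A rough user-space sketch of that caching pattern (the struct, names and the fake predicate are mine):

#include <stdbool.h>
#include <stdio.h>

struct dirty_cache {
	int last_node;		/* -1: nothing cached yet */
	bool last_ok;
};

/* stands in for the expensive node_dirty_ok(); pretend node 1 is over limit */
static bool dirty_ok_expensive(int node)
{
	printf("evaluating node %d\n", node);
	return node != 1;
}

static bool dirty_ok_cached(struct dirty_cache *c, int node)
{
	if (c->last_node != node) {
		c->last_node = node;
		c->last_ok = dirty_ok_expensive(node);	/* once per distinct node */
	}
	return c->last_ok;
}

int main(void)
{
	int zone_to_node[] = { 0, 0, 0, 1, 1, 2 };	/* zones grouped by node */
	struct dirty_cache cache = { .last_node = -1, .last_ok = false };

	for (unsigned int i = 0; i < sizeof(zone_to_node) / sizeof(*zone_to_node); i++) {
		if (!dirty_ok_cached(&cache, zone_to_node[i]))
			continue;	/* skip zones on a node over its dirty limit */
		/* ...otherwise try to allocate from this zone... */
	}
	return 0;
}

With the old scheme a node whose check succeeded was re-evaluated for every one of its zones; here the predicate runs three times for six zones.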
@@@ -4342,8 -4346,7 +4342,8 @@@ __alloc_pages_may_oom(gfp_t gfp_mask, u
         */
  
        /* Exhausted what can be done so it's blame time */
 -      if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
 +      if (out_of_memory(&oc) ||
 +          WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
                *did_some_progress = 1;
  
                /*
@@@ -4674,12 -4677,9 +4674,12 @@@ static void wake_all_kswapds(unsigned i
  
        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
                                        ac->nodemask) {
 -              if (last_pgdat != zone->zone_pgdat)
 +              if (!managed_zone(zone))
 +                      continue;
 +              if (last_pgdat != zone->zone_pgdat) {
                        wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
 -              last_pgdat = zone->zone_pgdat;
 +                      last_pgdat = zone->zone_pgdat;
 +              }
        }
  }
  
@@@ -5117,7 -5117,7 +5117,7 @@@ nopage
                 * All existing users of __GFP_NOFAIL are blockable, so warn
                 * about any new users that actually require GFP_NOWAIT
                 */
 -              if (WARN_ON_ONCE(!can_direct_reclaim))
 +              if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
                        goto fail;
  
                /*
                 * because we cannot reclaim anything and can only loop, waiting
                 * for somebody to do the work for us
                 */
 -              WARN_ON_ONCE(current->flags & PF_MEMALLOC);
 +              WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
  
                /*
                 * non-failing costly orders are a hard requirement which we are
                 * not really prepared for, so warn about these users so that we
                 * can identify them and convert them to something else.
                 */
 -              WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
 +              WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask);
  
                /*
                 * Help non-failing allocations by giving them access to memory
@@@ -5324,8 -5324,8 +5324,8 @@@ unsigned long __alloc_pages_bulk(gfp_t 
                page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
                                                                pcp, pcp_list);
                if (unlikely(!page)) {
-                       /* Try and get at least one page */
-                       if (!nr_populated)
+                       /* Try and allocate at least one page */
+                       if (!nr_account)
                                goto failed_irq;
                        break;
                }
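
The one-word change above matters because nr_populated also counts array slots the caller had already filled, while nr_account counts only pages allocated by this call; keying the "give the caller at least one page" fallback off nr_populated meant a pre-filled slot could hide a first-allocation failure entirely. A toy user-space model of the two counters diverging (entirely hypothetical code, not the kernel loop):

#include <stdio.h>

/* returns the number of newly allocated "pages", or -1 meaning "fall back
 * to the single-page allocation path" */
static int bulk_fill(int prefilled, int want, int pcp_available)
{
	int nr_populated = prefilled;	/* includes the caller's pre-filled slots */
	int nr_account = 0;		/* pages allocated by this call only */

	while (nr_populated < want) {
		if (pcp_available == 0) {
			if (!nr_account)	/* the fixed check */
				return -1;	/* let the caller try one page */
			break;			/* a partial bulk result is fine */
		}
		pcp_available--;
		nr_account++;
		nr_populated++;
	}
	return nr_account;
}

int main(void)
{
	/* per-CPU list empty and one slot pre-filled: the old "!nr_populated"
	 * check broke out here with zero new pages instead of falling back */
	printf("%d\n", bulk_fill(1, 8, 0));	/* -1: fall back */
	printf("%d\n", bulk_fill(0, 8, 3));	/*  3: partial bulk result */
	return 0;
}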
@@@ -5379,8 -5379,10 +5379,8 @@@ struct page *__alloc_pages(gfp_t gfp, u
         * There are several places where we assume that the order value is sane
         * so bail out early if the request is out of bounds.
         */
 -      if (unlikely(order >= MAX_ORDER)) {
 -              WARN_ON_ONCE(!(gfp & __GFP_NOWARN));
 +      if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp))
                return NULL;
 -      }
  
        gfp &= gfp_allowed_mask;
        /*
@@@ -6169,6 -6171,7 +6169,6 @@@ int numa_zonelist_order_handler(struct 
  }
  
  
 -#define MAX_NODE_LOAD (nr_online_nodes)
  static int node_load[MAX_NUMNODES];
  
  /**
@@@ -6215,7 -6218,7 +6215,7 @@@ int find_next_best_node(int node, nodem
                        val += PENALTY_FOR_NODE_WITH_CPUS;
  
                /* Slight preference for less loaded node */
 -              val *= (MAX_NODE_LOAD*MAX_NUMNODES);
 +              val *= MAX_NUMNODES;
                val += node_load[n];
  
                if (val < min_val) {
@@@ -6281,12 -6284,13 +6281,12 @@@ static void build_thisnode_zonelists(pg
  static void build_zonelists(pg_data_t *pgdat)
  {
        static int node_order[MAX_NUMNODES];
 -      int node, load, nr_nodes = 0;
 +      int node, nr_nodes = 0;
        nodemask_t used_mask = NODE_MASK_NONE;
        int local_node, prev_node;
  
        /* NUMA-aware ordering of nodes */
        local_node = pgdat->node_id;
 -      load = nr_online_nodes;
        prev_node = local_node;
  
        memset(node_order, 0, sizeof(node_order));
                 */
                if (node_distance(local_node, node) !=
                    node_distance(local_node, prev_node))
 -                      node_load[node] += load;
 +                      node_load[node] += 1;
  
                node_order[nr_nodes++] = node;
                prev_node = node;
 -              load--;
        }
  
        build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
@@@ -6640,21 -6645,6 +6640,21 @@@ static void __ref __init_zone_device_pa
        }
  }
  
 +/*
 + * With compound page geometry, and when struct pages are stored in RAM, most
 + * tail pages are reused. Consequently, the number of unique struct pages to
 + * initialize is a lot smaller than the total number of struct pages being
 + * mapped. This is a mild layering violation paired with explicit knowledge of
 + * how the sparse vmemmap internals handle compound pages in the absence
 + * of an altmap. See vmemmap_populate_compound_pages().
 + */
 +static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
 +                                            unsigned long nr_pages)
 +{
 +      return is_power_of_2(sizeof(struct page)) &&
 +              !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
 +}
 +
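
In the common case the helper above boils down to a constant: when sizeof(struct page) is a power of two and there is no altmap, only the first two vmemmap pages of a compound devmap appear to be backed by unique memory (as I read vmemmap_populate_compound_pages()), so 2 * (PAGE_SIZE / sizeof(struct page)) struct pages are initialized per compound page regardless of its size. A quick user-space check of that arithmetic, assuming the typical 4 KiB PAGE_SIZE and 64-byte struct page (both figures are assumptions, not taken from this patch):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	unsigned long struct_page_size = 64;	/* assumed sizeof(struct page) */
	unsigned long unique = 2 * (page_size / struct_page_size);

	printf("unique struct pages initialised per compound page: %lu\n",
	       unique);				/* 128 */
	printf("struct pages covered by a 2 MiB compound page: %lu\n",
	       (2UL << 20) / page_size);	/* 512 */
	printf("struct pages covered by a 1 GiB compound page: %lu\n",
	       (1UL << 30) / page_size);	/* 262144 */
	return 0;
}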
  static void __ref memmap_init_compound(struct page *head,
                                       unsigned long head_pfn,
                                       unsigned long zone_idx, int nid,
@@@ -6719,7 -6709,7 +6719,7 @@@ void __ref memmap_init_zone_device(stru
                        continue;
  
                memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
 -                                   pfns_per_compound);
 +                                   compound_nr_pages(altmap, pfns_per_compound));
        }
  
        pr_info("%s initialised %lu pages in %ums\n", __func__,
@@@ -7880,7 -7870,7 +7880,7 @@@ static void __init find_zone_movable_pf
  
                        usable_startpfn = memblock_region_memory_base_pfn(r);
  
 -                      if (usable_startpfn < 0x100000) {
 +                      if (usable_startpfn < PHYS_PFN(SZ_4G)) {
                                mem_below_4gb_not_mirrored = true;
                                continue;
                        }
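
The symbolic form replaces a magic PFN: assuming the usual PAGE_SHIFT of 12, PHYS_PFN(SZ_4G) is exactly the 0x100000 literal it stands in for. A one-line compile-time check of that equivalence (user-space, with the 4 KiB page assumption stated above):

int main(void)
{
	_Static_assert(((4ULL << 30) >> 12) == 0x100000,
		       "4 GiB expressed in 4 KiB page frames");
	return 0;
}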
@@@ -8959,7 -8949,136 +8959,7 @@@ void *__init alloc_large_system_hash(co
        return table;
  }
  
 -/*
 - * This function checks whether pageblock includes unmovable pages or not.
 - *
 - * PageLRU check without isolation or lru_lock could race so that
 - * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
 - * check without lock_page also may miss some movable non-lru pages at
 - * race condition. So you can't expect this function should be exact.
 - *
 - * Returns a page without holding a reference. If the caller wants to
 - * dereference that page (e.g., dumping), it has to make sure that it
 - * cannot get removed (e.g., via memory unplug) concurrently.
 - *
 - */
 -struct page *has_unmovable_pages(struct zone *zone, struct page *page,
 -                               int migratetype, int flags)
 -{
 -      unsigned long iter = 0;
 -      unsigned long pfn = page_to_pfn(page);
 -      unsigned long offset = pfn % pageblock_nr_pages;
 -
 -      if (is_migrate_cma_page(page)) {
 -              /*
 -               * CMA allocations (alloc_contig_range) really need to mark
 -               * isolate CMA pageblocks even when they are not movable in fact
 -               * so consider them movable here.
 -               */
 -              if (is_migrate_cma(migratetype))
 -                      return NULL;
 -
 -              return page;
 -      }
 -
 -      for (; iter < pageblock_nr_pages - offset; iter++) {
 -              page = pfn_to_page(pfn + iter);
 -
 -              /*
 -               * Both, bootmem allocations and memory holes are marked
 -               * PG_reserved and are unmovable. We can even have unmovable
 -               * allocations inside ZONE_MOVABLE, for example when
 -               * specifying "movablecore".
 -               */
 -              if (PageReserved(page))
 -                      return page;
 -
 -              /*
 -               * If the zone is movable and we have ruled out all reserved
 -               * pages then it should be reasonably safe to assume the rest
 -               * is movable.
 -               */
 -              if (zone_idx(zone) == ZONE_MOVABLE)
 -                      continue;
 -
 -              /*
 -               * Hugepages are not in LRU lists, but they're movable.
 -               * THPs are on the LRU, but need to be counted as #small pages.
 -               * We need not scan over tail pages because we don't
 -               * handle each tail page individually in migration.
 -               */
 -              if (PageHuge(page) || PageTransCompound(page)) {
 -                      struct page *head = compound_head(page);
 -                      unsigned int skip_pages;
 -
 -                      if (PageHuge(page)) {
 -                              if (!hugepage_migration_supported(page_hstate(head)))
 -                                      return page;
 -                      } else if (!PageLRU(head) && !__PageMovable(head)) {
 -                              return page;
 -                      }
 -
 -                      skip_pages = compound_nr(head) - (page - head);
 -                      iter += skip_pages - 1;
 -                      continue;
 -              }
 -
 -              /*
 -               * We can't use page_count without pin a page
 -               * because another CPU can free compound page.
 -               * This check already skips compound tails of THP
 -               * because their page->_refcount is zero at all time.
 -               */
 -              if (!page_ref_count(page)) {
 -                      if (PageBuddy(page))
 -                              iter += (1 << buddy_order(page)) - 1;
 -                      continue;
 -              }
 -
 -              /*
 -               * The HWPoisoned page may be not in buddy system, and
 -               * page_count() is not 0.
 -               */
 -              if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
 -                      continue;
 -
 -              /*
 -               * We treat all PageOffline() pages as movable when offlining
 -               * to give drivers a chance to decrement their reference count
 -               * in MEM_GOING_OFFLINE in order to indicate that these pages
 -               * can be offlined as there are no direct references anymore.
 -               * For actually unmovable PageOffline() where the driver does
 -               * not support this, we will fail later when trying to actually
 -               * move these pages that still have a reference count > 0.
 -               * (false negatives in this function only)
 -               */
 -              if ((flags & MEMORY_OFFLINE) && PageOffline(page))
 -                      continue;
 -
 -              if (__PageMovable(page) || PageLRU(page))
 -                      continue;
 -
 -              /*
 -               * If there are RECLAIMABLE pages, we need to check
 -               * it.  But now, memory offline itself doesn't call
 -               * shrink_node_slabs() and it still to be fixed.
 -               */
 -              return page;
 -      }
 -      return NULL;
 -}
 -
  #ifdef CONFIG_CONTIG_ALLOC
 -static unsigned long pfn_max_align_down(unsigned long pfn)
 -{
 -      return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
 -}
 -
 -static unsigned long pfn_max_align_up(unsigned long pfn)
 -{
 -      return ALIGN(pfn, MAX_ORDER_NR_PAGES);
 -}
 -
  #if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
  /* Usage: See admin-guide/dynamic-debug-howto.rst */
@@@ -8982,7 -9101,7 +8982,7 @@@ static inline void alloc_contig_dump_pa
  #endif
  
  /* [start, end) must belong to a single zone. */
 -static int __alloc_contig_migrate_range(struct compact_control *cc,
 +int __alloc_contig_migrate_range(struct compact_control *cc,
                                        unsigned long start, unsigned long end)
  {
        /* This function is based on compact_zone() from compaction.c. */
  
        lru_cache_enable();
        if (ret < 0) {
 -              if (ret == -EBUSY)
 +              if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
                        alloc_contig_dump_pages(&cc->migratepages);
                putback_movable_pages(&cc->migratepages);
                return ret;
   *                    be either of the two.
   * @gfp_mask: GFP mask to use during compaction
   *
 - * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
 - * aligned.  The PFN range must belong to a single zone.
 + * The PFN range does not have to be pageblock aligned. The PFN range must
 + * belong to a single zone.
   *
   * The first thing this routine does is attempt to MIGRATE_ISOLATE all
   * pageblocks in the range.  Once isolated, the pageblocks should not
@@@ -9065,7 -9184,7 +9065,7 @@@ int alloc_contig_range(unsigned long st
                       unsigned migratetype, gfp_t gfp_mask)
  {
        unsigned long outer_start, outer_end;
 -      unsigned int order;
 +      int order;
        int ret = 0;
  
        struct compact_control cc = {
         * What we do here is we mark all pageblocks in range as
         * MIGRATE_ISOLATE.  Because pageblock and max order pages may
         * have different sizes, and due to the way page allocator
 -       * work, we align the range to biggest of the two pages so
 -       * that page allocator won't try to merge buddies from
 -       * different pageblocks and change MIGRATE_ISOLATE to some
 -       * other migration type.
 +       * works, start_isolate_page_range() has special handling for this.
         *
         * Once the pageblocks are marked as MIGRATE_ISOLATE, we
         * migrate the pages from an unaligned range (ie. pages that
 -       * we are interested in).  This will put all the pages in
 +       * we are interested in). This will put all the pages in
         * range back to page allocator as MIGRATE_ISOLATE.
         *
         * When this is done, we take the pages in range from page
         * put back to page allocator so that buddy can use them.
         */
  
 -      ret = start_isolate_page_range(pfn_max_align_down(start),
 -                                     pfn_max_align_up(end), migratetype, 0);
 +      ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
        if (ret)
 -              return ret;
 +              goto done;
  
        drain_all_pages(cc.zone);
  
        ret = 0;
  
        /*
 -       * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
 +       * Pages from [start, end) are within a pageblock_nr_pages
         * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
         * more, all pages in [start, end) are free in page allocator.
         * What we are going to do is to allocate all pages from
                free_contig_range(end, outer_end - end);
  
  done:
 -      undo_isolate_page_range(pfn_max_align_down(start),
 -                              pfn_max_align_up(end), migratetype);
 +      undo_isolate_page_range(start, end, migratetype);
        return ret;
  }
  EXPORT_SYMBOL(alloc_contig_range);
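
With the pfn_max_align_down()/pfn_max_align_up() helpers gone, the range handed to start_isolate_page_range()/undo_isolate_page_range() is the caller's own [start, end); as I understand the series, the pageblock-level alignment (and the splitting of any free page straddling a boundary via split_free_page()) now happens inside the isolation code. A small user-space comparison of how much extra area the two alignments would cover, assuming MAX_ORDER_NR_PAGES == 1024 and pageblock_nr_pages == 512 as on a typical x86_64 configuration (both values are assumptions here):

#include <stdio.h>

#define ALIGN_DOWN(x, a)	((x) & ~((unsigned long)(a) - 1))
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long start = 1000, end = 3000;	/* arbitrary pfn range */
	unsigned long max_order_nr = 1024, pageblock_nr = 512;

	printf("old (MAX_ORDER aligned): [%lu, %lu)\n",
	       ALIGN_DOWN(start, max_order_nr), ALIGN_UP(end, max_order_nr));
	printf("new (pageblock aligned): [%lu, %lu)\n",
	       ALIGN_DOWN(start, pageblock_nr), ALIGN_UP(end, pageblock_nr));
	return 0;
}

The old rounding isolates [0, 3072) for this sample range, the pageblock rounding only [512, 3072), which is why the change helps CMA and contiguous allocations succeed with less collateral isolation.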
@@@ -9501,6 -9625,7 +9501,6 @@@ bool put_page_back_buddy(struct page *p
                ClearPageHWPoisonTakenOff(page);
                __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
                if (TestClearPageHWPoison(page)) {
 -                      num_poisoned_pages_dec();
                        ret = true;
                }
        }
diff --combined mm/page_table_check.c
@@@ -52,6 -52,23 +52,6 @@@ static struct page_table_check *get_pag
        return (void *)(page_ext) + page_table_check_ops.offset;
  }
  
 -static inline bool pte_user_accessible_page(pte_t pte)
 -{
 -      return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER);
 -}
 -
 -static inline bool pmd_user_accessible_page(pmd_t pmd)
 -{
 -      return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) &&
 -              (pmd_val(pmd) & _PAGE_USER);
 -}
 -
 -static inline bool pud_user_accessible_page(pud_t pud)
 -{
 -      return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) &&
 -              (pud_val(pud) & _PAGE_USER);
 -}
 -
  /*
   * An entry is removed from the page table: decrement the counters for that
   * page and verify that it is of the correct type and that the counters do
   * not become negative.
@@@ -160,7 -177,7 +160,7 @@@ void __page_table_check_pmd_clear(struc
  
        if (pmd_user_accessible_page(pmd)) {
                page_table_check_clear(mm, addr, pmd_pfn(pmd),
 -                                     PMD_PAGE_SIZE >> PAGE_SHIFT);
 +                                     PMD_SIZE >> PAGE_SHIFT);
        }
  }
  EXPORT_SYMBOL(__page_table_check_pmd_clear);
@@@ -173,7 -190,7 +173,7 @@@ void __page_table_check_pud_clear(struc
  
        if (pud_user_accessible_page(pud)) {
                page_table_check_clear(mm, addr, pud_pfn(pud),
 -                                     PUD_PAGE_SIZE >> PAGE_SHIFT);
 +                                     PUD_SIZE >> PAGE_SHIFT);
        }
  }
  EXPORT_SYMBOL(__page_table_check_pud_clear);
@@@ -202,7 -219,7 +202,7 @@@ void __page_table_check_pmd_set(struct 
        __page_table_check_pmd_clear(mm, addr, *pmdp);
        if (pmd_user_accessible_page(pmd)) {
                page_table_check_set(mm, addr, pmd_pfn(pmd),
 -                                   PMD_PAGE_SIZE >> PAGE_SHIFT,
 +                                   PMD_SIZE >> PAGE_SHIFT,
                                     pmd_write(pmd));
        }
  }
@@@ -217,7 -234,7 +217,7 @@@ void __page_table_check_pud_set(struct 
        __page_table_check_pud_clear(mm, addr, *pudp);
        if (pud_user_accessible_page(pud)) {
                page_table_check_set(mm, addr, pud_pfn(pud),
 -                                   PUD_PAGE_SIZE >> PAGE_SHIFT,
 +                                   PUD_SIZE >> PAGE_SHIFT,
                                     pud_write(pud));
        }
  }
@@@ -234,11 -251,11 +234,11 @@@ void __page_table_check_pte_clear_range
                pte_t *ptep = pte_offset_map(&pmd, addr);
                unsigned long i;
  
-               pte_unmap(ptep);
                for (i = 0; i < PTRS_PER_PTE; i++) {
                        __page_table_check_pte_clear(mm, addr, *ptep);
                        addr += PAGE_SIZE;
                        ptep++;
                }
+               pte_unmap(ptep - PTRS_PER_PTE);
        }
  }
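
The fix above is purely about ordering: pte_offset_map() can create a temporary kernel mapping for the PTE page (for example with CONFIG_HIGHPTE), so every *ptep dereference has to happen before pte_unmap(), and the unmap must be given the pointer that was originally mapped. Restated as a minimal kernel-style pattern (a sketch mirroring the corrected loop, not standalone buildable code):

pte_t *ptep = pte_offset_map(&pmd, addr);	/* may kmap the PTE page */
unsigned long i;

for (i = 0; i < PTRS_PER_PTE; i++) {
	__page_table_check_pte_clear(mm, addr, *ptep);	/* needs the mapping */
	addr += PAGE_SIZE;
	ptep++;
}
pte_unmap(ptep - PTRS_PER_PTE);	/* unmap the pointer pte_offset_map() returned */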