From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 27 May 2022 18:29:35 +0000 (-0700)
Subject: Merge tag 'mm-hotfixes-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
X-Git-Tag: v6.6.17~7420
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=77fb622de1393b1d54f24f4f7ed98f84feeda502;hp=-c;p=platform%2Fkernel%2Flinux-rpi.git

Merge tag 'mm-hotfixes-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull hotfixes from Andrew Morton:
 "Six hotfixes.

  The page_table_check one from Miaohe Lin is considered a minor thing
  so it isn't marked for -stable. The remainder address pre-5.19 issues
  and are cc:stable"

* tag 'mm-hotfixes-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
  mm/page_table_check: fix accessing unmapped ptep
  kexec_file: drop weak attribute from arch_kexec_apply_relocations[_add]
  mm/page_alloc: always attempt to allocate at least one page during bulk allocation
  hugetlb: fix huge_pmd_unshare address update
  zsmalloc: fix races between asynchronous zspage free and page migration
  Revert "mm/cma.c: remove redundant cma_mutex lock"
---

77fb622de1393b1d54f24f4f7ed98f84feeda502
diff --combined mm/hugetlb.c
index 01f0e2e,410bbb0..7c468ac
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@@ -370,7 -370,7 +370,7 @@@ static void coalesce_file_region(struc
  }
  
  static inline long
 -hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
 +hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
  		     long to, struct hstate *h, struct hugetlb_cgroup *cg,
  		     long *regions_needed)
  {
@@@ -379,7 -379,7 +379,7 @@@
  	if (!regions_needed) {
  		nrg = get_file_region_entry_from_cache(map, from, to);
  		record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
 -		list_add(&nrg->link, rg->link.prev);
 +		list_add(&nrg->link, rg);
  		coalesce_file_region(map, nrg);
  	} else
  		*regions_needed += 1;
@@@ -402,52 -402,47 +402,52 @@@ static long add_reservation_in_range(st
  	long add = 0;
  	struct list_head *head = &resv->regions;
  	long last_accounted_offset = f;
 -	struct file_region *rg = NULL, *trg = NULL;
 +	struct file_region *iter, *trg = NULL;
 +	struct list_head *rg = NULL;
  
  	if (regions_needed)
  		*regions_needed = 0;
  
  	/* In this loop, we essentially handle an entry for the range
 -	 * [last_accounted_offset, rg->from), at every iteration, with some
 +	 * [last_accounted_offset, iter->from), at every iteration, with some
  	 * bounds checking.
  	 */
 -	list_for_each_entry_safe(rg, trg, head, link) {
 +	list_for_each_entry_safe(iter, trg, head, link) {
  		/* Skip irrelevant regions that start before our range. */
 -		if (rg->from < f) {
 +		if (iter->from < f) {
  			/* If this region ends after the last accounted offset,
  			 * then we need to update last_accounted_offset.
  			 */
 -			if (rg->to > last_accounted_offset)
 -				last_accounted_offset = rg->to;
 +			if (iter->to > last_accounted_offset)
 +				last_accounted_offset = iter->to;
  			continue;
  		}
  
  		/* When we find a region that starts beyond our range, we've
  		 * finished.
  		 */
 -		if (rg->from >= t)
 +		if (iter->from >= t) {
 +			rg = iter->link.prev;
  			break;
 +		}
  
 -		/* Add an entry for last_accounted_offset -> rg->from, and
 +		/* Add an entry for last_accounted_offset -> iter->from, and
  		 * update last_accounted_offset.
  		 */
 -		if (rg->from > last_accounted_offset)
 -			add += hugetlb_resv_map_add(resv, rg,
 +		if (iter->from > last_accounted_offset)
 +			add += hugetlb_resv_map_add(resv, iter->link.prev,
  						    last_accounted_offset,
 -						    rg->from, h, h_cg,
 +						    iter->from, h, h_cg,
  						    regions_needed);
  
 -		last_accounted_offset = rg->to;
 +		last_accounted_offset = iter->to;
  	}
  
  	/* Handle the case where our range extends beyond
  	 * last_accounted_offset.
  	 */
 +	if (!rg)
 +		rg = head->prev;
  	if (last_accounted_offset < t)
  		add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
  					    t, h, h_cg, regions_needed);
@@@ -1540,7 -1535,7 +1540,7 @@@ static void __update_and_free_page(stru
  	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
  		return;
  
 -	if (alloc_huge_page_vmemmap(h, page)) {
 +	if (hugetlb_vmemmap_alloc(h, page)) {
  		spin_lock_irq(&hugetlb_lock);
  		/*
  		 * If we cannot allocate vmemmap pages, just refuse to free the
@@@ -1617,7 -1612,7 +1617,7 @@@ static DECLARE_WORK(free_hpage_work, fr
  
  static inline void flush_free_hpage_work(struct hstate *h)
  {
 -	if (free_vmemmap_pages_per_hpage(h))
 +	if (hugetlb_optimize_vmemmap_pages(h))
  		flush_work(&free_hpage_work);
  }
  
@@@ -1677,8 -1672,6 +1677,8 @@@ void free_huge_page(struct page *page
  	VM_BUG_ON_PAGE(page_mapcount(page), page);
  
  	hugetlb_set_page_subpool(page, NULL);
 +	if (PageAnon(page))
 +		__ClearPageAnonExclusive(page);
  	page->mapping = NULL;
  	restore_reserve = HPageRestoreReserve(page);
  	ClearHPageRestoreReserve(page);
@@@ -1739,7 -1732,7 +1739,7 @@@ static void __prep_account_new_huge_pag
  
  static void __prep_new_huge_page(struct hstate *h, struct page *page)
  {
 -	free_huge_page_vmemmap(h, page);
 +	hugetlb_vmemmap_free(h, page);
  	INIT_LIST_HEAD(&page->lru);
  	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
  	hugetlb_set_page_subpool(page, NULL);
@@@ -2112,7 -2105,7 +2112,7 @@@ retry
  		 * Attempt to allocate vmemmmap here so that we can take
  		 * appropriate action on failure.
  		 */
 -		rc = alloc_huge_page_vmemmap(h, head);
 +		rc = hugetlb_vmemmap_alloc(h, head);
  		if (!rc) {
  			/*
  			 * Move PageHWPoison flag from head page to the raw
@@@ -2986,6 -2979,8 +2986,6 @@@ int __alloc_bootmem_huge_page(struct hs
  	struct huge_bootmem_page *m = NULL; /* initialize for clang */
  	int nr_nodes, node;
  
 -	if (nid != NUMA_NO_NODE && nid >= nr_online_nodes)
 -		return 0;
  	/* do node specific alloc */
  	if (nid != NUMA_NO_NODE) {
  		m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
@@@ -3093,7 -3088,7 +3093,7 @@@ static void __init hugetlb_hstate_alloc
  	}
  
  	/* do node specific alloc */
 -	for (i = 0; i < nr_online_nodes; i++) {
 +	for_each_online_node(i) {
  		if (h->max_huge_pages_node[i] > 0) {
  			hugetlb_hstate_alloc_pages_onenode(h, i);
  			node_specific_alloc = true;
@@@ -3425,7 -3420,7 +3425,7 @@@ static int demote_free_huge_page(struc
  	remove_hugetlb_page_for_demote(h, page, false);
  	spin_unlock_irq(&hugetlb_lock);
  
 -	rc = alloc_huge_page_vmemmap(h, page);
 +	rc = hugetlb_vmemmap_alloc(h, page);
  	if (rc) {
  		/* Allocation of vmemmmap failed, we can not demote page */
  		spin_lock_irq(&hugetlb_lock);
@@@ -4057,7 -4052,7 +4057,7 @@@ static int __init hugetlb_init(void
  			default_hstate.max_huge_pages =
  				default_hstate_max_huge_pages;
  
 -			for (i = 0; i < nr_online_nodes; i++)
 +			for_each_online_node(i)
  				default_hstate.max_huge_pages_node[i] =
  					default_hugepages_in_node[i];
  		}
@@@ -4124,20 -4119,6 +4124,20 @@@ bool __init __weak hugetlb_node_alloc_s
  {
  	return true;
  }
 +
 +static void __init hugepages_clear_pages_in_node(void)
 +{
 +	if (!hugetlb_max_hstate) {
 +		default_hstate_max_huge_pages = 0;
 +		memset(default_hugepages_in_node, 0,
 +			MAX_NUMNODES * sizeof(unsigned int));
 +	} else {
 +		parsed_hstate->max_huge_pages = 0;
 +		memset(parsed_hstate->max_huge_pages_node, 0,
 +			MAX_NUMNODES * sizeof(unsigned int));
 +	}
 +}
 +
  /*
   * hugepages command line processing
   * hugepages normally follows a valid hugepagsz or default_hugepagsz
@@@ -4157,7 -4138,7 +4157,7 @@@ static int __init hugepages_setup(char 
  	if (!parsed_valid_hugepagesz) {
  		pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
  		parsed_valid_hugepagesz = true;
 -		return 0;
 +		return 1;
  	}
  
  	/*
@@@ -4173,7 -4154,7 +4173,7 @@@
  
  	if (mhp == last_mhp) {
  		pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
 -		return 0;
 +		return 1;
  	}
  
  	while (*p) {
@@@ -4184,11 -4165,11 +4184,11 @@@
  		if (p[count] == ':') {
  			if (!hugetlb_node_alloc_supported()) {
  				pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
 -				return 0;
 +				return 1;
  			}
 -			if (tmp >= nr_online_nodes)
 +			if (tmp >= MAX_NUMNODES || !node_online(tmp))
  				goto invalid;
 -			node = array_index_nospec(tmp, nr_online_nodes);
 +			node = array_index_nospec(tmp, MAX_NUMNODES);
  			p += count + 1;
  			/* Parse hugepages */
  			if (sscanf(p, "%lu%n", &tmp, &count) != 1)
@@@ -4225,8 -4206,7 +4225,8 @@@
  
  invalid:
  	pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
 -	return 0;
 +	hugepages_clear_pages_in_node();
 +	return 1;
  }
  __setup("hugepages=", hugepages_setup);
  
@@@ -4247,7 -4227,7 +4247,7 @@@ static int __init hugepagesz_setup(cha
  
  	if (!arch_hugetlb_valid_size(size)) {
  		pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
 -		return 0;
 +		return 1;
  	}
  
  	h = size_to_hstate(size);
@@@ -4262,7 -4242,7 +4262,7 @@@
  		if (!parsed_default_hugepagesz ||  h != &default_hstate ||
  		    default_hstate.max_huge_pages) {
  			pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
 -			return 0;
 +			return 1;
  		}
  
  		/*
@@@ -4293,14 -4273,14 +4293,14 @@@ static int __init default_hugepagesz_se
  	parsed_valid_hugepagesz = false;
  	if (parsed_default_hugepagesz) {
  		pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
 -		return 0;
 +		return 1;
  	}
  
  	size = (unsigned long)memparse(s, NULL);
  
  	if (!arch_hugetlb_valid_size(size)) {
  		pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
 -		return 0;
 +		return 1;
  	}
  
  	hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
@@@ -4317,7 -4297,7 +4317,7 @@@
  	 */
  	if (default_hstate_max_huge_pages) {
  		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
 -		for (i = 0; i < nr_online_nodes; i++)
 +		for_each_online_node(i)
  			default_hstate.max_huge_pages_node[i] =
  				default_hugepages_in_node[i];
  		if (hstate_is_gigantic(&default_hstate))
@@@ -4719,27 -4699,24 +4719,27 @@@ hugetlb_install_page(struct vm_area_str
  }
  
  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 -			    struct vm_area_struct *vma)
 +			    struct vm_area_struct *dst_vma,
 +			    struct vm_area_struct *src_vma)
  {
  	pte_t *src_pte, *dst_pte, entry, dst_entry;
  	struct page *ptepage;
  	unsigned long addr;
 -	bool cow = is_cow_mapping(vma->vm_flags);
 -	struct hstate *h = hstate_vma(vma);
 +	bool cow = is_cow_mapping(src_vma->vm_flags);
 +	struct hstate *h = hstate_vma(src_vma);
  	unsigned long sz = huge_page_size(h);
  	unsigned long npages = pages_per_huge_page(h);
 -	struct address_space *mapping = vma->vm_file->f_mapping;
 +	struct address_space *mapping = src_vma->vm_file->f_mapping;
  	struct mmu_notifier_range range;
  	int ret = 0;
  
  	if (cow) {
 -		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
 -					vma->vm_start,
 -					vma->vm_end);
 +		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
 +					src_vma->vm_start,
 +					src_vma->vm_end);
  		mmu_notifier_invalidate_range_start(&range);
 +		mmap_assert_write_locked(src);
 +		raw_write_seqcount_begin(&src->write_protect_seq);
  	} else {
  		/*
  		 * For shared mappings i_mmap_rwsem must be held to call
@@@ -4750,12 -4727,12 +4750,12 @@@
  		i_mmap_lock_read(mapping);
  	}
  
 -	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 +	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
  		spinlock_t *src_ptl, *dst_ptl;
  		src_pte = huge_pte_offset(src, addr, sz);
  		if (!src_pte)
  			continue;
 -		dst_pte = huge_pte_alloc(dst, vma, addr, sz);
 +		dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
  		if (!dst_pte) {
  			ret = -ENOMEM;
  			break;
@@@ -4790,9 -4767,8 +4790,9 @@@ again
  		} else if (unlikely(is_hugetlb_entry_migration(entry) ||
  				    is_hugetlb_entry_hwpoisoned(entry))) {
  			swp_entry_t swp_entry = pte_to_swp_entry(entry);
 +			bool uffd_wp = huge_pte_uffd_wp(entry);
  
 -			if (is_writable_migration_entry(swp_entry) && cow) {
 +			if (!is_readable_migration_entry(swp_entry) && cow) {
  				/*
  				 * COW mappings require pages in both
  				 * parent and child to be set to read.
@@@ -4800,53 -4776,38 +4800,53 @@@
  				swp_entry = make_readable_migration_entry(
  							swp_offset(swp_entry));
  				entry = swp_entry_to_pte(swp_entry);
 +				if (userfaultfd_wp(src_vma) && uffd_wp)
 +					entry = huge_pte_mkuffd_wp(entry);
  				set_huge_swap_pte_at(src, addr, src_pte,
  						     entry, sz);
  			}
 +			if (!userfaultfd_wp(dst_vma) && uffd_wp)
 +				entry = huge_pte_clear_uffd_wp(entry);
  			set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
 +		} else if (unlikely(is_pte_marker(entry))) {
 +			/*
 +			 * We copy the pte marker only if the dst vma has
 +			 * uffd-wp enabled.
 +			 */
 +			if (userfaultfd_wp(dst_vma))
 +				set_huge_pte_at(dst, addr, dst_pte, entry);
  		} else {
  			entry = huge_ptep_get(src_pte);
  			ptepage = pte_page(entry);
  			get_page(ptepage);
  
  			/*
 -			 * This is a rare case where we see pinned hugetlb
 -			 * pages while they're prone to COW.  We need to do the
 -			 * COW earlier during fork.
 +			 * Failing to duplicate the anon rmap is a rare case
 +			 * where we see pinned hugetlb pages while they're
 +			 * prone to COW. We need to do the COW earlier during
 +			 * fork.
  			 *
  			 * When pre-allocating the page or copying data, we
  			 * need to be without the pgtable locks since we could
  			 * sleep during the process.
  			 */
 -			if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
 +			if (!PageAnon(ptepage)) {
 +				page_dup_file_rmap(ptepage, true);
 +			} else if (page_try_dup_anon_rmap(ptepage, true,
 +							  src_vma)) {
  				pte_t src_pte_old = entry;
  				struct page *new;
  
  				spin_unlock(src_ptl);
  				spin_unlock(dst_ptl);
  				/* Do not use reserve as it's private owned */
 -				new = alloc_huge_page(vma, addr, 1);
 +				new = alloc_huge_page(dst_vma, addr, 1);
  				if (IS_ERR(new)) {
  					put_page(ptepage);
  					ret = PTR_ERR(new);
  					break;
  				}
 -				copy_user_huge_page(new, ptepage, addr, vma,
 +				copy_user_huge_page(new, ptepage, addr, dst_vma,
  						    npages);
  				put_page(ptepage);
  
@@@ -4856,13 -4817,13 +4856,13 @@@
  				spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
  				entry = huge_ptep_get(src_pte);
  				if (!pte_same(src_pte_old, entry)) {
 -					restore_reserve_on_error(h, vma, addr,
 +					restore_reserve_on_error(h, dst_vma, addr,
  								new);
  					put_page(new);
  					/* dst_entry won't change as in child */
  					goto again;
  				}
 -				hugetlb_install_page(vma, dst_pte, addr, new);
 +				hugetlb_install_page(dst_vma, dst_pte, addr, new);
  				spin_unlock(src_ptl);
  				spin_unlock(dst_ptl);
  				continue;
@@@ -4880,6 -4841,7 +4880,6 @@@
  				entry = huge_pte_wrprotect(entry);
  			}
  
 -			page_dup_rmap(ptepage, true);
  			set_huge_pte_at(dst, addr, dst_pte, entry);
  			hugetlb_count_add(npages, dst);
  		}
@@@ -4887,12 -4849,10 +4887,12 @@@
  		spin_unlock(dst_ptl);
  	}
  
 -	if (cow)
 +	if (cow) {
 +		raw_write_seqcount_end(&src->write_protect_seq);
  		mmu_notifier_invalidate_range_end(&range);
 -	else
 +	} else {
  		i_mmap_unlock_read(mapping);
 +	}
  
  	return ret;
  }
@@@ -4936,17 -4896,10 +4936,17 @@@ int move_hugetlb_page_tables(struct vm_
  	unsigned long old_addr_copy;
  	pte_t *src_pte, *dst_pte;
  	struct mmu_notifier_range range;
 +	bool shared_pmd = false;
  
  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
  				old_end);
  	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 +	/*
 +	 * In case of shared PMDs, we should cover the maximum possible
 +	 * range.
 +	 */
 +	flush_cache_range(vma, range.start, range.end);
 +
  	mmu_notifier_invalidate_range_start(&range);
  	/* Prevent race with file truncation */
  	i_mmap_lock_write(mapping);
@@@ -4963,10 -4916,8 +4963,10 @@@
  		 */
  		old_addr_copy = old_addr;
  
 -		if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
 +		if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
 +			shared_pmd = true;
  			continue;
 +		}
  
  		dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
  		if (!dst_pte)
@@@ -4974,11 -4925,7 +4974,11 @@@
  
  		move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
  	}
 -	flush_tlb_range(vma, old_end - len, old_end);
 +
 +	if (shared_pmd)
 +		flush_tlb_range(vma, range.start, range.end);
 +	else
 +		flush_tlb_range(vma, old_end - len, old_end);
  	mmu_notifier_invalidate_range_end(&range);
  	i_mmap_unlock_write(mapping);
  
@@@ -4987,7 -4934,7 +4987,7 @@@
  
  static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
  				   unsigned long start, unsigned long end,
 -				   struct page *ref_page)
 +				   struct page *ref_page, zap_flags_t zap_flags)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address;
@@@ -5043,18 -4990,7 +5043,18 @@@
  		 * unmapped and its refcount is dropped, so just clear pte here.
  		 */
  		if (unlikely(!pte_present(pte))) {
 -			huge_pte_clear(mm, address, ptep, sz);
 +			/*
 +			 * If the pte was wr-protected by uffd-wp in any of the
 +			 * swap forms, meanwhile the caller does not want to
 +			 * drop the uffd-wp bit in this zap, then replace the
 +			 * pte with a marker.
 +			 */
 +			if (pte_swp_uffd_wp_any(pte) &&
 +			    !(zap_flags & ZAP_FLAG_DROP_MARKER))
 +				set_huge_pte_at(mm, address, ptep,
 +						make_pte_marker(PTE_MARKER_UFFD_WP));
 +			else
 +				huge_pte_clear(mm, address, ptep, sz);
  			spin_unlock(ptl);
  			continue;
  		}
@@@ -5082,11 -5018,7 +5082,11 @@@
  		tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
  		if (huge_pte_dirty(pte))
  			set_page_dirty(page);
 -
 +		/* Leave a uffd-wp pte marker if needed */
 +		if (huge_pte_uffd_wp(pte) &&
 +		    !(zap_flags & ZAP_FLAG_DROP_MARKER))
 +			set_huge_pte_at(mm, address, ptep,
 +					make_pte_marker(PTE_MARKER_UFFD_WP));
  		hugetlb_count_sub(pages_per_huge_page(h), mm);
  		page_remove_rmap(page, vma, true);
  
@@@ -5120,10 -5052,9 +5120,10 @@@
  
  void __unmap_hugepage_range_final(struct mmu_gather *tlb,
  			  struct vm_area_struct *vma, unsigned long start,
 -			  unsigned long end, struct page *ref_page)
 +			  unsigned long end, struct page *ref_page,
 +			  zap_flags_t zap_flags)
  {
 -	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
 +	__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
  
  	/*
  	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
@@@ -5139,13 -5070,12 +5139,13 @@@
  }
  
  void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 -			  unsigned long end, struct page *ref_page)
 +			  unsigned long end, struct page *ref_page,
 +			  zap_flags_t zap_flags)
  {
  	struct mmu_gather tlb;
  
  	tlb_gather_mmu(&tlb, vma->vm_mm);
 -	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
 +	__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
  	tlb_finish_mmu(&tlb);
  }
  
@@@ -5200,22 -5130,21 +5200,22 @@@ static void unmap_ref_private(struct mm
  		 */
  		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
  			unmap_hugepage_range(iter_vma, address,
 -					     address + huge_page_size(h), page);
 +					     address + huge_page_size(h), page, 0);
  	}
  	i_mmap_unlock_write(mapping);
  }
  
  /*
 - * Hugetlb_cow() should be called with page lock of the original hugepage held.
 + * hugetlb_wp() should be called with page lock of the original hugepage held.
   * Called with hugetlb_fault_mutex_table held and pte_page locked so we
   * cannot race with other handlers or page migration.
   * Keep the pte_same checks anyway to make transition from the mutex easier.
   */
 -static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 -		       unsigned long address, pte_t *ptep,
 +static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 +		       unsigned long address, pte_t *ptep, unsigned int flags,
  		       struct page *pagecache_page, spinlock_t *ptl)
  {
 +	const bool unshare = flags & FAULT_FLAG_UNSHARE;
  	pte_t pte;
  	struct hstate *h = hstate_vma(vma);
  	struct page *old_page, *new_page;
@@@ -5224,26 -5153,17 +5224,26 @@@
  	unsigned long haddr = address & huge_page_mask(h);
  	struct mmu_notifier_range range;
  
 +	VM_BUG_ON(unshare && (flags & FOLL_WRITE));
 +	VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
 +
  	pte = huge_ptep_get(ptep);
  	old_page = pte_page(pte);
  
  retry_avoidcopy:
 -	/* If no-one else is actually using this page, avoid the copy
 -	 * and just make the page writable */
 +	/*
 +	 * If no-one else is actually using this page, we're the exclusive
 +	 * owner and can reuse this page.
 +	 */
  	if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
 -		page_move_anon_rmap(old_page, vma);
 -		set_huge_ptep_writable(vma, haddr, ptep);
 +		if (!PageAnonExclusive(old_page))
 +			page_move_anon_rmap(old_page, vma);
 +		if (likely(!unshare))
 +			set_huge_ptep_writable(vma, haddr, ptep);
  		return 0;
  	}
 +	VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
 +		       old_page);
  
  	/*
  	 * If the process that created a MAP_PRIVATE mapping is about to
@@@ -5342,13 -5262,13 +5342,13 @@@
  	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
  		ClearHPageRestoreReserve(new_page);
  
 -		/* Break COW */
 +		/* Break COW or unshare */
  		huge_ptep_clear_flush(vma, haddr, ptep);
  		mmu_notifier_invalidate_range(mm, range.start, range.end);
  		page_remove_rmap(old_page, vma, true);
  		hugepage_add_new_anon_rmap(new_page, vma, haddr);
  		set_huge_pte_at(mm, haddr, ptep,
 -				make_huge_pte(vma, new_page, 1));
 +				make_huge_pte(vma, new_page, !unshare));
  		SetHPageMigratable(new_page);
  		/* Make the old page be freed below */
  		new_page = old_page;
@@@ -5356,10 -5276,7 +5356,10 @@@
  	spin_unlock(ptl);
  	mmu_notifier_invalidate_range_end(&range);
  out_release_all:
 -	/* No restore in case of successful pagetable update (Break COW) */
 +	/*
 +	 * No restore in case of successful pagetable update (Break COW or
 +	 * unshare)
 +	 */
  	if (new_page != old_page)
  		restore_reserve_on_error(h, vma, haddr, new_page);
  	put_page(new_page);
@@@ -5469,8 -5386,7 +5469,8 @@@ static inline vm_fault_t hugetlb_handle
  static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
  			struct vm_area_struct *vma,
  			struct address_space *mapping, pgoff_t idx,
 -			unsigned long address, pte_t *ptep, unsigned int flags)
 +			unsigned long address, pte_t *ptep,
 +			pte_t old_pte, unsigned int flags)
  {
  	struct hstate *h = hstate_vma(vma);
  	vm_fault_t ret = VM_FAULT_SIGBUS;
@@@ -5485,8 -5401,7 +5485,8 @@@
  	/*
  	 * Currently, we are forced to kill the process in the event the
  	 * original mapper has unmapped pages from the child due to a failed
 -	 * COW. Warn that such a situation has occurred as it may not be obvious
 +	 * COW/unsharing. Warn that such a situation has occurred as it may not
 +	 * be obvious.
  	 */
  	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
  		pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
@@@ -5597,29 -5512,22 +5597,29 @@@ retry
  
  	ptl = huge_pte_lock(h, mm, ptep);
  	ret = 0;
 -	if (!huge_pte_none(huge_ptep_get(ptep)))
 +	/* If pte changed from under us, retry */
 +	if (!pte_same(huge_ptep_get(ptep), old_pte))
  		goto backout;
  
  	if (anon_rmap) {
  		ClearHPageRestoreReserve(page);
  		hugepage_add_new_anon_rmap(page, vma, haddr);
  	} else
 -		page_dup_rmap(page, true);
 +		page_dup_file_rmap(page, true);
  	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
  				&& (vma->vm_flags & VM_SHARED)));
 +	/*
 +	 * If this pte was previously wr-protected, keep it wr-protected even
 +	 * if populated.
 +	 */
 +	if (unlikely(pte_marker_uffd_wp(old_pte)))
 +		new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
  	set_huge_pte_at(mm, haddr, ptep, new_pte);
  
  	hugetlb_count_add(pages_per_huge_page(h), mm);
  	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
  		/* Optimization, do the COW without a second fault */
 -		ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
 +		ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
  	}
  
  	spin_unlock(ptl);
@@@ -5731,10 -5639,8 +5731,10 @@@ vm_fault_t hugetlb_fault(struct mm_stru
  	mutex_lock(&hugetlb_fault_mutex_table[hash]);
  
  	entry = huge_ptep_get(ptep);
 -	if (huge_pte_none(entry)) {
 -		ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
 +	/* PTE markers should be handled the same way as none pte */
 +	if (huge_pte_none_mostly(entry)) {
 +		ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
 +				      entry, flags);
  		goto out_mutex;
  	}
  
@@@ -5751,15 -5657,14 +5751,15 @@@
  		goto out_mutex;
  
  	/*
 -	 * If we are going to COW the mapping later, we examine the pending
 -	 * reservations for this page now. This will ensure that any
 +	 * If we are going to COW/unshare the mapping later, we examine the
 +	 * pending reservations for this page now. This will ensure that any
  	 * allocations necessary to record that reservation occur outside the
  	 * spinlock. For private mappings, we also lookup the pagecache
  	 * page now as it is used to determine if a reservation has been
  	 * consumed.
  	 */
 -	if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
 +	if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
 +	    !huge_pte_write(entry)) {
  		if (vma_needs_reservation(h, vma, haddr) < 0) {
  			ret = VM_FAULT_OOM;
  			goto out_mutex;
@@@ -5774,32 -5679,12 +5774,32 @@@
  
  	ptl = huge_pte_lock(h, mm, ptep);
  
 -	/* Check for a racing update before calling hugetlb_cow */
 +	/* Check for a racing update before calling hugetlb_wp() */
  	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
  		goto out_ptl;
  
 +	/* Handle userfault-wp first, before trying to lock more pages */
 +	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
 +	    (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
 +		struct vm_fault vmf = {
 +			.vma = vma,
 +			.address = haddr,
 +			.real_address = address,
 +			.flags = flags,
 +		};
 +
 +		spin_unlock(ptl);
 +		if (pagecache_page) {
 +			unlock_page(pagecache_page);
 +			put_page(pagecache_page);
 +		}
 +		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 +		i_mmap_unlock_read(mapping);
 +		return handle_userfault(&vmf, VM_UFFD_WP);
 +	}
 +
  	/*
 -	 * hugetlb_cow() requires page locks of pte_page(entry) and
 +	 * hugetlb_wp() requires page locks of pte_page(entry) and
  	 * pagecache_page, so here we need take the former one
  	 * when page != pagecache_page or !pagecache_page.
  	 */
@@@ -5812,14 -5697,13 +5812,14 @@@
  
  	get_page(page);
  
 -	if (flags & FAULT_FLAG_WRITE) {
 +	if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
  		if (!huge_pte_write(entry)) {
 -			ret = hugetlb_cow(mm, vma, address, ptep,
 -					  pagecache_page, ptl);
 +			ret = hugetlb_wp(mm, vma, address, ptep, flags,
 +					 pagecache_page, ptl);
  			goto out_put_page;
 +		} else if (likely(flags & FAULT_FLAG_WRITE)) {
 +			entry = huge_pte_mkdirty(entry);
  		}
 -		entry = huge_pte_mkdirty(entry);
  	}
  	entry = pte_mkyoung(entry);
  	if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
@@@ -5862,8 -5746,7 +5862,8 @@@ int hugetlb_mcopy_atomic_pte(struct mm_
  			    unsigned long dst_addr,
  			    unsigned long src_addr,
  			    enum mcopy_atomic_mode mode,
 -			    struct page **pagep)
 +			    struct page **pagep,
 +			    bool wp_copy)
  {
  	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
  	struct hstate *h = hstate_vma(dst_vma);
@@@ -5993,43 -5876,27 +5993,43 @@@
  		goto out_release_unlock;
  
  	ret = -EEXIST;
 -	if (!huge_pte_none(huge_ptep_get(dst_pte)))
 +	/*
 +	 * We allow to overwrite a pte marker: consider when both MISSING|WP
 +	 * registered, we firstly wr-protect a none pte which has no page cache
 +	 * page backing it, then access the page.
 +	 */
 +	if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
  		goto out_release_unlock;
  
  	if (vm_shared) {
 -		page_dup_rmap(page, true);
 +		page_dup_file_rmap(page, true);
  	} else {
  		ClearHPageRestoreReserve(page);
  		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
  	}
  
 -	/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
 -	if (is_continue && !vm_shared)
 +	/*
 +	 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
 +	 * with wp flag set, don't set pte write bit.
 +	 */
 +	if (wp_copy || (is_continue && !vm_shared))
  		writable = 0;
  	else
  		writable = dst_vma->vm_flags & VM_WRITE;
  
  	_dst_pte = make_huge_pte(dst_vma, page, writable);
 -	if (writable)
 -		_dst_pte = huge_pte_mkdirty(_dst_pte);
 +	/*
 +	 * Always mark UFFDIO_COPY page dirty; note that this may not be
 +	 * extremely important for hugetlbfs for now since swapping is not
 +	 * supported, but we should still be clear in that this page cannot be
 +	 * thrown away at will, even if write bit not set.
 +	 */
 +	_dst_pte = huge_pte_mkdirty(_dst_pte);
  	_dst_pte = pte_mkyoung(_dst_pte);
  
 +	if (wp_copy)
 +		_dst_pte = huge_pte_mkuffd_wp(_dst_pte);
 +
  	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
  
  	(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
@@@ -6073,25 -5940,6 +6073,25 @@@ static void record_subpages_vmas(struc
  	}
  }
  
 +static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
 +					       bool *unshare)
 +{
 +	pte_t pteval = huge_ptep_get(pte);
 +
 +	*unshare = false;
 +	if (is_swap_pte(pteval))
 +		return true;
 +	if (huge_pte_write(pteval))
 +		return false;
 +	if (flags & FOLL_WRITE)
 +		return true;
 +	if (gup_must_unshare(flags, pte_page(pteval))) {
 +		*unshare = true;
 +		return true;
 +	}
 +	return false;
 +}
 +
  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
  			 struct page **pages, struct vm_area_struct **vmas,
  			 unsigned long *position, unsigned long *nr_pages,
@@@ -6106,7 -5954,6 +6106,7 @@@
  	while (vaddr < vma->vm_end && remainder) {
  		pte_t *pte;
  		spinlock_t *ptl = NULL;
 +		bool unshare = false;
  		int absent;
  		struct page *page;
  
@@@ -6157,8 -6004,9 +6157,8 @@@
  		 * both cases, and because we can't follow correct pages
  		 * directly from any kind of swap entries.
  		 */
 -		if (absent || is_swap_pte(huge_ptep_get(pte)) ||
 -		    ((flags & FOLL_WRITE) &&
 -		      !huge_pte_write(huge_ptep_get(pte)))) {
 +		if (absent ||
 +		    __follow_hugetlb_must_fault(flags, pte, &unshare)) {
  			vm_fault_t ret;
  			unsigned int fault_flags = 0;
  
@@@ -6166,8 -6014,6 +6166,8 @@@
  				spin_unlock(ptl);
  			if (flags & FOLL_WRITE)
  				fault_flags |= FAULT_FLAG_WRITE;
 +			else if (unshare)
 +				fault_flags |= FAULT_FLAG_UNSHARE;
  			if (locked)
  				fault_flags |= FAULT_FLAG_ALLOW_RETRY |
  					FAULT_FLAG_KILLABLE;
@@@ -6209,9 -6055,6 +6209,9 @@@
  		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
  		page = pte_page(huge_ptep_get(pte));
  
 +		VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
 +			       !PageAnonExclusive(page), page);
 +
  		/*
  		 * If subpage information not requested, update counters
  		 * and skip the same_page loop below.
@@@ -6274,19 -6117,16 +6274,19 @@@
  }
  
  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 -		unsigned long address, unsigned long end, pgprot_t newprot)
 +		unsigned long address, unsigned long end,
 +		pgprot_t newprot, unsigned long cp_flags)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long start = address;
  	pte_t *ptep;
  	pte_t pte;
  	struct hstate *h = hstate_vma(vma);
 -	unsigned long pages = 0;
 +	unsigned long pages = 0, psize = huge_page_size(h);
  	bool shared_pmd = false;
  	struct mmu_notifier_range range;
 +	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 +	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
  
  	/*
  	 * In the case of shared PMDs, the area to flush could be beyond
@@@ -6302,19 -6142,13 +6302,19 @@@
  
  	mmu_notifier_invalidate_range_start(&range);
  	i_mmap_lock_write(vma->vm_file->f_mapping);
 -	for (; address < end; address += huge_page_size(h)) {
 +	for (; address < end; address += psize) {
  		spinlock_t *ptl;
 -		ptep = huge_pte_offset(mm, address, huge_page_size(h));
 +		ptep = huge_pte_offset(mm, address, psize);
  		if (!ptep)
  			continue;
  		ptl = huge_pte_lock(h, mm, ptep);
  		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
 +			/*
 +			 * When uffd-wp is enabled on the vma, unshare
 +			 * shouldn't happen at all.  Warn about it if it
 +			 * happened due to some reason.
 +			 */
 +			WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
  			pages++;
  			spin_unlock(ptl);
  			shared_pmd = true;
@@@ -6327,37 -6161,20 +6327,37 @@@
  		}
  		if (unlikely(is_hugetlb_entry_migration(pte))) {
  			swp_entry_t entry = pte_to_swp_entry(pte);
 +			struct page *page = pfn_swap_entry_to_page(entry);
  
 -			if (is_writable_migration_entry(entry)) {
 +			if (!is_readable_migration_entry(entry)) {
  				pte_t newpte;
  
 -				entry = make_readable_migration_entry(
 -							swp_offset(entry));
 +				if (PageAnon(page))
 +					entry = make_readable_exclusive_migration_entry(
 +								swp_offset(entry));
 +				else
 +					entry = make_readable_migration_entry(
 +								swp_offset(entry));
  				newpte = swp_entry_to_pte(entry);
 +				if (uffd_wp)
 +					newpte = pte_swp_mkuffd_wp(newpte);
 +				else if (uffd_wp_resolve)
 +					newpte = pte_swp_clear_uffd_wp(newpte);
  				set_huge_swap_pte_at(mm, address, ptep,
 -						     newpte, huge_page_size(h));
 +						     newpte, psize);
  				pages++;
  			}
  			spin_unlock(ptl);
  			continue;
  		}
 +		if (unlikely(pte_marker_uffd_wp(pte))) {
 +			/*
 +			 * This is changing a non-present pte into a none pte,
 +			 * no need for huge_ptep_modify_prot_start/commit().
 +			 */
 +			if (uffd_wp_resolve)
 +				huge_pte_clear(mm, address, ptep, psize);
 +		}
  		if (!huge_pte_none(pte)) {
  			pte_t old_pte;
  			unsigned int shift = huge_page_shift(hstate_vma(vma));
@@@ -6365,18 -6182,8 +6365,18 @@@
  			old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
  			pte = huge_pte_modify(old_pte, newprot);
  			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
 +			if (uffd_wp)
 +				pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
 +			else if (uffd_wp_resolve)
 +				pte = huge_pte_clear_uffd_wp(pte);
  			huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
  			pages++;
 +		} else {
 +			/* None pte */
 +			if (unlikely(uffd_wp))
 +				/* Safe to modify directly (none->non-present). */
 +				set_huge_pte_at(mm, address, ptep,
 +						make_pte_marker(PTE_MARKER_UFFD_WP));
  		}
  		spin_unlock(ptl);
  	}
@@@ -6755,7 -6562,14 +6755,14 @@@ int huge_pmd_unshare(struct mm_struct *
  	pud_clear(pud);
  	put_page(virt_to_page(ptep));
  	mm_dec_nr_pmds(mm);
- 	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+ 	/*
+ 	 * This update of passed address optimizes loops sequentially
+ 	 * processing addresses in increments of huge page size (PMD_SIZE
+ 	 * in this case).  By clearing the pud, a PUD_SIZE area is unmapped.
+ 	 * Update address to the 'last page' in the cleared area so that
+ 	 * calling loop can move to first page past this area.
+ 	 */
+ 	*addr |= PUD_SIZE - PMD_SIZE;
  	return 1;
  }
  
@@@ -6879,11 -6693,9 +6886,11 @@@ follow_huge_pmd(struct mm_struct *mm, u
  	spinlock_t *ptl;
  	pte_t pte;
  
 -	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
 -	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
 -			 (FOLL_PIN | FOLL_GET)))
 +	/*
 +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
 +	 * follow_hugetlb_page().
 +	 */
 +	if (WARN_ON_ONCE(flags & FOLL_PIN))
  		return NULL;
  
  retry:
@@@ -6971,9 -6783,7 +6978,9 @@@ int get_hwpoison_huge_page(struct page 
  	spin_lock_irq(&hugetlb_lock);
  	if (PageHeadHuge(page)) {
  		*hugetlb = true;
 -		if (HPageFreed(page) || HPageMigratable(page))
 +		if (HPageFreed(page))
 +			ret = 0;
 +		else if (HPageMigratable(page))
  			ret = get_page_unless_zero(page);
  		else
  			ret = -EBUSY;
@@@ -7063,7 -6873,6 +7070,7 @@@ void hugetlb_unshare_all_pmds(struct vm
  	if (start >= end)
  		return;
  
 +	flush_cache_range(vma, start, end);
  	/*
  	 * No need to call adjust_range_if_pmd_sharing_possible(), because
  	 * we have already done the PUD_SIZE alignment.
@@@ -7149,7 -6958,7 +7156,7 @@@ void __init hugetlb_cma_reserve(int ord
  		if (hugetlb_cma_size_in_node[nid] == 0)
  			continue;
  
 -		if (!node_state(nid, N_ONLINE)) {
 +		if (!node_online(nid)) {
  			pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
  			hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
  			hugetlb_cma_size_in_node[nid] = 0;
@@@ -7188,7 -6997,7 +7195,7 @@@
  	}
  
  	reserved = 0;
 -	for_each_node_state(nid, N_ONLINE) {
 +	for_each_online_node(nid) {
  		int res;
  		char name[CMA_MAX_NAME];
  
diff --combined mm/page_alloc.c
index bc93a82,5ced6cb..149f2ab
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@@ -81,7 -81,6 +81,7 @@@
  #include "internal.h"
  #include "shuffle.h"
  #include "page_reporting.h"
 +#include "swap.h"
  
  /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
  typedef int __bitwise fpi_t;
@@@ -868,6 -867,40 +868,6 @@@ static inline void set_buddy_order(stru
  	__SetPageBuddy(page);
  }
  
 -/*
 - * This function checks whether a page is free && is the buddy
 - * we can coalesce a page and its buddy if
 - * (a) the buddy is not in a hole (check before calling!) &&
 - * (b) the buddy is in the buddy system &&
 - * (c) a page and its buddy have the same order &&
 - * (d) a page and its buddy are in the same zone.
 - *
 - * For recording whether a page is in the buddy system, we set PageBuddy.
 - * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
 - *
 - * For recording page's order, we use page_private(page).
 - */
 -static inline bool page_is_buddy(struct page *page, struct page *buddy,
 -							unsigned int order)
 -{
 -	if (!page_is_guard(buddy) && !PageBuddy(buddy))
 -		return false;
 -
 -	if (buddy_order(buddy) != order)
 -		return false;
 -
 -	/*
 -	 * zone check is done late to avoid uselessly calculating
 -	 * zone/node ids for pages that could never merge.
 -	 */
 -	if (page_zone_id(page) != page_zone_id(buddy))
 -		return false;
 -
 -	VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
 -
 -	return true;
 -}
 -
  #ifdef CONFIG_COMPACTION
  static inline struct capture_control *task_capc(struct zone *zone)
  {
@@@ -976,17 -1009,18 +976,17 @@@ static inline boo
  buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
  		   struct page *page, unsigned int order)
  {
 -	struct page *higher_page, *higher_buddy;
 -	unsigned long combined_pfn;
 +	unsigned long higher_page_pfn;
 +	struct page *higher_page;
  
  	if (order >= MAX_ORDER - 2)
  		return false;
  
 -	combined_pfn = buddy_pfn & pfn;
 -	higher_page = page + (combined_pfn - pfn);
 -	buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
 -	higher_buddy = higher_page + (buddy_pfn - combined_pfn);
 +	higher_page_pfn = buddy_pfn & pfn;
 +	higher_page = page + (higher_page_pfn - pfn);
  
 -	return page_is_buddy(higher_page, higher_buddy, order + 1);
 +	return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
 +			NULL) != NULL;
  }
  
  /*
@@@ -1019,6 -1053,7 +1019,6 @@@ static inline void __free_one_page(stru
  		int migratetype, fpi_t fpi_flags)
  {
  	struct capture_control *capc = task_capc(zone);
 -	unsigned int max_order = pageblock_order;
  	unsigned long buddy_pfn;
  	unsigned long combined_pfn;
  	struct page *buddy;
@@@ -1034,32 -1069,18 +1034,32 @@@
  	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
  	VM_BUG_ON_PAGE(bad_range(zone, page), page);
  
 -continue_merging:
 -	while (order < max_order) {
 +	while (order < MAX_ORDER - 1) {
  		if (compaction_capture(capc, page, order, migratetype)) {
  			__mod_zone_freepage_state(zone, -(1 << order),
  								migratetype);
  			return;
  		}
 -		buddy_pfn = __find_buddy_pfn(pfn, order);
 -		buddy = page + (buddy_pfn - pfn);
  
 -		if (!page_is_buddy(page, buddy, order))
 +		buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
 +		if (!buddy)
  			goto done_merging;
 +
 +		if (unlikely(order >= pageblock_order)) {
 +			/*
 +			 * We want to prevent merge between freepages on pageblock
 +			 * without fallbacks and normal pageblock. Without this,
 +			 * pageblock isolation could cause incorrect freepage or CMA
 +			 * accounting or HIGHATOMIC accounting.
 +			 */
 +			int buddy_mt = get_pageblock_migratetype(buddy);
 +
 +			if (migratetype != buddy_mt
 +					&& (!migratetype_is_mergeable(migratetype) ||
 +						!migratetype_is_mergeable(buddy_mt)))
 +				goto done_merging;
 +		}
 +
  		/*
  		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
  		 * merge with it and move up one order.
@@@ -1073,6 -1094,32 +1073,6 @@@
  		pfn = combined_pfn;
  		order++;
  	}
 -	if (order < MAX_ORDER - 1) {
 -		/* If we are here, it means order is >= pageblock_order.
 -		 * We want to prevent merge between freepages on pageblock
 -		 * without fallbacks and normal pageblock. Without this,
 -		 * pageblock isolation could cause incorrect freepage or CMA
 -		 * accounting or HIGHATOMIC accounting.
 -		 *
 -		 * We don't want to hit this code for the more frequent
 -		 * low-order merging.
 -		 */
 -		int buddy_mt;
 -
 -		buddy_pfn = __find_buddy_pfn(pfn, order);
 -		buddy = page + (buddy_pfn - pfn);
 -
 -		if (!page_is_buddy(page, buddy, order))
 -			goto done_merging;
 -		buddy_mt = get_pageblock_migratetype(buddy);
 -
 -		if (migratetype != buddy_mt
 -				&& (!migratetype_is_mergeable(migratetype) ||
 -					!migratetype_is_mergeable(buddy_mt)))
 -			goto done_merging;
 -		max_order = order + 1;
 -		goto continue_merging;
 -	}
  
  done_merging:
  	set_buddy_order(page, order);
@@@ -1094,48 -1141,6 +1094,48 @@@
  		page_reporting_notify_free(order);
  }
  
 +/**
 + * split_free_page() -- split a free page at split_pfn_offset
 + * @free_page:		the original free page
 + * @order:		the order of the page
 + * @split_pfn_offset:	split offset within the page
 + *
 + * It is used when the free page crosses two pageblocks with different migratetypes
 + * at split_pfn_offset within the page. The split free page will be put into
 + * separate migratetype lists afterwards. Otherwise, the function achieves
 + * nothing.
 + */
 +void split_free_page(struct page *free_page,
 +				int order, unsigned long split_pfn_offset)
 +{
 +	struct zone *zone = page_zone(free_page);
 +	unsigned long free_page_pfn = page_to_pfn(free_page);
 +	unsigned long pfn;
 +	unsigned long flags;
 +	int free_page_order;
 +
 +	if (split_pfn_offset == 0)
 +		return;
 +
 +	spin_lock_irqsave(&zone->lock, flags);
 +	del_page_from_free_list(free_page, zone, order);
 +	for (pfn = free_page_pfn;
 +	     pfn < free_page_pfn + (1UL << order);) {
 +		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
 +
 +		free_page_order = min_t(int,
 +					pfn ? __ffs(pfn) : order,
 +					__fls(split_pfn_offset));
 +		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
 +				mt, FPI_NONE);
 +		pfn += 1UL << free_page_order;
 +		split_pfn_offset -= (1UL << free_page_order);
 +		/* we have done the first part, now switch to second part */
 +		if (split_pfn_offset == 0)
 +			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
 +	}
 +	spin_unlock_irqrestore(&zone->lock, flags);
 +}
  /*
   * A bad page could be due to a number of fields. Instead of multiple branches,
   * try and check multiple fields with one check. The caller must do a detailed
@@@ -2471,9 -2476,6 +2471,9 @@@ struct page *__rmqueue_smallest(struct 
  		del_page_from_free_list(page, zone, current_order);
  		expand(zone, page, order, current_order, migratetype);
  		set_pcppage_migratetype(page, migratetype);
 +		trace_mm_page_alloc_zone_locked(page, order, migratetype,
 +				pcp_allowed_order(order) &&
 +				migratetype < MIGRATE_PCPTYPES);
  		return page;
  	}
  
@@@ -2997,7 -2999,7 +2997,7 @@@ __rmqueue(struct zone *zone, unsigned i
  		    zone_page_state(zone, NR_FREE_PAGES) / 2) {
  			page = __rmqueue_cma_fallback(zone, order);
  			if (page)
 -				goto out;
 +				return page;
  		}
  	}
  retry:
@@@ -3010,6 -3012,9 +3010,6 @@@
  								alloc_flags))
  			goto retry;
  	}
 -out:
 -	if (page)
 -		trace_mm_page_alloc_zone_locked(page, order, migratetype);
  	return page;
  }
  
@@@ -3728,8 -3733,11 +3728,8 @@@ struct page *rmqueue(struct zone *prefe
  		 * reserved for high-order atomic allocation, so order-0
  		 * request should skip it.
  		 */
 -		if (order > 0 && alloc_flags & ALLOC_HARDER) {
 +		if (order > 0 && alloc_flags & ALLOC_HARDER)
  			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
 -			if (page)
 -				trace_mm_page_alloc_zone_locked(page, order, migratetype);
 -		}
  		if (!page) {
  			page = __rmqueue(zone, order, migratetype, alloc_flags);
  			if (!page)
@@@ -3791,9 -3799,6 +3791,9 @@@ static bool __should_fail_alloc_page(gf
  			(gfp_mask & __GFP_DIRECT_RECLAIM))
  		return false;
  
 +	if (gfp_mask & __GFP_NOWARN)
 +		fail_page_alloc.attr.no_warn = true;
 +
  	return should_fail(&fail_page_alloc.attr, 1 << order);
  }
  
@@@ -4063,8 -4068,7 +4063,8 @@@ get_page_from_freelist(gfp_t gfp_mask, 
  {
  	struct zoneref *z;
  	struct zone *zone;
 -	struct pglist_data *last_pgdat_dirty_limit = NULL;
 +	struct pglist_data *last_pgdat = NULL;
 +	bool last_pgdat_dirty_ok = false;
  	bool no_fallback;
  
  retry:
@@@ -4103,13 -4107,13 +4103,13 @@@
  		 * dirty-throttling and the flusher threads.
  		 */
  		if (ac->spread_dirty_pages) {
 -			if (last_pgdat_dirty_limit == zone->zone_pgdat)
 -				continue;
 +			if (last_pgdat != zone->zone_pgdat) {
 +				last_pgdat = zone->zone_pgdat;
 +				last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
 +			}
  
 -			if (!node_dirty_ok(zone->zone_pgdat)) {
 -				last_pgdat_dirty_limit = zone->zone_pgdat;
 +			if (!last_pgdat_dirty_ok)
  				continue;
 -			}
  		}
  
  		if (no_fallback && nr_online_nodes > 1 &&
@@@ -4342,8 -4346,7 +4342,8 @@@ __alloc_pages_may_oom(gfp_t gfp_mask, u
  	 */
  
  	/* Exhausted what can be done so it's blame time */
 -	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
 +	if (out_of_memory(&oc) ||
 +	    WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
  		*did_some_progress = 1;
  
  		/*
@@@ -4674,12 -4677,9 +4674,12 @@@ static void wake_all_kswapds(unsigned i
  
  	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
  					ac->nodemask) {
 -		if (last_pgdat != zone->zone_pgdat)
 +		if (!managed_zone(zone))
 +			continue;
 +		if (last_pgdat != zone->zone_pgdat) {
  			wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
 -		last_pgdat = zone->zone_pgdat;
 +			last_pgdat = zone->zone_pgdat;
 +		}
  	}
  }
  
@@@ -5117,7 -5117,7 +5117,7 @@@ nopage
  		 * All existing users of the __GFP_NOFAIL are blockable, so warn
  		 * of any new users that actually require GFP_NOWAIT
  		 */
 -		if (WARN_ON_ONCE(!can_direct_reclaim))
 +		if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
  			goto fail;
  
  		/*
@@@ -5125,7 -5125,7 +5125,7 @@@
  		 * because we cannot reclaim anything and only can loop waiting
  		 * for somebody to do a work for us
  		 */
 -		WARN_ON_ONCE(current->flags & PF_MEMALLOC);
 +		WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
  
  		/*
  		 * non failing costly orders are a hard requirement which we
@@@ -5133,7 -5133,7 +5133,7 @@@
  		 * so that we can identify them and convert them to something
  		 * else.
  		 */
 -		WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
 +		WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask);
  
  		/*
  		 * Help non-failing allocations by giving them access to memory
@@@ -5324,8 -5324,8 +5324,8 @@@ unsigned long __alloc_pages_bulk(gfp_t 
  		page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
  								pcp, pcp_list);
  		if (unlikely(!page)) {
- 			/* Try and get at least one page */
- 			if (!nr_populated)
+ 			/* Try and allocate at least one page */
+ 			if (!nr_account)
  				goto failed_irq;
  			break;
  		}
@@@ -5379,8 -5379,10 +5379,8 @@@ struct page *__alloc_pages(gfp_t gfp, u
  	 * There are several places where we assume that the order value is sane
  	 * so bail out early if the request is out of bound.
  	 */
 -	if (unlikely(order >= MAX_ORDER)) {
 -		WARN_ON_ONCE(!(gfp & __GFP_NOWARN));
 +	if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp))
  		return NULL;
 -	}
  
  	gfp &= gfp_allowed_mask;
  	/*
@@@ -6169,6 -6171,7 +6169,6 @@@ int numa_zonelist_order_handler(struct 
  }
  
  
 -#define MAX_NODE_LOAD (nr_online_nodes)
  static int node_load[MAX_NUMNODES];
  
  /**
@@@ -6215,7 -6218,7 +6215,7 @@@ int find_next_best_node(int node, nodem
  			val += PENALTY_FOR_NODE_WITH_CPUS;
  
  		/* Slight preference for less loaded node */
 -		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
 +		val *= MAX_NUMNODES;
  		val += node_load[n];
  
  		if (val < min_val) {
@@@ -6281,12 -6284,13 +6281,12 @@@ static void build_thisnode_zonelists(pg
  static void build_zonelists(pg_data_t *pgdat)
  {
  	static int node_order[MAX_NUMNODES];
 -	int node, load, nr_nodes = 0;
 +	int node, nr_nodes = 0;
  	nodemask_t used_mask = NODE_MASK_NONE;
  	int local_node, prev_node;
  
  	/* NUMA-aware ordering of nodes */
  	local_node = pgdat->node_id;
 -	load = nr_online_nodes;
  	prev_node = local_node;
  
  	memset(node_order, 0, sizeof(node_order));
@@@ -6298,10 -6302,11 +6298,10 @@@
  		 */
  		if (node_distance(local_node, node) !=
  		    node_distance(local_node, prev_node))
 -			node_load[node] += load;
 +			node_load[node] += 1;
  
  		node_order[nr_nodes++] = node;
  		prev_node = node;
 -		load--;
  	}
  
  	build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
@@@ -6640,21 -6645,6 +6640,21 @@@ static void __ref __init_zone_device_pa
  	}
  }
  
 +/*
 + * With compound page geometry and when struct pages are stored in ram most
 + * tail pages are reused. Consequently, the amount of unique struct pages to
 + * initialize is a lot smaller than the total amount of struct pages being
 + * mapped. This is a paired / mild layering violation with explicit knowledge
 + * of how the sparse_vmemmap internals handle compound pages in the absence
 + * of an altmap. See vmemmap_populate_compound_pages().
 + */
 +static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
 +					      unsigned long nr_pages)
 +{
 +	return is_power_of_2(sizeof(struct page)) &&
 +		!altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
 +}
 +
  static void __ref memmap_init_compound(struct page *head,
  				       unsigned long head_pfn,
  				       unsigned long zone_idx, int nid,
@@@ -6719,7 -6709,7 +6719,7 @@@ void __ref memmap_init_zone_device(stru
  			continue;
  
  		memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
 -				     pfns_per_compound);
 +				     compound_nr_pages(altmap, pfns_per_compound));
  	}
  
  	pr_info("%s initialised %lu pages in %ums\n", __func__,
@@@ -7880,7 -7870,7 +7880,7 @@@ static void __init find_zone_movable_pf
  
  			usable_startpfn = memblock_region_memory_base_pfn(r);
  
 -			if (usable_startpfn < 0x100000) {
 +			if (usable_startpfn < PHYS_PFN(SZ_4G)) {
  				mem_below_4gb_not_mirrored = true;
  				continue;
  			}
@@@ -8959,7 -8949,136 +8959,7 @@@ void *__init alloc_large_system_hash(co
  	return table;
  }
  
 -/*
 - * This function checks whether pageblock includes unmovable pages or not.
 - *
 - * PageLRU check without isolation or lru_lock could race so that
 - * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
 - * check without lock_page also may miss some movable non-lru pages at
 - * race condition. So you can't expect this function should be exact.
 - *
 - * Returns a page without holding a reference. If the caller wants to
 - * dereference that page (e.g., dumping), it has to make sure that it
 - * cannot get removed (e.g., via memory unplug) concurrently.
 - *
 - */
 -struct page *has_unmovable_pages(struct zone *zone, struct page *page,
 -				 int migratetype, int flags)
 -{
 -	unsigned long iter = 0;
 -	unsigned long pfn = page_to_pfn(page);
 -	unsigned long offset = pfn % pageblock_nr_pages;
 -
 -	if (is_migrate_cma_page(page)) {
 -		/*
 -		 * CMA allocations (alloc_contig_range) really need to mark
 -		 * isolate CMA pageblocks even when they are not movable in fact
 -		 * so consider them movable here.
 -		 */
 -		if (is_migrate_cma(migratetype))
 -			return NULL;
 -
 -		return page;
 -	}
 -
 -	for (; iter < pageblock_nr_pages - offset; iter++) {
 -		page = pfn_to_page(pfn + iter);
 -
 -		/*
 -		 * Both, bootmem allocations and memory holes are marked
 -		 * PG_reserved and are unmovable. We can even have unmovable
 -		 * allocations inside ZONE_MOVABLE, for example when
 -		 * specifying "movablecore".
 -		 */
 -		if (PageReserved(page))
 -			return page;
 -
 -		/*
 -		 * If the zone is movable and we have ruled out all reserved
 -		 * pages then it should be reasonably safe to assume the rest
 -		 * is movable.
 -		 */
 -		if (zone_idx(zone) == ZONE_MOVABLE)
 -			continue;
 -
 -		/*
 -		 * Hugepages are not in LRU lists, but they're movable.
 -		 * THPs are on the LRU, but need to be counted as #small pages.
 -		 * We need not scan over tail pages because we don't
 -		 * handle each tail page individually in migration.
 -		 */
 -		if (PageHuge(page) || PageTransCompound(page)) {
 -			struct page *head = compound_head(page);
 -			unsigned int skip_pages;
 -
 -			if (PageHuge(page)) {
 -				if (!hugepage_migration_supported(page_hstate(head)))
 -					return page;
 -			} else if (!PageLRU(head) && !__PageMovable(head)) {
 -				return page;
 -			}
 -
 -			skip_pages = compound_nr(head) - (page - head);
 -			iter += skip_pages - 1;
 -			continue;
 -		}
 -
 -		/*
 -		 * We can't use page_count without pin a page
 -		 * because another CPU can free compound page.
 -		 * This check already skips compound tails of THP
 -		 * because their page->_refcount is zero at all time.
 -		 */
 -		if (!page_ref_count(page)) {
 -			if (PageBuddy(page))
 -				iter += (1 << buddy_order(page)) - 1;
 -			continue;
 -		}
 -
 -		/*
 -		 * The HWPoisoned page may be not in buddy system, and
 -		 * page_count() is not 0.
 -		 */
 -		if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
 -			continue;
 -
 -		/*
 -		 * We treat all PageOffline() pages as movable when offlining
 -		 * to give drivers a chance to decrement their reference count
 -		 * in MEM_GOING_OFFLINE in order to indicate that these pages
 -		 * can be offlined as there are no direct references anymore.
 -		 * For actually unmovable PageOffline() where the driver does
 -		 * not support this, we will fail later when trying to actually
 -		 * move these pages that still have a reference count > 0.
 -		 * (false negatives in this function only)
 -		 */
 -		if ((flags & MEMORY_OFFLINE) && PageOffline(page))
 -			continue;
 -
 -		if (__PageMovable(page) || PageLRU(page))
 -			continue;
 -
 -		/*
 -		 * If there are RECLAIMABLE pages, we need to check
 -		 * it.  But now, memory offline itself doesn't call
 -		 * shrink_node_slabs() and it still to be fixed.
 -		 */
 -		return page;
 -	}
 -	return NULL;
 -}
 -
  #ifdef CONFIG_CONTIG_ALLOC
 -static unsigned long pfn_max_align_down(unsigned long pfn)
 -{
 -	return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
 -}
 -
 -static unsigned long pfn_max_align_up(unsigned long pfn)
 -{
 -	return ALIGN(pfn, MAX_ORDER_NR_PAGES);
 -}
 -
  #if defined(CONFIG_DYNAMIC_DEBUG) || \
  	(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
  /* Usage: See admin-guide/dynamic-debug-howto.rst */
@@@ -8982,7 -9101,7 +8982,7 @@@ static inline void alloc_contig_dump_pa
  #endif
  
  /* [start, end) must belong to a single zone. */
 -static int __alloc_contig_migrate_range(struct compact_control *cc,
 +int __alloc_contig_migrate_range(struct compact_control *cc,
  					unsigned long start, unsigned long end)
  {
  	/* This function is based on compact_zone() from compaction.c. */
@@@ -9032,7 -9151,7 +9032,7 @@@
  
  	lru_cache_enable();
  	if (ret < 0) {
 -		if (ret == -EBUSY)
 +		if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
  			alloc_contig_dump_pages(&cc->migratepages);
  		putback_movable_pages(&cc->migratepages);
  		return ret;
@@@ -9050,8 -9169,8 +9050,8 @@@
   *			be either of the two.
   * @gfp_mask:	GFP mask to use during compaction
   *
 - * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
 - * aligned.  The PFN range must belong to a single zone.
 + * The PFN range does not have to be pageblock aligned. The PFN range must
 + * belong to a single zone.
   *
   * The first thing this routine does is attempt to MIGRATE_ISOLATE all
   * pageblocks in the range.  Once isolated, the pageblocks should not
@@@ -9065,7 -9184,7 +9065,7 @@@ int alloc_contig_range(unsigned long st
  		       unsigned migratetype, gfp_t gfp_mask)
  {
  	unsigned long outer_start, outer_end;
 -	unsigned int order;
 +	int order;
  	int ret = 0;
  
  	struct compact_control cc = {
@@@ -9084,11 -9203,14 +9084,11 @@@
  	 * What we do here is we mark all pageblocks in range as
  	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
  	 * have different sizes, and due to the way page allocator
 -	 * work, we align the range to biggest of the two pages so
 -	 * that page allocator won't try to merge buddies from
 -	 * different pageblocks and change MIGRATE_ISOLATE to some
 -	 * other migration type.
 +	 * work, start_isolate_page_range() has special handling for this.
  	 *
  	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
  	 * migrate the pages from an unaligned range (ie. pages that
 -	 * we are interested in).  This will put all the pages in
 +	 * we are interested in). This will put all the pages in
  	 * range back to page allocator as MIGRATE_ISOLATE.
  	 *
  	 * When this is done, we take the pages in range from page
@@@ -9101,9 -9223,10 +9101,9 @@@
  	 * put back to page allocator so that buddy can use them.
  	 */
  
 -	ret = start_isolate_page_range(pfn_max_align_down(start),
 -				       pfn_max_align_up(end), migratetype, 0);
 +	ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
  	if (ret)
 -		return ret;
 +		goto done;
  
  	drain_all_pages(cc.zone);
  
@@@ -9123,7 -9246,7 +9123,7 @@@
  	ret = 0;
  
  	/*
 -	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
 +	 * Pages from [start, end) are within a pageblock_nr_pages
  	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
  	 * more, all pages in [start, end) are free in page allocator.
  	 * What we are going to do is to allocate all pages from
@@@ -9182,7 -9305,8 +9182,7 @@@
  		free_contig_range(end, outer_end - end);
  
  done:
 -	undo_isolate_page_range(pfn_max_align_down(start),
 -				pfn_max_align_up(end), migratetype);
 +	undo_isolate_page_range(start, end, migratetype);
  	return ret;
  }
  EXPORT_SYMBOL(alloc_contig_range);
@@@ -9501,6 -9625,7 +9501,6 @@@ bool put_page_back_buddy(struct page *p
  		ClearPageHWPoisonTakenOff(page);
  		__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
  		if (TestClearPageHWPoison(page)) {
 -			num_poisoned_pages_dec();
  			ret = true;
  		}
  	}
diff --combined mm/page_table_check.c
index 3692bea,bc55be2..e206274
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@@ -52,6 -52,23 +52,6 @@@ static struct page_table_check *get_pag
  	return (void *)(page_ext) + page_table_check_ops.offset;
  }
  
 -static inline bool pte_user_accessible_page(pte_t pte)
 -{
 -	return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER);
 -}
 -
 -static inline bool pmd_user_accessible_page(pmd_t pmd)
 -{
 -	return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) &&
 -		(pmd_val(pmd) & _PAGE_USER);
 -}
 -
 -static inline bool pud_user_accessible_page(pud_t pud)
 -{
 -	return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) &&
 -		(pud_val(pud) & _PAGE_USER);
 -}
 -
  /*
   * An enty is removed from the page table, decrement the counters for that page
   * verify that it is of correct type and counters do not become negative.
@@@ -160,7 -177,7 +160,7 @@@ void __page_table_check_pmd_clear(struc
  
  	if (pmd_user_accessible_page(pmd)) {
  		page_table_check_clear(mm, addr, pmd_pfn(pmd),
 -				       PMD_PAGE_SIZE >> PAGE_SHIFT);
 +				       PMD_SIZE >> PAGE_SHIFT);
  	}
  }
  EXPORT_SYMBOL(__page_table_check_pmd_clear);
@@@ -173,7 -190,7 +173,7 @@@ void __page_table_check_pud_clear(struc
  
  	if (pud_user_accessible_page(pud)) {
  		page_table_check_clear(mm, addr, pud_pfn(pud),
 -				       PUD_PAGE_SIZE >> PAGE_SHIFT);
 +				       PUD_SIZE >> PAGE_SHIFT);
  	}
  }
  EXPORT_SYMBOL(__page_table_check_pud_clear);
@@@ -202,7 -219,7 +202,7 @@@ void __page_table_check_pmd_set(struct 
  	__page_table_check_pmd_clear(mm, addr, *pmdp);
  	if (pmd_user_accessible_page(pmd)) {
  		page_table_check_set(mm, addr, pmd_pfn(pmd),
 -				     PMD_PAGE_SIZE >> PAGE_SHIFT,
 +				     PMD_SIZE >> PAGE_SHIFT,
  				     pmd_write(pmd));
  	}
  }
@@@ -217,7 -234,7 +217,7 @@@ void __page_table_check_pud_set(struct 
  	__page_table_check_pud_clear(mm, addr, *pudp);
  	if (pud_user_accessible_page(pud)) {
  		page_table_check_set(mm, addr, pud_pfn(pud),
 -				     PUD_PAGE_SIZE >> PAGE_SHIFT,
 +				     PUD_SIZE >> PAGE_SHIFT,
  				     pud_write(pud));
  	}
  }
@@@ -234,11 -251,11 +234,11 @@@ void __page_table_check_pte_clear_range
  		pte_t *ptep = pte_offset_map(&pmd, addr);
  		unsigned long i;
  
- 		pte_unmap(ptep);
  		for (i = 0; i < PTRS_PER_PTE; i++) {
  			__page_table_check_pte_clear(mm, addr, *ptep);
  			addr += PAGE_SIZE;
  			ptep++;
  		}
+ 		pte_unmap(ptep - PTRS_PER_PTE);
  	}
  }