From: Linus Torvalds Date: Fri, 27 May 2022 18:29:35 +0000 (-0700) Subject: Merge tag 'mm-hotfixes-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kerne... X-Git-Tag: v6.6.17~7420 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=77fb622de1393b1d54f24f4f7ed98f84feeda502;hp=-c;p=platform%2Fkernel%2Flinux-rpi.git Merge tag 'mm-hotfixes-stable-2022-05-27' of git://git./linux/kernel/git/akpm/mm Pull hotfixes from Andrew Morton: "Six hotfixes. The page_table_check one from Miaohe Lin is considered a minor thing so it isn't marked for -stable. The remainder address pre-5.19 issues and are cc:stable" * tag 'mm-hotfixes-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: mm/page_table_check: fix accessing unmapped ptep kexec_file: drop weak attribute from arch_kexec_apply_relocations[_add] mm/page_alloc: always attempt to allocate at least one page during bulk allocation hugetlb: fix huge_pmd_unshare address update zsmalloc: fix races between asynchronous zspage free and page migration Revert "mm/cma.c: remove redundant cma_mutex lock" --- 77fb622de1393b1d54f24f4f7ed98f84feeda502 diff --combined mm/hugetlb.c index 01f0e2e,410bbb0..7c468ac --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@@ -370,7 -370,7 +370,7 @@@ static void coalesce_file_region(struc } static inline long -hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from, +hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from, long to, struct hstate *h, struct hugetlb_cgroup *cg, long *regions_needed) { @@@ -379,7 -379,7 +379,7 @@@ if (!regions_needed) { nrg = get_file_region_entry_from_cache(map, from, to); record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); - list_add(&nrg->link, rg->link.prev); + list_add(&nrg->link, rg); coalesce_file_region(map, nrg); } else *regions_needed += 1; @@@ -402,52 -402,47 +402,52 @@@ static long add_reservation_in_range(st long add = 0; struct list_head *head = &resv->regions; long last_accounted_offset = f; - struct file_region *rg = NULL, *trg = NULL; + struct file_region *iter, *trg = NULL; + struct list_head *rg = NULL; if (regions_needed) *regions_needed = 0; /* In this loop, we essentially handle an entry for the range - * [last_accounted_offset, rg->from), at every iteration, with some + * [last_accounted_offset, iter->from), at every iteration, with some * bounds checking. */ - list_for_each_entry_safe(rg, trg, head, link) { + list_for_each_entry_safe(iter, trg, head, link) { /* Skip irrelevant regions that start before our range. */ - if (rg->from < f) { + if (iter->from < f) { /* If this region ends after the last accounted offset, * then we need to update last_accounted_offset. */ - if (rg->to > last_accounted_offset) - last_accounted_offset = rg->to; + if (iter->to > last_accounted_offset) + last_accounted_offset = iter->to; continue; } /* When we find a region that starts beyond our range, we've * finished. */ - if (rg->from >= t) + if (iter->from >= t) { + rg = iter->link.prev; break; + } - /* Add an entry for last_accounted_offset -> rg->from, and + /* Add an entry for last_accounted_offset -> iter->from, and * update last_accounted_offset. */ - if (rg->from > last_accounted_offset) - add += hugetlb_resv_map_add(resv, rg, + if (iter->from > last_accounted_offset) + add += hugetlb_resv_map_add(resv, iter->link.prev, last_accounted_offset, - rg->from, h, h_cg, + iter->from, h, h_cg, regions_needed); - last_accounted_offset = rg->to; + last_accounted_offset = iter->to; } /* Handle the case where our range extends beyond * last_accounted_offset. */ + if (!rg) + rg = head->prev; if (last_accounted_offset < t) add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, t, h, h_cg, regions_needed); @@@ -1540,7 -1535,7 +1540,7 @@@ static void __update_and_free_page(stru if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - if (alloc_huge_page_vmemmap(h, page)) { + if (hugetlb_vmemmap_alloc(h, page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@@ -1617,7 -1612,7 +1617,7 @@@ static DECLARE_WORK(free_hpage_work, fr static inline void flush_free_hpage_work(struct hstate *h) { - if (free_vmemmap_pages_per_hpage(h)) + if (hugetlb_optimize_vmemmap_pages(h)) flush_work(&free_hpage_work); } @@@ -1677,8 -1672,6 +1677,8 @@@ void free_huge_page(struct page *page VM_BUG_ON_PAGE(page_mapcount(page), page); hugetlb_set_page_subpool(page, NULL); + if (PageAnon(page)) + __ClearPageAnonExclusive(page); page->mapping = NULL; restore_reserve = HPageRestoreReserve(page); ClearHPageRestoreReserve(page); @@@ -1739,7 -1732,7 +1739,7 @@@ static void __prep_account_new_huge_pag static void __prep_new_huge_page(struct hstate *h, struct page *page) { - free_huge_page_vmemmap(h, page); + hugetlb_vmemmap_free(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); @@@ -2112,7 -2105,7 +2112,7 @@@ retry * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. */ - rc = alloc_huge_page_vmemmap(h, head); + rc = hugetlb_vmemmap_alloc(h, head); if (!rc) { /* * Move PageHWPoison flag from head page to the raw @@@ -2986,6 -2979,8 +2986,6 @@@ int __alloc_bootmem_huge_page(struct hs struct huge_bootmem_page *m = NULL; /* initialize for clang */ int nr_nodes, node; - if (nid != NUMA_NO_NODE && nid >= nr_online_nodes) - return 0; /* do node specific alloc */ if (nid != NUMA_NO_NODE) { m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), @@@ -3093,7 -3088,7 +3093,7 @@@ static void __init hugetlb_hstate_alloc } /* do node specific alloc */ - for (i = 0; i < nr_online_nodes; i++) { + for_each_online_node(i) { if (h->max_huge_pages_node[i] > 0) { hugetlb_hstate_alloc_pages_onenode(h, i); node_specific_alloc = true; @@@ -3425,7 -3420,7 +3425,7 @@@ static int demote_free_huge_page(struc remove_hugetlb_page_for_demote(h, page, false); spin_unlock_irq(&hugetlb_lock); - rc = alloc_huge_page_vmemmap(h, page); + rc = hugetlb_vmemmap_alloc(h, page); if (rc) { /* Allocation of vmemmmap failed, we can not demote page */ spin_lock_irq(&hugetlb_lock); @@@ -4057,7 -4052,7 +4057,7 @@@ static int __init hugetlb_init(void default_hstate.max_huge_pages = default_hstate_max_huge_pages; - for (i = 0; i < nr_online_nodes; i++) + for_each_online_node(i) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; } @@@ -4124,20 -4119,6 +4124,20 @@@ bool __init __weak hugetlb_node_alloc_s { return true; } + +static void __init hugepages_clear_pages_in_node(void) +{ + if (!hugetlb_max_hstate) { + default_hstate_max_huge_pages = 0; + memset(default_hugepages_in_node, 0, + MAX_NUMNODES * sizeof(unsigned int)); + } else { + parsed_hstate->max_huge_pages = 0; + memset(parsed_hstate->max_huge_pages_node, 0, + MAX_NUMNODES * sizeof(unsigned int)); + } +} + /* * hugepages command line processing * hugepages normally follows a valid hugepagsz or default_hugepagsz @@@ -4157,7 -4138,7 +4157,7 @@@ static int __init hugepages_setup(char if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; - return 0; + return 1; } /* @@@ -4173,7 -4154,7 +4173,7 @@@ if (mhp == last_mhp) { pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); - return 0; + return 1; } while (*p) { @@@ -4184,11 -4165,11 +4184,11 @@@ if (p[count] == ':') { if (!hugetlb_node_alloc_supported()) { pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); - return 0; + return 1; } - if (tmp >= nr_online_nodes) + if (tmp >= MAX_NUMNODES || !node_online(tmp)) goto invalid; - node = array_index_nospec(tmp, nr_online_nodes); + node = array_index_nospec(tmp, MAX_NUMNODES); p += count + 1; /* Parse hugepages */ if (sscanf(p, "%lu%n", &tmp, &count) != 1) @@@ -4225,8 -4206,7 +4225,8 @@@ invalid: pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); - return 0; + hugepages_clear_pages_in_node(); + return 1; } __setup("hugepages=", hugepages_setup); @@@ -4247,7 -4227,7 +4247,7 @@@ static int __init hugepagesz_setup(cha if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); - return 0; + return 1; } h = size_to_hstate(size); @@@ -4262,7 -4242,7 +4262,7 @@@ if (!parsed_default_hugepagesz || h != &default_hstate || default_hstate.max_huge_pages) { pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); - return 0; + return 1; } /* @@@ -4293,14 -4273,14 +4293,14 @@@ static int __init default_hugepagesz_se parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); - return 0; + return 1; } size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); - return 0; + return 1; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); @@@ -4317,7 -4297,7 +4317,7 @@@ */ if (default_hstate_max_huge_pages) { default_hstate.max_huge_pages = default_hstate_max_huge_pages; - for (i = 0; i < nr_online_nodes; i++) + for_each_online_node(i) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; if (hstate_is_gigantic(&default_hstate)) @@@ -4719,27 -4699,24 +4719,27 @@@ hugetlb_install_page(struct vm_area_str } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { pte_t *src_pte, *dst_pte, entry, dst_entry; struct page *ptepage; unsigned long addr; - bool cow = is_cow_mapping(vma->vm_flags); - struct hstate *h = hstate_vma(vma); + bool cow = is_cow_mapping(src_vma->vm_flags); + struct hstate *h = hstate_vma(src_vma); unsigned long sz = huge_page_size(h); unsigned long npages = pages_per_huge_page(h); - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = src_vma->vm_file->f_mapping; struct mmu_notifier_range range; int ret = 0; if (cow) { - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, - vma->vm_start, - vma->vm_end); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src, + src_vma->vm_start, + src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); + mmap_assert_write_locked(src); + raw_write_seqcount_begin(&src->write_protect_seq); } else { /* * For shared mappings i_mmap_rwsem must be held to call @@@ -4750,12 -4727,12 +4750,12 @@@ i_mmap_lock_read(mapping); } - for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { + for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, vma, addr, sz); + dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; @@@ -4790,9 -4767,8 +4790,9 @@@ again } else if (unlikely(is_hugetlb_entry_migration(entry) || is_hugetlb_entry_hwpoisoned(entry))) { swp_entry_t swp_entry = pte_to_swp_entry(entry); + bool uffd_wp = huge_pte_uffd_wp(entry); - if (is_writable_migration_entry(swp_entry) && cow) { + if (!is_readable_migration_entry(swp_entry) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. @@@ -4800,53 -4776,38 +4800,53 @@@ swp_entry = make_readable_migration_entry( swp_offset(swp_entry)); entry = swp_entry_to_pte(swp_entry); + if (userfaultfd_wp(src_vma) && uffd_wp) + entry = huge_pte_mkuffd_wp(entry); set_huge_swap_pte_at(src, addr, src_pte, entry, sz); } + if (!userfaultfd_wp(dst_vma) && uffd_wp) + entry = huge_pte_clear_uffd_wp(entry); set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); + } else if (unlikely(is_pte_marker(entry))) { + /* + * We copy the pte marker only if the dst vma has + * uffd-wp enabled. + */ + if (userfaultfd_wp(dst_vma)) + set_huge_pte_at(dst, addr, dst_pte, entry); } else { entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); /* - * This is a rare case where we see pinned hugetlb - * pages while they're prone to COW. We need to do the - * COW earlier during fork. + * Failing to duplicate the anon rmap is a rare case + * where we see pinned hugetlb pages while they're + * prone to COW. We need to do the COW earlier during + * fork. * * When pre-allocating the page or copying data, we * need to be without the pgtable locks since we could * sleep during the process. */ - if (unlikely(page_needs_cow_for_dma(vma, ptepage))) { + if (!PageAnon(ptepage)) { + page_dup_file_rmap(ptepage, true); + } else if (page_try_dup_anon_rmap(ptepage, true, + src_vma)) { pte_t src_pte_old = entry; struct page *new; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new = alloc_huge_page(vma, addr, 1); + new = alloc_huge_page(dst_vma, addr, 1); if (IS_ERR(new)) { put_page(ptepage); ret = PTR_ERR(new); break; } - copy_user_huge_page(new, ptepage, addr, vma, + copy_user_huge_page(new, ptepage, addr, dst_vma, npages); put_page(ptepage); @@@ -4856,13 -4817,13 +4856,13 @@@ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { - restore_reserve_on_error(h, vma, addr, + restore_reserve_on_error(h, dst_vma, addr, new); put_page(new); /* dst_entry won't change as in child */ goto again; } - hugetlb_install_page(vma, dst_pte, addr, new); + hugetlb_install_page(dst_vma, dst_pte, addr, new); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; @@@ -4880,6 -4841,7 +4880,6 @@@ entry = huge_pte_wrprotect(entry); } - page_dup_rmap(ptepage, true); set_huge_pte_at(dst, addr, dst_pte, entry); hugetlb_count_add(npages, dst); } @@@ -4887,12 -4849,10 +4887,12 @@@ spin_unlock(dst_ptl); } - if (cow) + if (cow) { + raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); - else + } else { i_mmap_unlock_read(mapping); + } return ret; } @@@ -4936,17 -4896,10 +4936,17 @@@ int move_hugetlb_page_tables(struct vm_ unsigned long old_addr_copy; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; + bool shared_pmd = false; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr, old_end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + /* + * In case of shared PMDs, we should cover the maximum possible + * range. + */ + flush_cache_range(vma, range.start, range.end); + mmu_notifier_invalidate_range_start(&range); /* Prevent race with file truncation */ i_mmap_lock_write(mapping); @@@ -4963,10 -4916,8 +4963,10 @@@ */ old_addr_copy = old_addr; - if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) + if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) { + shared_pmd = true; continue; + } dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); if (!dst_pte) @@@ -4974,11 -4925,7 +4974,11 @@@ move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte); } - flush_tlb_range(vma, old_end - len, old_end); + + if (shared_pmd) + flush_tlb_range(vma, range.start, range.end); + else + flush_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); @@@ -4987,7 -4934,7 +4987,7 @@@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct page *ref_page) + struct page *ref_page, zap_flags_t zap_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@@ -5043,18 -4990,7 +5043,18 @@@ * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { - huge_pte_clear(mm, address, ptep, sz); + /* + * If the pte was wr-protected by uffd-wp in any of the + * swap forms, meanwhile the caller does not want to + * drop the uffd-wp bit in this zap, then replace the + * pte with a marker. + */ + if (pte_swp_uffd_wp_any(pte) && + !(zap_flags & ZAP_FLAG_DROP_MARKER)) + set_huge_pte_at(mm, address, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP)); + else + huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; } @@@ -5082,11 -5018,7 +5082,11 @@@ tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); - + /* Leave a uffd-wp pte marker if needed */ + if (huge_pte_uffd_wp(pte) && + !(zap_flags & ZAP_FLAG_DROP_MARKER)) + set_huge_pte_at(mm, address, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP)); hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page, vma, true); @@@ -5120,10 -5052,9 +5120,10 @@@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page) + unsigned long end, struct page *ref_page, + zap_flags_t zap_flags) { - __unmap_hugepage_range(tlb, vma, start, end, ref_page); + __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags); /* * Clear this flag so that x86's huge_pmd_share page_table_shareable @@@ -5139,13 -5070,12 +5139,13 @@@ } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page) + unsigned long end, struct page *ref_page, + zap_flags_t zap_flags) { struct mmu_gather tlb; tlb_gather_mmu(&tlb, vma->vm_mm); - __unmap_hugepage_range(&tlb, vma, start, end, ref_page); + __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); tlb_finish_mmu(&tlb); } @@@ -5200,22 -5130,21 +5200,22 @@@ static void unmap_ref_private(struct mm */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, address, - address + huge_page_size(h), page); + address + huge_page_size(h), page, 0); } i_mmap_unlock_write(mapping); } /* - * Hugetlb_cow() should be called with page lock of the original hugepage held. + * hugetlb_wp() should be called with page lock of the original hugepage held. * Called with hugetlb_fault_mutex_table held and pte_page locked so we * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ -static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, +static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int flags, struct page *pagecache_page, spinlock_t *ptl) { + const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte; struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; @@@ -5224,26 -5153,17 +5224,26 @@@ unsigned long haddr = address & huge_page_mask(h); struct mmu_notifier_range range; + VM_BUG_ON(unshare && (flags & FOLL_WRITE)); + VM_BUG_ON(!unshare && !(flags & FOLL_WRITE)); + pte = huge_ptep_get(ptep); old_page = pte_page(pte); retry_avoidcopy: - /* If no-one else is actually using this page, avoid the copy - * and just make the page writable */ + /* + * If no-one else is actually using this page, we're the exclusive + * owner and can reuse this page. + */ if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { - page_move_anon_rmap(old_page, vma); - set_huge_ptep_writable(vma, haddr, ptep); + if (!PageAnonExclusive(old_page)) + page_move_anon_rmap(old_page, vma); + if (likely(!unshare)) + set_huge_ptep_writable(vma, haddr, ptep); return 0; } + VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page), + old_page); /* * If the process that created a MAP_PRIVATE mapping is about to @@@ -5342,13 -5262,13 +5342,13 @@@ if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearHPageRestoreReserve(new_page); - /* Break COW */ + /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); page_remove_rmap(old_page, vma, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, new_page, 1)); + make_huge_pte(vma, new_page, !unshare)); SetHPageMigratable(new_page); /* Make the old page be freed below */ new_page = old_page; @@@ -5356,10 -5276,7 +5356,10 @@@ spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: - /* No restore in case of successful pagetable update (Break COW) */ + /* + * No restore in case of successful pagetable update (Break COW or + * unshare) + */ if (new_page != old_page) restore_reserve_on_error(h, vma, haddr, new_page); put_page(new_page); @@@ -5469,8 -5386,7 +5469,8 @@@ static inline vm_fault_t hugetlb_handle static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, - unsigned long address, pte_t *ptep, unsigned int flags) + unsigned long address, pte_t *ptep, + pte_t old_pte, unsigned int flags) { struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; @@@ -5485,8 -5401,7 +5485,8 @@@ /* * Currently, we are forced to kill the process in the event the * original mapper has unmapped pages from the child due to a failed - * COW. Warn that such a situation has occurred as it may not be obvious + * COW/unsharing. Warn that such a situation has occurred as it may not + * be obvious. */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", @@@ -5597,29 -5512,22 +5597,29 @@@ retry ptl = huge_pte_lock(h, mm, ptep); ret = 0; - if (!huge_pte_none(huge_ptep_get(ptep))) + /* If pte changed from under us, retry */ + if (!pte_same(huge_ptep_get(ptep), old_pte)) goto backout; if (anon_rmap) { ClearHPageRestoreReserve(page); hugepage_add_new_anon_rmap(page, vma, haddr); } else - page_dup_rmap(page, true); + page_dup_file_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); + /* + * If this pte was previously wr-protected, keep it wr-protected even + * if populated. + */ + if (unlikely(pte_marker_uffd_wp(old_pte))) + new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte)); set_huge_pte_at(mm, haddr, ptep, new_pte); hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl); } spin_unlock(ptl); @@@ -5731,10 -5639,8 +5731,10 @@@ vm_fault_t hugetlb_fault(struct mm_stru mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); - if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); + /* PTE markers should be handled the same way as none pte */ + if (huge_pte_none_mostly(entry)) { + ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, + entry, flags); goto out_mutex; } @@@ -5751,15 -5657,14 +5751,15 @@@ goto out_mutex; /* - * If we are going to COW the mapping later, we examine the pending - * reservations for this page now. This will ensure that any + * If we are going to COW/unshare the mapping later, we examine the + * pending reservations for this page now. This will ensure that any * allocations necessary to record that reservation occur outside the * spinlock. For private mappings, we also lookup the pagecache * page now as it is used to determine if a reservation has been * consumed. */ - if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { + if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && + !huge_pte_write(entry)) { if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto out_mutex; @@@ -5774,32 -5679,12 +5774,32 @@@ ptl = huge_pte_lock(h, mm, ptep); - /* Check for a racing update before calling hugetlb_cow */ + /* Check for a racing update before calling hugetlb_wp() */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) goto out_ptl; + /* Handle userfault-wp first, before trying to lock more pages */ + if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && + (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .real_address = address, + .flags = flags, + }; + + spin_unlock(ptl); + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); + return handle_userfault(&vmf, VM_UFFD_WP); + } + /* - * hugetlb_cow() requires page locks of pte_page(entry) and + * hugetlb_wp() requires page locks of pte_page(entry) and * pagecache_page, so here we need take the former one * when page != pagecache_page or !pagecache_page. */ @@@ -5812,14 -5697,13 +5812,14 @@@ get_page(page); - if (flags & FAULT_FLAG_WRITE) { + if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, address, ptep, - pagecache_page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, + pagecache_page, ptl); goto out_put_page; + } else if (likely(flags & FAULT_FLAG_WRITE)) { + entry = huge_pte_mkdirty(entry); } - entry = huge_pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, @@@ -5862,8 -5746,7 +5862,8 @@@ int hugetlb_mcopy_atomic_pte(struct mm_ unsigned long dst_addr, unsigned long src_addr, enum mcopy_atomic_mode mode, - struct page **pagep) + struct page **pagep, + bool wp_copy) { bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct hstate *h = hstate_vma(dst_vma); @@@ -5993,43 -5876,27 +5993,43 @@@ goto out_release_unlock; ret = -EEXIST; - if (!huge_pte_none(huge_ptep_get(dst_pte))) + /* + * We allow to overwrite a pte marker: consider when both MISSING|WP + * registered, we firstly wr-protect a none pte which has no page cache + * page backing it, then access the page. + */ + if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; if (vm_shared) { - page_dup_rmap(page, true); + page_dup_file_rmap(page, true); } else { ClearHPageRestoreReserve(page); hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); } - /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */ - if (is_continue && !vm_shared) + /* + * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY + * with wp flag set, don't set pte write bit. + */ + if (wp_copy || (is_continue && !vm_shared)) writable = 0; else writable = dst_vma->vm_flags & VM_WRITE; _dst_pte = make_huge_pte(dst_vma, page, writable); - if (writable) - _dst_pte = huge_pte_mkdirty(_dst_pte); + /* + * Always mark UFFDIO_COPY page dirty; note that this may not be + * extremely important for hugetlbfs for now since swapping is not + * supported, but we should still be clear in that this page cannot be + * thrown away at will, even if write bit not set. + */ + _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); + if (wp_copy) + _dst_pte = huge_pte_mkuffd_wp(_dst_pte); + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, @@@ -6073,25 -5940,6 +6073,25 @@@ static void record_subpages_vmas(struc } } +static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, + bool *unshare) +{ + pte_t pteval = huge_ptep_get(pte); + + *unshare = false; + if (is_swap_pte(pteval)) + return true; + if (huge_pte_write(pteval)) + return false; + if (flags & FOLL_WRITE) + return true; + if (gup_must_unshare(flags, pte_page(pteval))) { + *unshare = true; + return true; + } + return false; +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, @@@ -6106,7 -5954,6 +6106,7 @@@ while (vaddr < vma->vm_end && remainder) { pte_t *pte; spinlock_t *ptl = NULL; + bool unshare = false; int absent; struct page *page; @@@ -6157,8 -6004,9 +6157,8 @@@ * both cases, and because we can't follow correct pages * directly from any kind of swap entries. */ - if (absent || is_swap_pte(huge_ptep_get(pte)) || - ((flags & FOLL_WRITE) && - !huge_pte_write(huge_ptep_get(pte)))) { + if (absent || + __follow_hugetlb_must_fault(flags, pte, &unshare)) { vm_fault_t ret; unsigned int fault_flags = 0; @@@ -6166,8 -6014,6 +6166,8 @@@ spin_unlock(ptl); if (flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; + else if (unshare) + fault_flags |= FAULT_FLAG_UNSHARE; if (locked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; @@@ -6209,9 -6055,6 +6209,9 @@@ pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); + VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); + /* * If subpage information not requested, update counters * and skip the same_page loop below. @@@ -6274,19 -6117,16 +6274,19 @@@ } unsigned long hugetlb_change_protection(struct vm_area_struct *vma, - unsigned long address, unsigned long end, pgprot_t newprot) + unsigned long address, unsigned long end, + pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long start = address; pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); - unsigned long pages = 0; + unsigned long pages = 0, psize = huge_page_size(h); bool shared_pmd = false; struct mmu_notifier_range range; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; /* * In the case of shared PMDs, the area to flush could be beyond @@@ -6302,19 -6142,13 +6302,19 @@@ mmu_notifier_invalidate_range_start(&range); i_mmap_lock_write(vma->vm_file->f_mapping); - for (; address < end; address += huge_page_size(h)) { + for (; address < end; address += psize) { spinlock_t *ptl; - ptep = huge_pte_offset(mm, address, huge_page_size(h)); + ptep = huge_pte_offset(mm, address, psize); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, &address, ptep)) { + /* + * When uffd-wp is enabled on the vma, unshare + * shouldn't happen at all. Warn about it if it + * happened due to some reason. + */ + WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); pages++; spin_unlock(ptl); shared_pmd = true; @@@ -6327,37 -6161,20 +6327,37 @@@ } if (unlikely(is_hugetlb_entry_migration(pte))) { swp_entry_t entry = pte_to_swp_entry(pte); + struct page *page = pfn_swap_entry_to_page(entry); - if (is_writable_migration_entry(entry)) { + if (!is_readable_migration_entry(entry)) { pte_t newpte; - entry = make_readable_migration_entry( - swp_offset(entry)); + if (PageAnon(page)) + entry = make_readable_exclusive_migration_entry( + swp_offset(entry)); + else + entry = make_readable_migration_entry( + swp_offset(entry)); newpte = swp_entry_to_pte(entry); + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); set_huge_swap_pte_at(mm, address, ptep, - newpte, huge_page_size(h)); + newpte, psize); pages++; } spin_unlock(ptl); continue; } + if (unlikely(pte_marker_uffd_wp(pte))) { + /* + * This is changing a non-present pte into a none pte, + * no need for huge_ptep_modify_prot_start/commit(). + */ + if (uffd_wp_resolve) + huge_pte_clear(mm, address, ptep, psize); + } if (!huge_pte_none(pte)) { pte_t old_pte; unsigned int shift = huge_page_shift(hstate_vma(vma)); @@@ -6365,18 -6182,8 +6365,18 @@@ old_pte = huge_ptep_modify_prot_start(vma, address, ptep); pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); + if (uffd_wp) + pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte)); + else if (uffd_wp_resolve) + pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; + } else { + /* None pte */ + if (unlikely(uffd_wp)) + /* Safe to modify directly (none->non-present). */ + set_huge_pte_at(mm, address, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP)); } spin_unlock(ptl); } @@@ -6755,7 -6562,14 +6755,14 @@@ int huge_pmd_unshare(struct mm_struct * pud_clear(pud); put_page(virt_to_page(ptep)); mm_dec_nr_pmds(mm); - *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; + /* + * This update of passed address optimizes loops sequentially + * processing addresses in increments of huge page size (PMD_SIZE + * in this case). By clearing the pud, a PUD_SIZE area is unmapped. + * Update address to the 'last page' in the cleared area so that + * calling loop can move to first page past this area. + */ + *addr |= PUD_SIZE - PMD_SIZE; return 1; } @@@ -6879,11 -6693,9 +6886,11 @@@ follow_huge_pmd(struct mm_struct *mm, u spinlock_t *ptl; pte_t pte; - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == - (FOLL_PIN | FOLL_GET))) + /* + * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via + * follow_hugetlb_page(). + */ + if (WARN_ON_ONCE(flags & FOLL_PIN)) return NULL; retry: @@@ -6971,9 -6783,7 +6978,9 @@@ int get_hwpoison_huge_page(struct page spin_lock_irq(&hugetlb_lock); if (PageHeadHuge(page)) { *hugetlb = true; - if (HPageFreed(page) || HPageMigratable(page)) + if (HPageFreed(page)) + ret = 0; + else if (HPageMigratable(page)) ret = get_page_unless_zero(page); else ret = -EBUSY; @@@ -7063,7 -6873,6 +7070,7 @@@ void hugetlb_unshare_all_pmds(struct vm if (start >= end) return; + flush_cache_range(vma, start, end); /* * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. @@@ -7149,7 -6958,7 +7156,7 @@@ void __init hugetlb_cma_reserve(int ord if (hugetlb_cma_size_in_node[nid] == 0) continue; - if (!node_state(nid, N_ONLINE)) { + if (!node_online(nid)) { pr_warn("hugetlb_cma: invalid node %d specified\n", nid); hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; hugetlb_cma_size_in_node[nid] = 0; @@@ -7188,7 -6997,7 +7195,7 @@@ } reserved = 0; - for_each_node_state(nid, N_ONLINE) { + for_each_online_node(nid) { int res; char name[CMA_MAX_NAME]; diff --combined mm/page_alloc.c index bc93a82,5ced6cb..149f2ab --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@@ -81,7 -81,6 +81,7 @@@ #include "internal.h" #include "shuffle.h" #include "page_reporting.h" +#include "swap.h" /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ typedef int __bitwise fpi_t; @@@ -868,6 -867,40 +868,6 @@@ static inline void set_buddy_order(stru __SetPageBuddy(page); } -/* - * This function checks whether a page is free && is the buddy - * we can coalesce a page and its buddy if - * (a) the buddy is not in a hole (check before calling!) && - * (b) the buddy is in the buddy system && - * (c) a page and its buddy have the same order && - * (d) a page and its buddy are in the same zone. - * - * For recording whether a page is in the buddy system, we set PageBuddy. - * Setting, clearing, and testing PageBuddy is serialized by zone->lock. - * - * For recording page's order, we use page_private(page). - */ -static inline bool page_is_buddy(struct page *page, struct page *buddy, - unsigned int order) -{ - if (!page_is_guard(buddy) && !PageBuddy(buddy)) - return false; - - if (buddy_order(buddy) != order) - return false; - - /* - * zone check is done late to avoid uselessly calculating - * zone/node ids for pages that could never merge. - */ - if (page_zone_id(page) != page_zone_id(buddy)) - return false; - - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); - - return true; -} - #ifdef CONFIG_COMPACTION static inline struct capture_control *task_capc(struct zone *zone) { @@@ -976,17 -1009,18 +976,17 @@@ static inline boo buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, struct page *page, unsigned int order) { - struct page *higher_page, *higher_buddy; - unsigned long combined_pfn; + unsigned long higher_page_pfn; + struct page *higher_page; if (order >= MAX_ORDER - 2) return false; - combined_pfn = buddy_pfn & pfn; - higher_page = page + (combined_pfn - pfn); - buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); - higher_buddy = higher_page + (buddy_pfn - combined_pfn); + higher_page_pfn = buddy_pfn & pfn; + higher_page = page + (higher_page_pfn - pfn); - return page_is_buddy(higher_page, higher_buddy, order + 1); + return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1, + NULL) != NULL; } /* @@@ -1019,6 -1053,7 +1019,6 @@@ static inline void __free_one_page(stru int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); - unsigned int max_order = pageblock_order; unsigned long buddy_pfn; unsigned long combined_pfn; struct page *buddy; @@@ -1034,32 -1069,18 +1034,32 @@@ VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); -continue_merging: - while (order < max_order) { + while (order < MAX_ORDER - 1) { if (compaction_capture(capc, page, order, migratetype)) { __mod_zone_freepage_state(zone, -(1 << order), migratetype); return; } - buddy_pfn = __find_buddy_pfn(pfn, order); - buddy = page + (buddy_pfn - pfn); - if (!page_is_buddy(page, buddy, order)) + buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn); + if (!buddy) goto done_merging; + + if (unlikely(order >= pageblock_order)) { + /* + * We want to prevent merge between freepages on pageblock + * without fallbacks and normal pageblock. Without this, + * pageblock isolation could cause incorrect freepage or CMA + * accounting or HIGHATOMIC accounting. + */ + int buddy_mt = get_pageblock_migratetype(buddy); + + if (migratetype != buddy_mt + && (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt))) + goto done_merging; + } + /* * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. @@@ -1073,6 -1094,32 +1073,6 @@@ pfn = combined_pfn; order++; } - if (order < MAX_ORDER - 1) { - /* If we are here, it means order is >= pageblock_order. - * We want to prevent merge between freepages on pageblock - * without fallbacks and normal pageblock. Without this, - * pageblock isolation could cause incorrect freepage or CMA - * accounting or HIGHATOMIC accounting. - * - * We don't want to hit this code for the more frequent - * low-order merging. - */ - int buddy_mt; - - buddy_pfn = __find_buddy_pfn(pfn, order); - buddy = page + (buddy_pfn - pfn); - - if (!page_is_buddy(page, buddy, order)) - goto done_merging; - buddy_mt = get_pageblock_migratetype(buddy); - - if (migratetype != buddy_mt - && (!migratetype_is_mergeable(migratetype) || - !migratetype_is_mergeable(buddy_mt))) - goto done_merging; - max_order = order + 1; - goto continue_merging; - } done_merging: set_buddy_order(page, order); @@@ -1094,48 -1141,6 +1094,48 @@@ page_reporting_notify_free(order); } +/** + * split_free_page() -- split a free page at split_pfn_offset + * @free_page: the original free page + * @order: the order of the page + * @split_pfn_offset: split offset within the page + * + * It is used when the free page crosses two pageblocks with different migratetypes + * at split_pfn_offset within the page. The split free page will be put into + * separate migratetype lists afterwards. Otherwise, the function achieves + * nothing. + */ +void split_free_page(struct page *free_page, + int order, unsigned long split_pfn_offset) +{ + struct zone *zone = page_zone(free_page); + unsigned long free_page_pfn = page_to_pfn(free_page); + unsigned long pfn; + unsigned long flags; + int free_page_order; + + if (split_pfn_offset == 0) + return; + + spin_lock_irqsave(&zone->lock, flags); + del_page_from_free_list(free_page, zone, order); + for (pfn = free_page_pfn; + pfn < free_page_pfn + (1UL << order);) { + int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); + + free_page_order = min_t(int, + pfn ? __ffs(pfn) : order, + __fls(split_pfn_offset)); + __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, + mt, FPI_NONE); + pfn += 1UL << free_page_order; + split_pfn_offset -= (1UL << free_page_order); + /* we have done the first part, now switch to second part */ + if (split_pfn_offset == 0) + split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); + } + spin_unlock_irqrestore(&zone->lock, flags); +} /* * A bad page could be due to a number of fields. Instead of multiple branches, * try and check multiple fields with one check. The caller must do a detailed @@@ -2471,9 -2476,6 +2471,9 @@@ struct page *__rmqueue_smallest(struct del_page_from_free_list(page, zone, current_order); expand(zone, page, order, current_order, migratetype); set_pcppage_migratetype(page, migratetype); + trace_mm_page_alloc_zone_locked(page, order, migratetype, + pcp_allowed_order(order) && + migratetype < MIGRATE_PCPTYPES); return page; } @@@ -2997,7 -2999,7 +2997,7 @@@ __rmqueue(struct zone *zone, unsigned i zone_page_state(zone, NR_FREE_PAGES) / 2) { page = __rmqueue_cma_fallback(zone, order); if (page) - goto out; + return page; } } retry: @@@ -3010,6 -3012,9 +3010,6 @@@ alloc_flags)) goto retry; } -out: - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } @@@ -3728,8 -3733,11 +3728,8 @@@ struct page *rmqueue(struct zone *prefe * reserved for high-order atomic allocation, so order-0 * request should skip it. */ - if (order > 0 && alloc_flags & ALLOC_HARDER) { + if (order > 0 && alloc_flags & ALLOC_HARDER) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); - } if (!page) { page = __rmqueue(zone, order, migratetype, alloc_flags); if (!page) @@@ -3791,9 -3799,6 +3791,9 @@@ static bool __should_fail_alloc_page(gf (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; + if (gfp_mask & __GFP_NOWARN) + fail_page_alloc.attr.no_warn = true; + return should_fail(&fail_page_alloc.attr, 1 << order); } @@@ -4063,8 -4068,7 +4063,8 @@@ get_page_from_freelist(gfp_t gfp_mask, { struct zoneref *z; struct zone *zone; - struct pglist_data *last_pgdat_dirty_limit = NULL; + struct pglist_data *last_pgdat = NULL; + bool last_pgdat_dirty_ok = false; bool no_fallback; retry: @@@ -4103,13 -4107,13 +4103,13 @@@ * dirty-throttling and the flusher threads. */ if (ac->spread_dirty_pages) { - if (last_pgdat_dirty_limit == zone->zone_pgdat) - continue; + if (last_pgdat != zone->zone_pgdat) { + last_pgdat = zone->zone_pgdat; + last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat); + } - if (!node_dirty_ok(zone->zone_pgdat)) { - last_pgdat_dirty_limit = zone->zone_pgdat; + if (!last_pgdat_dirty_ok) continue; - } } if (no_fallback && nr_online_nodes > 1 && @@@ -4342,8 -4346,7 +4342,8 @@@ __alloc_pages_may_oom(gfp_t gfp_mask, u */ /* Exhausted what can be done so it's blame time */ - if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { + if (out_of_memory(&oc) || + WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) { *did_some_progress = 1; /* @@@ -4674,12 -4677,9 +4674,12 @@@ static void wake_all_kswapds(unsigned i for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, ac->nodemask) { - if (last_pgdat != zone->zone_pgdat) + if (!managed_zone(zone)) + continue; + if (last_pgdat != zone->zone_pgdat) { wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); - last_pgdat = zone->zone_pgdat; + last_pgdat = zone->zone_pgdat; + } } } @@@ -5117,7 -5117,7 +5117,7 @@@ nopage * All existing users of the __GFP_NOFAIL are blockable, so warn * of any new users that actually require GFP_NOWAIT */ - if (WARN_ON_ONCE(!can_direct_reclaim)) + if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask)) goto fail; /* @@@ -5125,7 -5125,7 +5125,7 @@@ * because we cannot reclaim anything and only can loop waiting * for somebody to do a work for us */ - WARN_ON_ONCE(current->flags & PF_MEMALLOC); + WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask); /* * non failing costly orders are a hard requirement which we @@@ -5133,7 -5133,7 +5133,7 @@@ * so that we can identify them and convert them to something * else. */ - WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); + WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask); /* * Help non-failing allocations by giving them access to memory @@@ -5324,8 -5324,8 +5324,8 @@@ unsigned long __alloc_pages_bulk(gfp_t page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, pcp, pcp_list); if (unlikely(!page)) { - /* Try and get at least one page */ - if (!nr_populated) + /* Try and allocate at least one page */ + if (!nr_account) goto failed_irq; break; } @@@ -5379,8 -5379,10 +5379,8 @@@ struct page *__alloc_pages(gfp_t gfp, u * There are several places where we assume that the order value is sane * so bail out early if the request is out of bound. */ - if (unlikely(order >= MAX_ORDER)) { - WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); + if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp)) return NULL; - } gfp &= gfp_allowed_mask; /* @@@ -6169,6 -6171,7 +6169,6 @@@ int numa_zonelist_order_handler(struct } -#define MAX_NODE_LOAD (nr_online_nodes) static int node_load[MAX_NUMNODES]; /** @@@ -6215,7 -6218,7 +6215,7 @@@ int find_next_best_node(int node, nodem val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ - val *= (MAX_NODE_LOAD*MAX_NUMNODES); + val *= MAX_NUMNODES; val += node_load[n]; if (val < min_val) { @@@ -6281,12 -6284,13 +6281,12 @@@ static void build_thisnode_zonelists(pg static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; - int node, load, nr_nodes = 0; + int node, nr_nodes = 0; nodemask_t used_mask = NODE_MASK_NONE; int local_node, prev_node; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = nr_online_nodes; prev_node = local_node; memset(node_order, 0, sizeof(node_order)); @@@ -6298,10 -6302,11 +6298,10 @@@ */ if (node_distance(local_node, node) != node_distance(local_node, prev_node)) - node_load[node] += load; + node_load[node] += 1; node_order[nr_nodes++] = node; prev_node = node; - load--; } build_zonelists_in_node_order(pgdat, node_order, nr_nodes); @@@ -6640,21 -6645,6 +6640,21 @@@ static void __ref __init_zone_device_pa } } +/* + * With compound page geometry and when struct pages are stored in ram most + * tail pages are reused. Consequently, the amount of unique struct pages to + * initialize is a lot smaller that the total amount of struct pages being + * mapped. This is a paired / mild layering violation with explicit knowledge + * of how the sparse_vmemmap internals handle compound pages in the lack + * of an altmap. See vmemmap_populate_compound_pages(). + */ +static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, + unsigned long nr_pages) +{ + return is_power_of_2(sizeof(struct page)) && + !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages; +} + static void __ref memmap_init_compound(struct page *head, unsigned long head_pfn, unsigned long zone_idx, int nid, @@@ -6719,7 -6709,7 +6719,7 @@@ void __ref memmap_init_zone_device(stru continue; memmap_init_compound(page, pfn, zone_idx, nid, pgmap, - pfns_per_compound); + compound_nr_pages(altmap, pfns_per_compound)); } pr_info("%s initialised %lu pages in %ums\n", __func__, @@@ -7880,7 -7870,7 +7880,7 @@@ static void __init find_zone_movable_pf usable_startpfn = memblock_region_memory_base_pfn(r); - if (usable_startpfn < 0x100000) { + if (usable_startpfn < PHYS_PFN(SZ_4G)) { mem_below_4gb_not_mirrored = true; continue; } @@@ -8959,7 -8949,136 +8959,7 @@@ void *__init alloc_large_system_hash(co return table; } -/* - * This function checks whether pageblock includes unmovable pages or not. - * - * PageLRU check without isolation or lru_lock could race so that - * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable - * check without lock_page also may miss some movable non-lru pages at - * race condition. So you can't expect this function should be exact. - * - * Returns a page without holding a reference. If the caller wants to - * dereference that page (e.g., dumping), it has to make sure that it - * cannot get removed (e.g., via memory unplug) concurrently. - * - */ -struct page *has_unmovable_pages(struct zone *zone, struct page *page, - int migratetype, int flags) -{ - unsigned long iter = 0; - unsigned long pfn = page_to_pfn(page); - unsigned long offset = pfn % pageblock_nr_pages; - - if (is_migrate_cma_page(page)) { - /* - * CMA allocations (alloc_contig_range) really need to mark - * isolate CMA pageblocks even when they are not movable in fact - * so consider them movable here. - */ - if (is_migrate_cma(migratetype)) - return NULL; - - return page; - } - - for (; iter < pageblock_nr_pages - offset; iter++) { - page = pfn_to_page(pfn + iter); - - /* - * Both, bootmem allocations and memory holes are marked - * PG_reserved and are unmovable. We can even have unmovable - * allocations inside ZONE_MOVABLE, for example when - * specifying "movablecore". - */ - if (PageReserved(page)) - return page; - - /* - * If the zone is movable and we have ruled out all reserved - * pages then it should be reasonably safe to assume the rest - * is movable. - */ - if (zone_idx(zone) == ZONE_MOVABLE) - continue; - - /* - * Hugepages are not in LRU lists, but they're movable. - * THPs are on the LRU, but need to be counted as #small pages. - * We need not scan over tail pages because we don't - * handle each tail page individually in migration. - */ - if (PageHuge(page) || PageTransCompound(page)) { - struct page *head = compound_head(page); - unsigned int skip_pages; - - if (PageHuge(page)) { - if (!hugepage_migration_supported(page_hstate(head))) - return page; - } else if (!PageLRU(head) && !__PageMovable(head)) { - return page; - } - - skip_pages = compound_nr(head) - (page - head); - iter += skip_pages - 1; - continue; - } - - /* - * We can't use page_count without pin a page - * because another CPU can free compound page. - * This check already skips compound tails of THP - * because their page->_refcount is zero at all time. - */ - if (!page_ref_count(page)) { - if (PageBuddy(page)) - iter += (1 << buddy_order(page)) - 1; - continue; - } - - /* - * The HWPoisoned page may be not in buddy system, and - * page_count() is not 0. - */ - if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) - continue; - - /* - * We treat all PageOffline() pages as movable when offlining - * to give drivers a chance to decrement their reference count - * in MEM_GOING_OFFLINE in order to indicate that these pages - * can be offlined as there are no direct references anymore. - * For actually unmovable PageOffline() where the driver does - * not support this, we will fail later when trying to actually - * move these pages that still have a reference count > 0. - * (false negatives in this function only) - */ - if ((flags & MEMORY_OFFLINE) && PageOffline(page)) - continue; - - if (__PageMovable(page) || PageLRU(page)) - continue; - - /* - * If there are RECLAIMABLE pages, we need to check - * it. But now, memory offline itself doesn't call - * shrink_node_slabs() and it still to be fixed. - */ - return page; - } - return NULL; -} - #ifdef CONFIG_CONTIG_ALLOC -static unsigned long pfn_max_align_down(unsigned long pfn) -{ - return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES); -} - -static unsigned long pfn_max_align_up(unsigned long pfn) -{ - return ALIGN(pfn, MAX_ORDER_NR_PAGES); -} - #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* Usage: See admin-guide/dynamic-debug-howto.rst */ @@@ -8982,7 -9101,7 +8982,7 @@@ static inline void alloc_contig_dump_pa #endif /* [start, end) must belong to a single zone. */ -static int __alloc_contig_migrate_range(struct compact_control *cc, +int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ @@@ -9032,7 -9151,7 +9032,7 @@@ lru_cache_enable(); if (ret < 0) { - if (ret == -EBUSY) + if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); return ret; @@@ -9050,8 -9169,8 +9050,8 @@@ * be either of the two. * @gfp_mask: GFP mask to use during compaction * - * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES - * aligned. The PFN range must belong to a single zone. + * The PFN range does not have to be pageblock aligned. The PFN range must + * belong to a single zone. * * The first thing this routine does is attempt to MIGRATE_ISOLATE all * pageblocks in the range. Once isolated, the pageblocks should not @@@ -9065,7 -9184,7 +9065,7 @@@ int alloc_contig_range(unsigned long st unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; - unsigned int order; + int order; int ret = 0; struct compact_control cc = { @@@ -9084,11 -9203,14 +9084,11 @@@ * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. Because pageblock and max order pages may * have different sizes, and due to the way page allocator - * work, we align the range to biggest of the two pages so - * that page allocator won't try to merge buddies from - * different pageblocks and change MIGRATE_ISOLATE to some - * other migration type. + * work, start_isolate_page_range() has special handlings for this. * * Once the pageblocks are marked as MIGRATE_ISOLATE, we * migrate the pages from an unaligned range (ie. pages that - * we are interested in). This will put all the pages in + * we are interested in). This will put all the pages in * range back to page allocator as MIGRATE_ISOLATE. * * When this is done, we take the pages in range from page @@@ -9101,9 -9223,10 +9101,9 @@@ * put back to page allocator so that buddy can use them. */ - ret = start_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end), migratetype, 0); + ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask); if (ret) - return ret; + goto done; drain_all_pages(cc.zone); @@@ -9123,7 -9246,7 +9123,7 @@@ ret = 0; /* - * Pages from [start, end) are within a MAX_ORDER_NR_PAGES + * Pages from [start, end) are within a pageblock_nr_pages * aligned blocks that are marked as MIGRATE_ISOLATE. What's * more, all pages in [start, end) are free in page allocator. * What we are going to do is to allocate all pages from @@@ -9182,7 -9305,8 +9182,7 @@@ free_contig_range(end, outer_end - end); done: - undo_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end), migratetype); + undo_isolate_page_range(start, end, migratetype); return ret; } EXPORT_SYMBOL(alloc_contig_range); @@@ -9501,6 -9625,7 +9501,6 @@@ bool put_page_back_buddy(struct page *p ClearPageHWPoisonTakenOff(page); __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); if (TestClearPageHWPoison(page)) { - num_poisoned_pages_dec(); ret = true; } } diff --combined mm/page_table_check.c index 3692bea,bc55be2..e206274 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@@ -52,6 -52,23 +52,6 @@@ static struct page_table_check *get_pag return (void *)(page_ext) + page_table_check_ops.offset; } -static inline bool pte_user_accessible_page(pte_t pte) -{ - return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); -} - -static inline bool pmd_user_accessible_page(pmd_t pmd) -{ - return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && - (pmd_val(pmd) & _PAGE_USER); -} - -static inline bool pud_user_accessible_page(pud_t pud) -{ - return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && - (pud_val(pud) & _PAGE_USER); -} - /* * An enty is removed from the page table, decrement the counters for that page * verify that it is of correct type and counters do not become negative. @@@ -160,7 -177,7 +160,7 @@@ void __page_table_check_pmd_clear(struc if (pmd_user_accessible_page(pmd)) { page_table_check_clear(mm, addr, pmd_pfn(pmd), - PMD_PAGE_SIZE >> PAGE_SHIFT); + PMD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pmd_clear); @@@ -173,7 -190,7 +173,7 @@@ void __page_table_check_pud_clear(struc if (pud_user_accessible_page(pud)) { page_table_check_clear(mm, addr, pud_pfn(pud), - PUD_PAGE_SIZE >> PAGE_SHIFT); + PUD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pud_clear); @@@ -202,7 -219,7 +202,7 @@@ void __page_table_check_pmd_set(struct __page_table_check_pmd_clear(mm, addr, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(mm, addr, pmd_pfn(pmd), - PMD_PAGE_SIZE >> PAGE_SHIFT, + PMD_SIZE >> PAGE_SHIFT, pmd_write(pmd)); } } @@@ -217,7 -234,7 +217,7 @@@ void __page_table_check_pud_set(struct __page_table_check_pud_clear(mm, addr, *pudp); if (pud_user_accessible_page(pud)) { page_table_check_set(mm, addr, pud_pfn(pud), - PUD_PAGE_SIZE >> PAGE_SHIFT, + PUD_SIZE >> PAGE_SHIFT, pud_write(pud)); } } @@@ -234,11 -251,11 +234,11 @@@ void __page_table_check_pte_clear_range pte_t *ptep = pte_offset_map(&pmd, addr); unsigned long i; - pte_unmap(ptep); for (i = 0; i < PTRS_PER_PTE; i++) { __page_table_check_pte_clear(mm, addr, *ptep); addr += PAGE_SIZE; ptep++; } + pte_unmap(ptep - PTRS_PER_PTE); } }