diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 70b7ac6..4734315 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
+#include "mm_slot.h"
 
 enum scan_result {
        SCAN_FAIL,
        SCAN_SUCCEED,
        SCAN_PMD_NULL,
+       SCAN_PMD_NONE,
+       SCAN_PMD_MAPPED,
        SCAN_EXCEED_NONE_PTE,
        SCAN_EXCEED_SWAP_PTE,
        SCAN_EXCEED_SHARED_PTE,
        SCAN_PTE_NON_PRESENT,
        SCAN_PTE_UFFD_WP,
+       SCAN_PTE_MAPPED_HUGEPAGE,
        SCAN_PAGE_RO,
        SCAN_LACK_REFERENCED_PAGE,
        SCAN_PAGE_NULL,
@@ -73,6 +77,8 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
  * default collapse hugepages if there is at least one pte mapped like
  * it would have happened if the vma was large enough during page
  * fault.
+ *
+ * Note that these are only respected if collapse was initiated by khugepaged.
  */
 static unsigned int khugepaged_max_ptes_none __read_mostly;
 static unsigned int khugepaged_max_ptes_swap __read_mostly;
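
These limits are the ones exported under /sys/kernel/mm/transparent_hugepage/khugepaged/ (max_ptes_none, max_ptes_swap, max_ptes_shared); per the note added above, after this change they only constrain khugepaged-initiated collapses, not direct callers. A small userspace sketch for reading them, not part of the patch, with minimal error handling and the standard sysfs locations assumed:

/* Userspace sketch, not part of this patch: print the per-PMD khugepaged
 * limits that the comment above refers to. */
#include <stdio.h>

static long read_tunable(const char *name)
{
	char path[256];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/mm/transparent_hugepage/khugepaged/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	printf("max_ptes_none:   %ld\n", read_tunable("max_ptes_none"));
	printf("max_ptes_swap:   %ld\n", read_tunable("max_ptes_swap"));
	printf("max_ptes_shared: %ld\n", read_tunable("max_ptes_shared"));
	return 0;
}
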
@@ -85,18 +91,24 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
 
 #define MAX_PTE_MAPPED_THP 8
 
+struct collapse_control {
+       bool is_khugepaged;
+
+       /* Num pages scanned per node */
+       u32 node_load[MAX_NUMNODES];
+
+       /* Last target selected in hpage_collapse_find_target_node() */
+       int last_target_node;
+};
+
 /**
- * struct mm_slot - hash lookup from mm to mm_slot
- * @hash: hash collision list
- * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
- * @mm: the mm that this information is valid for
+ * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
+ * @slot: hash lookup from mm to mm_slot
  * @nr_pte_mapped_thp: number of pte mapped THP
  * @pte_mapped_thp: address array corresponding to pte mapped THP
  */
-struct mm_slot {
-       struct hlist_node hash;
-       struct list_head mm_node;
-       struct mm_struct *mm;
+struct khugepaged_mm_slot {
+       struct mm_slot slot;
 
        /* pte-mapped THP in this mm */
        int nr_pte_mapped_thp;
@@ -113,7 +125,7 @@ struct mm_slot {
  */
 struct khugepaged_scan {
        struct list_head mm_head;
-       struct mm_slot *mm_slot;
+       struct khugepaged_mm_slot *mm_slot;
        unsigned long address;
 };
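
The collapse_control introduced above is a per-caller scratchpad: khugepaged uses the global khugepaged_collapse_control defined later in this patch (is_khugepaged = true), while a direct caller such as the MADV_COLLAPSE path added elsewhere in this series is expected to set up a private context roughly as in the sketch below (illustrative, not taken from this patch). node_load[] needs no setup because each scan starts by memset()ing it.

/* Hedged sketch of a non-khugepaged caller's context: is_khugepaged == false
 * makes the scan paths skip the max_ptes_* limits and the young-pte
 * heuristic. */
struct collapse_control madvise_cc = {
	.is_khugepaged = false,
	.last_target_node = NUMA_NO_NODE,
};
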
 
@@ -377,8 +389,9 @@ int hugepage_madvise(struct vm_area_struct *vma,
 int __init khugepaged_init(void)
 {
        mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
-                                         sizeof(struct mm_slot),
-                                         __alignof__(struct mm_slot), 0, NULL);
+                                         sizeof(struct khugepaged_mm_slot),
+                                         __alignof__(struct khugepaged_mm_slot),
+                                         0, NULL);
        if (!mm_slot_cache)
                return -ENOMEM;
 
@@ -395,65 +408,38 @@ void __init khugepaged_destroy(void)
        kmem_cache_destroy(mm_slot_cache);
 }
 
-static inline struct mm_slot *alloc_mm_slot(void)
-{
-       if (!mm_slot_cache)     /* initialization failed */
-               return NULL;
-       return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
-}
-
-static inline void free_mm_slot(struct mm_slot *mm_slot)
-{
-       kmem_cache_free(mm_slot_cache, mm_slot);
-}
-
-static struct mm_slot *get_mm_slot(struct mm_struct *mm)
-{
-       struct mm_slot *mm_slot;
-
-       hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
-               if (mm == mm_slot->mm)
-                       return mm_slot;
-
-       return NULL;
-}
-
-static void insert_to_mm_slots_hash(struct mm_struct *mm,
-                                   struct mm_slot *mm_slot)
-{
-       mm_slot->mm = mm;
-       hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
-}
-
-static inline int khugepaged_test_exit(struct mm_struct *mm)
+static inline int hpage_collapse_test_exit(struct mm_struct *mm)
 {
        return atomic_read(&mm->mm_users) == 0;
 }
 
 void __khugepaged_enter(struct mm_struct *mm)
 {
-       struct mm_slot *mm_slot;
+       struct khugepaged_mm_slot *mm_slot;
+       struct mm_slot *slot;
        int wakeup;
 
-       mm_slot = alloc_mm_slot();
+       mm_slot = mm_slot_alloc(mm_slot_cache);
        if (!mm_slot)
                return;
 
+       slot = &mm_slot->slot;
+
        /* __khugepaged_exit() must not run from under us */
-       VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
+       VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
        if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
-               free_mm_slot(mm_slot);
+               mm_slot_free(mm_slot_cache, mm_slot);
                return;
        }
 
        spin_lock(&khugepaged_mm_lock);
-       insert_to_mm_slots_hash(mm, mm_slot);
+       mm_slot_insert(mm_slots_hash, mm, slot);
        /*
         * Insert just behind the scanning cursor, to let the area settle
         * down a little.
         */
        wakeup = list_empty(&khugepaged_scan.mm_head);
-       list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+       list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
        spin_unlock(&khugepaged_mm_lock);
 
        mmgrab(mm);
@@ -466,37 +452,38 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 {
        if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
            hugepage_flags_enabled()) {
-               if (hugepage_vma_check(vma, vm_flags, false, false))
+               if (hugepage_vma_check(vma, vm_flags, false, false, true))
                        __khugepaged_enter(vma->vm_mm);
        }
 }
 
 void __khugepaged_exit(struct mm_struct *mm)
 {
-       struct mm_slot *mm_slot;
+       struct khugepaged_mm_slot *mm_slot;
+       struct mm_slot *slot;
        int free = 0;
 
        spin_lock(&khugepaged_mm_lock);
-       mm_slot = get_mm_slot(mm);
+       slot = mm_slot_lookup(mm_slots_hash, mm);
+       mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
        if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
-               hash_del(&mm_slot->hash);
-               list_del(&mm_slot->mm_node);
+               hash_del(&slot->hash);
+               list_del(&slot->mm_node);
                free = 1;
        }
        spin_unlock(&khugepaged_mm_lock);
 
        if (free) {
                clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
-               free_mm_slot(mm_slot);
+               mm_slot_free(mm_slot_cache, mm_slot);
                mmdrop(mm);
        } else if (mm_slot) {
                /*
                 * This is required to serialize against
-                * khugepaged_test_exit() (which is guaranteed to run
-                * under mmap sem read mode). Stop here (after we
-                * return all pagetables will be destroyed) until
-                * khugepaged has finished working on the pagetables
-                * under the mmap_lock.
+                * hpage_collapse_test_exit() (which is guaranteed to run
+                * under mmap sem read mode). Stop here (after we return all
+                * pagetables will be destroyed) until khugepaged has finished
+                * working on the pagetables under the mmap_lock.
                 */
                mmap_write_lock(mm);
                mmap_write_unlock(mm);
@@ -546,11 +533,12 @@ static bool is_refcount_suitable(struct page *page)
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pte_t *pte,
+                                       struct collapse_control *cc,
                                        struct list_head *compound_pagelist)
 {
        struct page *page = NULL;
        pte_t *_pte;
-       int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
+       int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
        bool writable = false;
 
        for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
@@ -558,8 +546,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                pte_t pteval = *_pte;
                if (pte_none(pteval) || (pte_present(pteval) &&
                                is_zero_pfn(pte_pfn(pteval)))) {
+                       ++none_or_zero;
                        if (!userfaultfd_armed(vma) &&
-                           ++none_or_zero <= khugepaged_max_ptes_none) {
+                           (!cc->is_khugepaged ||
+                            none_or_zero <= khugepaged_max_ptes_none)) {
                                continue;
                        } else {
                                result = SCAN_EXCEED_NONE_PTE;
@@ -579,11 +569,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
                VM_BUG_ON_PAGE(!PageAnon(page), page);
 
-               if (page_mapcount(page) > 1 &&
-                               ++shared > khugepaged_max_ptes_shared) {
-                       result = SCAN_EXCEED_SHARED_PTE;
-                       count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
-                       goto out;
+               if (page_mapcount(page) > 1) {
+                       ++shared;
+                       if (cc->is_khugepaged &&
+                           shared > khugepaged_max_ptes_shared) {
+                               result = SCAN_EXCEED_SHARED_PTE;
+                               count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+                               goto out;
+                       }
                }
 
                if (PageCompound(page)) {
@@ -646,10 +639,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                if (PageCompound(page))
                        list_add_tail(&page->lru, compound_pagelist);
 next:
-               /* There should be enough young pte to collapse the page */
-               if (pte_young(pteval) ||
-                   page_is_young(page) || PageReferenced(page) ||
-                   mmu_notifier_test_young(vma->vm_mm, address))
+               /*
+                * If collapse was initiated by khugepaged, check that there is
+                * enough young pte to justify collapsing the page
+                */
+               if (cc->is_khugepaged &&
+                   (pte_young(pteval) || page_is_young(page) ||
+                    PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+                                                                    address)))
                        referenced++;
 
                if (pte_write(pteval))
@@ -658,19 +655,19 @@ next:
 
        if (unlikely(!writable)) {
                result = SCAN_PAGE_RO;
-       } else if (unlikely(!referenced)) {
+       } else if (unlikely(cc->is_khugepaged && !referenced)) {
                result = SCAN_LACK_REFERENCED_PAGE;
        } else {
                result = SCAN_SUCCEED;
                trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                                    referenced, writable, result);
-               return 1;
+               return result;
        }
 out:
        release_pte_pages(pte, _pte, compound_pagelist);
        trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                            referenced, writable, result);
-       return 0;
+       return result;
 }
 
 static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -730,14 +727,17 @@ static void khugepaged_alloc_sleep(void)
        DEFINE_WAIT(wait);
 
        add_wait_queue(&khugepaged_wait, &wait);
-       freezable_schedule_timeout_interruptible(
-               msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+       __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+       schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
        remove_wait_queue(&khugepaged_wait, &wait);
 }
 
-static int khugepaged_node_load[MAX_NUMNODES];
+struct collapse_control khugepaged_collapse_control = {
+       .is_khugepaged = true,
+       .last_target_node = NUMA_NO_NODE,
+};
 
-static bool khugepaged_scan_abort(int nid)
+static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
 {
        int i;
 
@@ -749,11 +749,11 @@ static bool khugepaged_scan_abort(int nid)
                return false;
 
        /* If there is a count for this node already, it must be acceptable */
-       if (khugepaged_node_load[nid])
+       if (cc->node_load[nid])
                return false;
 
        for (i = 0; i < MAX_NUMNODES; i++) {
-               if (!khugepaged_node_load[i])
+               if (!cc->node_load[i])
                        continue;
                if (node_distance(nid, i) > node_reclaim_distance)
                        return true;
@@ -772,146 +772,63 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
 }
 
 #ifdef CONFIG_NUMA
-static int khugepaged_find_target_node(void)
+static int hpage_collapse_find_target_node(struct collapse_control *cc)
 {
-       static int last_khugepaged_target_node = NUMA_NO_NODE;
        int nid, target_node = 0, max_value = 0;
 
        /* find first node with max normal pages hit */
        for (nid = 0; nid < MAX_NUMNODES; nid++)
-               if (khugepaged_node_load[nid] > max_value) {
-                       max_value = khugepaged_node_load[nid];
+               if (cc->node_load[nid] > max_value) {
+                       max_value = cc->node_load[nid];
                        target_node = nid;
                }
 
        /* do some balance if several nodes have the same hit record */
-       if (target_node <= last_khugepaged_target_node)
-               for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
-                               nid++)
-                       if (max_value == khugepaged_node_load[nid]) {
+       if (target_node <= cc->last_target_node)
+               for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES;
+                    nid++)
+                       if (max_value == cc->node_load[nid]) {
                                target_node = nid;
                                break;
                        }
 
-       last_khugepaged_target_node = target_node;
+       cc->last_target_node = target_node;
        return target_node;
 }
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+#else
+static int hpage_collapse_find_target_node(struct collapse_control *cc)
 {
-       if (IS_ERR(*hpage)) {
-               if (!*wait)
-                       return false;
-
-               *wait = false;
-               *hpage = NULL;
-               khugepaged_alloc_sleep();
-       } else if (*hpage) {
-               put_page(*hpage);
-               *hpage = NULL;
-       }
-
-       return true;
+       return 0;
 }
+#endif
 
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node)
 {
-       VM_BUG_ON_PAGE(*hpage, *hpage);
-
        *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
        if (unlikely(!*hpage)) {
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-               *hpage = ERR_PTR(-ENOMEM);
-               return NULL;
+               return false;
        }
 
        prep_transhuge_page(*hpage);
        count_vm_event(THP_COLLAPSE_ALLOC);
-       return *hpage;
-}
-#else
-static int khugepaged_find_target_node(void)
-{
-       return 0;
-}
-
-static inline struct page *alloc_khugepaged_hugepage(void)
-{
-       struct page *page;
-
-       page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
-                          HPAGE_PMD_ORDER);
-       if (page)
-               prep_transhuge_page(page);
-       return page;
-}
-
-static struct page *khugepaged_alloc_hugepage(bool *wait)
-{
-       struct page *hpage;
-
-       do {
-               hpage = alloc_khugepaged_hugepage();
-               if (!hpage) {
-                       count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-                       if (!*wait)
-                               return NULL;
-
-                       *wait = false;
-                       khugepaged_alloc_sleep();
-               } else
-                       count_vm_event(THP_COLLAPSE_ALLOC);
-       } while (unlikely(!hpage) && likely(hugepage_flags_enabled()));
-
-       return hpage;
-}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
-{
-       /*
-        * If the hpage allocated earlier was briefly exposed in page cache
-        * before collapse_file() failed, it is possible that racing lookups
-        * have not yet completed, and would then be unpleasantly surprised by
-        * finding the hpage reused for the same mapping at a different offset.
-        * Just release the previous allocation if there is any danger of that.
-        */
-       if (*hpage && page_count(*hpage) > 1) {
-               put_page(*hpage);
-               *hpage = NULL;
-       }
-
-       if (!*hpage)
-               *hpage = khugepaged_alloc_hugepage(wait);
-
-       if (unlikely(!*hpage))
-               return false;
-
        return true;
 }
 
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
-{
-       VM_BUG_ON(!*hpage);
-
-       return  *hpage;
-}
-#endif
-
 /*
  * If mmap_lock temporarily dropped, revalidate vma
  * before taking mmap_lock.
- * Return 0 if succeeds, otherwise return none-zero
- * value (scan code).
+ * Returns enum scan_result value.
  */
 
 static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
-               struct vm_area_struct **vmap)
+                                  bool expect_anon,
+                                  struct vm_area_struct **vmap,
+                                  struct collapse_control *cc)
 {
        struct vm_area_struct *vma;
 
-       if (unlikely(khugepaged_test_exit(mm)))
+       if (unlikely(hpage_collapse_test_exit(mm)))
                return SCAN_ANY_PROCESS;
 
        *vmap = vma = find_vma(mm, address);
@@ -920,7 +837,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 
        if (!transhuge_vma_suitable(vma, address))
                return SCAN_ADDRESS_RANGE;
-       if (!hugepage_vma_check(vma, vma->vm_flags, false, false))
+       if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
+                               cc->is_khugepaged))
                return SCAN_VMA_CHECK;
        /*
         * Anon VMA expected, the address may be unmapped then
@@ -929,23 +847,62 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
         * hugepage_vma_check may return true for qualified file
         * vmas.
         */
-       if (!vma->anon_vma || !vma_is_anonymous(vma))
-               return SCAN_VMA_CHECK;
-       return 0;
+       if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
+               return SCAN_PAGE_ANON;
+       return SCAN_SUCCEED;
+}
+
+static int find_pmd_or_thp_or_none(struct mm_struct *mm,
+                                  unsigned long address,
+                                  pmd_t **pmd)
+{
+       pmd_t pmde;
+
+       *pmd = mm_find_pmd(mm, address);
+       if (!*pmd)
+               return SCAN_PMD_NULL;
+
+       pmde = pmd_read_atomic(*pmd);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /* See comments in pmd_none_or_trans_huge_or_clear_bad() */
+       barrier();
+#endif
+       if (pmd_none(pmde))
+               return SCAN_PMD_NONE;
+       if (pmd_trans_huge(pmde))
+               return SCAN_PMD_MAPPED;
+       if (pmd_bad(pmde))
+               return SCAN_PMD_NULL;
+       return SCAN_SUCCEED;
+}
+
+static int check_pmd_still_valid(struct mm_struct *mm,
+                                unsigned long address,
+                                pmd_t *pmd)
+{
+       pmd_t *new_pmd;
+       int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
+
+       if (result != SCAN_SUCCEED)
+               return result;
+       if (new_pmd != pmd)
+               return SCAN_FAIL;
+       return SCAN_SUCCEED;
 }
 
 /*
  * Bring missing pages in from swap, to complete THP collapse.
- * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
  *
  * Called and returns without pte mapped or spinlocks held.
  * Note that if false is returned, mmap_lock will be released.
  */
 
-static bool __collapse_huge_page_swapin(struct mm_struct *mm,
-                                       struct vm_area_struct *vma,
-                                       unsigned long haddr, pmd_t *pmd,
-                                       int referenced)
+static int __collapse_huge_page_swapin(struct mm_struct *mm,
+                                      struct vm_area_struct *vma,
+                                      unsigned long haddr, pmd_t *pmd,
+                                      int referenced)
 {
        int swapped_in = 0;
        vm_fault_t ret = 0;
@@ -976,12 +933,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
                 */
                if (ret & VM_FAULT_RETRY) {
                        trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
-                       return false;
+                       /* Likely, but not guaranteed, that page lock failed */
+                       return SCAN_PAGE_LOCK;
                }
                if (ret & VM_FAULT_ERROR) {
                        mmap_read_unlock(mm);
                        trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
-                       return false;
+                       return SCAN_FAIL;
                }
                swapped_in++;
        }
@@ -991,30 +949,41 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
                lru_add_drain();
 
        trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
-       return true;
+       return SCAN_SUCCEED;
 }
 
-static void collapse_huge_page(struct mm_struct *mm,
-                                  unsigned long address,
-                                  struct page **hpage,
-                                  int node, int referenced, int unmapped)
+static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
+                             struct collapse_control *cc)
+{
+       /* Only allocate from the target node */
+       gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
+                    GFP_TRANSHUGE) | __GFP_THISNODE;
+       int node = hpage_collapse_find_target_node(cc);
+
+       if (!hpage_collapse_alloc_page(hpage, gfp, node))
+               return SCAN_ALLOC_HUGE_PAGE_FAIL;
+       if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp)))
+               return SCAN_CGROUP_CHARGE_FAIL;
+       count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
+       return SCAN_SUCCEED;
+}
+
+static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
+                             int referenced, int unmapped,
+                             struct collapse_control *cc)
 {
        LIST_HEAD(compound_pagelist);
        pmd_t *pmd, _pmd;
        pte_t *pte;
        pgtable_t pgtable;
-       struct page *new_page;
+       struct page *hpage;
        spinlock_t *pmd_ptl, *pte_ptl;
-       int isolated = 0, result = 0;
+       int result = SCAN_FAIL;
        struct vm_area_struct *vma;
        struct mmu_notifier_range range;
-       gfp_t gfp;
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
-       /* Only allocate from the target node */
-       gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
-
        /*
         * Before allocating the hugepage, release the mmap_lock read lock.
         * The allocation can take potentially a long time if it involves
@@ -1022,40 +991,34 @@ static void collapse_huge_page(struct mm_struct *mm,
         * that. We will recheck the vma after taking it again in write mode.
         */
        mmap_read_unlock(mm);
-       new_page = khugepaged_alloc_page(hpage, gfp, node);
-       if (!new_page) {
-               result = SCAN_ALLOC_HUGE_PAGE_FAIL;
-               goto out_nolock;
-       }
 
-       if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
-               result = SCAN_CGROUP_CHARGE_FAIL;
+       result = alloc_charge_hpage(&hpage, mm, cc);
+       if (result != SCAN_SUCCEED)
                goto out_nolock;
-       }
-       count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
 
        mmap_read_lock(mm);
-       result = hugepage_vma_revalidate(mm, address, &vma);
-       if (result) {
+       result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+       if (result != SCAN_SUCCEED) {
                mmap_read_unlock(mm);
                goto out_nolock;
        }
 
-       pmd = mm_find_pmd(mm, address);
-       if (!pmd) {
-               result = SCAN_PMD_NULL;
+       result = find_pmd_or_thp_or_none(mm, address, &pmd);
+       if (result != SCAN_SUCCEED) {
                mmap_read_unlock(mm);
                goto out_nolock;
        }
 
-       /*
-        * __collapse_huge_page_swapin will return with mmap_lock released
-        * when it fails. So we jump out_nolock directly in that case.
-        * Continuing to collapse causes inconsistency.
-        */
-       if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
-                                                    pmd, referenced)) {
-               goto out_nolock;
+       if (unmapped) {
+               /*
+                * __collapse_huge_page_swapin will return with mmap_lock
+                * released when it fails. So we jump out_nolock directly in
+                * that case.  Continuing to collapse causes inconsistency.
+                */
+               result = __collapse_huge_page_swapin(mm, vma, address, pmd,
+                                                    referenced);
+               if (result != SCAN_SUCCEED)
+                       goto out_nolock;
        }
 
        mmap_read_unlock(mm);
@@ -1065,11 +1028,12 @@ static void collapse_huge_page(struct mm_struct *mm,
         * handled by the anon_vma lock + PG_lock.
         */
        mmap_write_lock(mm);
-       result = hugepage_vma_revalidate(mm, address, &vma);
-       if (result)
+       result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+       if (result != SCAN_SUCCEED)
                goto out_up_write;
        /* check if the pmd is still valid */
-       if (mm_find_pmd(mm, address) != pmd)
+       result = check_pmd_still_valid(mm, address, pmd);
+       if (result != SCAN_SUCCEED)
                goto out_up_write;
 
        anon_vma_lock_write(vma->anon_vma);
@@ -1095,11 +1059,11 @@ static void collapse_huge_page(struct mm_struct *mm,
        mmu_notifier_invalidate_range_end(&range);
 
        spin_lock(pte_ptl);
-       isolated = __collapse_huge_page_isolate(vma, address, pte,
-                       &compound_pagelist);
+       result =  __collapse_huge_page_isolate(vma, address, pte, cc,
+                                              &compound_pagelist);
        spin_unlock(pte_ptl);
 
-       if (unlikely(!isolated)) {
+       if (unlikely(result != SCAN_SUCCEED)) {
                pte_unmap(pte);
                spin_lock(pmd_ptl);
                BUG_ON(!pmd_none(*pmd));
@@ -1111,7 +1075,6 @@ static void collapse_huge_page(struct mm_struct *mm,
                pmd_populate(mm, pmd, pmd_pgtable(_pmd));
                spin_unlock(pmd_ptl);
                anon_vma_unlock_write(vma->anon_vma);
-               result = SCAN_FAIL;
                goto out_up_write;
        }
 
@@ -1121,8 +1084,8 @@ static void collapse_huge_page(struct mm_struct *mm,
         */
        anon_vma_unlock_write(vma->anon_vma);
 
-       __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
-                       &compound_pagelist);
+       __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl,
+                                 &compound_pagelist);
        pte_unmap(pte);
        /*
         * spin_lock() below is not the equivalent of smp_wmb(), but
@@ -1130,42 +1093,43 @@ static void collapse_huge_page(struct mm_struct *mm,
         * avoid the copy_huge_page writes to become visible after
         * the set_pmd_at() write.
         */
-       __SetPageUptodate(new_page);
+       __SetPageUptodate(hpage);
        pgtable = pmd_pgtable(_pmd);
 
-       _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+       _pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
        _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
 
        spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
-       page_add_new_anon_rmap(new_page, vma, address);
-       lru_cache_add_inactive_or_unevictable(new_page, vma);
+       page_add_new_anon_rmap(hpage, vma, address);
+       lru_cache_add_inactive_or_unevictable(hpage, vma);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        update_mmu_cache_pmd(vma, address, pmd);
        spin_unlock(pmd_ptl);
 
-       *hpage = NULL;
+       hpage = NULL;
 
-       khugepaged_pages_collapsed++;
        result = SCAN_SUCCEED;
 out_up_write:
        mmap_write_unlock(mm);
 out_nolock:
-       if (!IS_ERR_OR_NULL(*hpage))
-               mem_cgroup_uncharge(page_folio(*hpage));
-       trace_mm_collapse_huge_page(mm, isolated, result);
-       return;
+       if (hpage) {
+               mem_cgroup_uncharge(page_folio(hpage));
+               put_page(hpage);
+       }
+       trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
+       return result;
 }
 
-static int khugepaged_scan_pmd(struct mm_struct *mm,
-                              struct vm_area_struct *vma,
-                              unsigned long address,
-                              struct page **hpage)
+static int hpage_collapse_scan_pmd(struct mm_struct *mm,
+                                  struct vm_area_struct *vma,
+                                  unsigned long address, bool *mmap_locked,
+                                  struct collapse_control *cc)
 {
        pmd_t *pmd;
        pte_t *pte, *_pte;
-       int ret = 0, result = 0, referenced = 0;
+       int result = SCAN_FAIL, referenced = 0;
        int none_or_zero = 0, shared = 0;
        struct page *page = NULL;
        unsigned long _address;
@@ -1175,19 +1139,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
-       pmd = mm_find_pmd(mm, address);
-       if (!pmd) {
-               result = SCAN_PMD_NULL;
+       result = find_pmd_or_thp_or_none(mm, address, &pmd);
+       if (result != SCAN_SUCCEED)
                goto out;
-       }
 
-       memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+       memset(cc->node_load, 0, sizeof(cc->node_load));
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (is_swap_pte(pteval)) {
-                       if (++unmapped <= khugepaged_max_ptes_swap) {
+                       ++unmapped;
+                       if (!cc->is_khugepaged ||
+                           unmapped <= khugepaged_max_ptes_swap) {
                                /*
                                 * Always be strict with uffd-wp
                                 * enabled swap entries.  Please see
@@ -1205,8 +1169,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        }
                }
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+                       ++none_or_zero;
                        if (!userfaultfd_armed(vma) &&
-                           ++none_or_zero <= khugepaged_max_ptes_none) {
+                           (!cc->is_khugepaged ||
+                            none_or_zero <= khugepaged_max_ptes_none)) {
                                continue;
                        } else {
                                result = SCAN_EXCEED_NONE_PTE;
@@ -1236,27 +1202,30 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        goto out_unmap;
                }
 
-               if (page_mapcount(page) > 1 &&
-                               ++shared > khugepaged_max_ptes_shared) {
-                       result = SCAN_EXCEED_SHARED_PTE;
-                       count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
-                       goto out_unmap;
+               if (page_mapcount(page) > 1) {
+                       ++shared;
+                       if (cc->is_khugepaged &&
+                           shared > khugepaged_max_ptes_shared) {
+                               result = SCAN_EXCEED_SHARED_PTE;
+                               count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+                               goto out_unmap;
+                       }
                }
 
                page = compound_head(page);
 
                /*
                 * Record which node the original page is from and save this
-                * information to khugepaged_node_load[].
+                * information to cc->node_load[].
                * Khugepaged will allocate the hugepage from the node that
                * has the max hit record.
                 */
                node = page_to_nid(page);
-               if (khugepaged_scan_abort(node)) {
+               if (hpage_collapse_scan_abort(node, cc)) {
                        result = SCAN_SCAN_ABORT;
                        goto out_unmap;
                }
-               khugepaged_node_load[node]++;
+               cc->node_load[node]++;
                if (!PageLRU(page)) {
                        result = SCAN_PAGE_LRU;
                        goto out_unmap;
@@ -1291,43 +1260,51 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        result = SCAN_PAGE_COUNT;
                        goto out_unmap;
                }
-               if (pte_young(pteval) ||
-                   page_is_young(page) || PageReferenced(page) ||
-                   mmu_notifier_test_young(vma->vm_mm, address))
+
+               /*
+                * If collapse was initiated by khugepaged, check that there is
+                * enough young pte to justify collapsing the page
+                */
+               if (cc->is_khugepaged &&
+                   (pte_young(pteval) || page_is_young(page) ||
+                    PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+                                                                    address)))
                        referenced++;
        }
        if (!writable) {
                result = SCAN_PAGE_RO;
-       } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
+       } else if (cc->is_khugepaged &&
+                  (!referenced ||
+                   (unmapped && referenced < HPAGE_PMD_NR / 2))) {
                result = SCAN_LACK_REFERENCED_PAGE;
        } else {
                result = SCAN_SUCCEED;
-               ret = 1;
        }
 out_unmap:
        pte_unmap_unlock(pte, ptl);
-       if (ret) {
-               node = khugepaged_find_target_node();
+       if (result == SCAN_SUCCEED) {
+               result = collapse_huge_page(mm, address, referenced,
+                                           unmapped, cc);
                /* collapse_huge_page will return with the mmap_lock released */
-               collapse_huge_page(mm, address, hpage, node,
-                               referenced, unmapped);
+               *mmap_locked = false;
        }
 out:
        trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
                                     none_or_zero, result, unmapped);
-       return ret;
+       return result;
 }
 
-static void collect_mm_slot(struct mm_slot *mm_slot)
+static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
 {
-       struct mm_struct *mm = mm_slot->mm;
+       struct mm_slot *slot = &mm_slot->slot;
+       struct mm_struct *mm = slot->mm;
 
        lockdep_assert_held(&khugepaged_mm_lock);
 
-       if (khugepaged_test_exit(mm)) {
+       if (hpage_collapse_test_exit(mm)) {
                /* free mm_slot */
-               hash_del(&mm_slot->hash);
-               list_del(&mm_slot->mm_node);
+               hash_del(&slot->hash);
+               list_del(&slot->mm_node);
 
                /*
                 * Not strictly needed because the mm exited already.
@@ -1336,7 +1313,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
                 */
 
                /* khugepaged_mm_lock actually not necessary for the below */
-               free_mm_slot(mm_slot);
+               mm_slot_free(mm_slot_cache, mm_slot);
                mmdrop(mm);
        }
 }
@@ -1345,19 +1322,66 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
 /*
  * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
  * khugepaged should try to collapse the page table.
+ *
+ * Note that the following race exists:
+ * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A,
+ *     emptying A's ->pte_mapped_thp[] array.
+ * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and
+ *     retract_page_tables() finds a VMA in mm_struct A mapping the same extent
+ *     (at virtual address X) and adds an entry (for X) into mm_struct A's
+ *     ->pte_mapped_thp[] array.
+ * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X,
+ *     sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry
+ *     (for X) into mm_struct A's ->pte_mapped_thp[] array.
+ * Thus, it's possible the same address is added multiple times for the same
+ * mm_struct.  Should this happen, we'll simply attempt
+ * collapse_pte_mapped_thp() multiple times for the same address, under the same
+ * exclusive mmap_lock, and assuming the first call is successful, subsequent
+ * attempts will return quickly (without grabbing any additional locks) when
+ * a huge pmd is found in find_pmd_or_thp_or_none().  Since this is a cheap
+ * check, and since this is a rare occurrence, preventing this "multiple-add"
+ * is thought to be more expensive than just handling it, should it occur.
  */
-static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
                                          unsigned long addr)
 {
-       struct mm_slot *mm_slot;
+       struct khugepaged_mm_slot *mm_slot;
+       struct mm_slot *slot;
+       bool ret = false;
 
        VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
 
        spin_lock(&khugepaged_mm_lock);
-       mm_slot = get_mm_slot(mm);
-       if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
+       slot = mm_slot_lookup(mm_slots_hash, mm);
+       mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
+       if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) {
                mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
+               ret = true;
+       }
        spin_unlock(&khugepaged_mm_lock);
+       return ret;
+}
+
+/* hpage must be locked, and mmap_lock must be held in write */
+static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
+                       pmd_t *pmdp, struct page *hpage)
+{
+       struct vm_fault vmf = {
+               .vma = vma,
+               .address = addr,
+               .flags = 0,
+               .pmd = pmdp,
+       };
+
+       VM_BUG_ON(!PageTransHuge(hpage));
+       mmap_assert_write_locked(vma->vm_mm);
+
+       if (do_set_pmd(&vmf, hpage))
+               return SCAN_FAIL;
+
+       get_page(hpage);
+       return SCAN_SUCCEED;
 }
 
 static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
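
The race note above, the bool return of khugepaged_add_pte_mapped_thp(), and set_huge_pmd() all exist to serve the madvise(MADV_COLLAPSE) path added by this series. A hedged userspace sketch of exercising that path on an anonymous region (the fallback MADV_COLLAPSE define and the 2 MiB region size are assumptions for x86-64, not taken from this patch):

/* Userspace sketch, not part of this patch: request a synchronous collapse
 * of one PMD-sized, PMD-aligned anonymous region. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* uapi value; absent from older headers */
#endif

#define LEN (2UL << 20)		/* one PMD on x86-64 */

int main(void)
{
	void *p = aligned_alloc(LEN, LEN);

	if (!p)
		return 1;
	memset(p, 1, LEN);	/* fault the small pages in first */

	if (madvise(p, LEN, MADV_COLLAPSE))
		perror("madvise(MADV_COLLAPSE)");
	else
		puts("collapsed; see AnonHugePages in /proc/self/smaps");
	free(p);
	return 0;
}

For file- and shmem-backed mappings on the same path, set_huge_pmd() above is what installs the PMD mapping right away instead of leaving it to a later refault, which is also why collapse_pte_mapped_thp() grows an install_pmd argument in the next hunk.
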
@@ -1381,52 +1405,80 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
  *
  * @mm: process address space where collapse happens
  * @addr: THP collapse address
+ * @install_pmd: If a huge PMD should be installed
  *
  * This function checks whether all the PTEs in the PMD are pointing to the
  * right THP. If so, retract the page table so the THP can refault in with
- * as pmd-mapped.
+ * as pmd-mapped. Possibly install a huge PMD mapping the THP.
  */
-void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
+                           bool install_pmd)
 {
        unsigned long haddr = addr & HPAGE_PMD_MASK;
-       struct vm_area_struct *vma = find_vma(mm, haddr);
+       struct vm_area_struct *vma = vma_lookup(mm, haddr);
        struct page *hpage;
        pte_t *start_pte, *pte;
        pmd_t *pmd;
        spinlock_t *ptl;
-       int count = 0;
+       int count = 0, result = SCAN_FAIL;
        int i;
 
+       mmap_assert_write_locked(mm);
+
+       /* Fast check before locking page if already PMD-mapped */
+       result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
+       if (result == SCAN_PMD_MAPPED)
+               return result;
+
        if (!vma || !vma->vm_file ||
            !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
-               return;
+               return SCAN_VMA_CHECK;
 
        /*
-        * This vm_flags may not have VM_HUGEPAGE if the page was not
-        * collapsed by this mm. But we can still collapse if the page is
-        * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
-        * will not fail the vma for missing VM_HUGEPAGE
+        * If we are here, we've succeeded in replacing all the native pages
+        * in the page cache with a single hugepage. If a mm were to fault-in
+        * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
+        * and map it by a PMD, regardless of sysfs THP settings. As such, let's
+        * analogously elide sysfs THP settings here.
         */
-       if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
-               return;
+       if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+               return SCAN_VMA_CHECK;
 
        /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
        if (userfaultfd_wp(vma))
-               return;
+               return SCAN_PTE_UFFD_WP;
 
        hpage = find_lock_page(vma->vm_file->f_mapping,
                               linear_page_index(vma, haddr));
        if (!hpage)
-               return;
+               return SCAN_PAGE_NULL;
+
+       if (!PageHead(hpage)) {
+               result = SCAN_FAIL;
+               goto drop_hpage;
+       }
 
-       if (!PageHead(hpage))
+       if (compound_order(hpage) != HPAGE_PMD_ORDER) {
+               result = SCAN_PAGE_COMPOUND;
                goto drop_hpage;
+       }
 
-       pmd = mm_find_pmd(mm, haddr);
-       if (!pmd)
+       switch (result) {
+       case SCAN_SUCCEED:
+               break;
+       case SCAN_PMD_NONE:
+               /*
+                * In MADV_COLLAPSE path, possible race with khugepaged where
+                * all pte entries have been removed and pmd cleared.  If so,
+                * skip all the pte checks and just update the pmd mapping.
+                */
+               goto maybe_install_pmd;
+       default:
                goto drop_hpage;
+       }
 
        start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+       result = SCAN_FAIL;
 
        /* step 1: check all mapped PTEs are to the right huge page */
        for (i = 0, addr = haddr, pte = start_pte;
@@ -1438,8 +1490,10 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
                        continue;
 
                /* page swapped out, abort */
-               if (!pte_present(*pte))
+               if (!pte_present(*pte)) {
+                       result = SCAN_PTE_NON_PRESENT;
                        goto abort;
+               }
 
                page = vm_normal_page(vma, addr, *pte);
                if (WARN_ON_ONCE(page && is_zone_device_page(page)))
@@ -1474,21 +1528,29 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
                add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
        }
 
-       /* step 4: collapse pmd */
+       /* step 4: remove pte entries */
        collapse_and_free_pmd(mm, vma, haddr, pmd);
+
+maybe_install_pmd:
+       /* step 5: install pmd entry */
+       result = install_pmd
+                       ? set_huge_pmd(vma, haddr, pmd, hpage)
+                       : SCAN_SUCCEED;
+
 drop_hpage:
        unlock_page(hpage);
        put_page(hpage);
-       return;
+       return result;
 
 abort:
        pte_unmap_unlock(start_pte, ptl);
        goto drop_hpage;
 }
 
-static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
 {
-       struct mm_struct *mm = mm_slot->mm;
+       struct mm_slot *slot = &mm_slot->slot;
+       struct mm_struct *mm = slot->mm;
        int i;
 
        if (likely(mm_slot->nr_pte_mapped_thp == 0))
@@ -1497,26 +1559,33 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
        if (!mmap_write_trylock(mm))
                return;
 
-       if (unlikely(khugepaged_test_exit(mm)))
+       if (unlikely(hpage_collapse_test_exit(mm)))
                goto out;
 
        for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
-               collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
+               collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false);
 
 out:
        mm_slot->nr_pte_mapped_thp = 0;
        mmap_write_unlock(mm);
 }
 
-static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
+                              struct mm_struct *target_mm,
+                              unsigned long target_addr, struct page *hpage,
+                              struct collapse_control *cc)
 {
        struct vm_area_struct *vma;
-       struct mm_struct *mm;
-       unsigned long addr;
-       pmd_t *pmd;
+       int target_result = SCAN_FAIL;
 
        i_mmap_lock_write(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+               int result = SCAN_FAIL;
+               struct mm_struct *mm = NULL;
+               unsigned long addr = 0;
+               pmd_t *pmd;
+               bool is_target = false;
+
                /*
                 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
                 * got written to. These VMAs are likely not worth investing
@@ -1533,25 +1602,34 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                 * ptl. It has higher chance to recover THP for the VMA, but
                 * has higher cost too.
                 */
-               if (vma->anon_vma)
-                       continue;
+               if (vma->anon_vma) {
+                       result = SCAN_PAGE_ANON;
+                       goto next;
+               }
                addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-               if (addr & ~HPAGE_PMD_MASK)
-                       continue;
-               if (vma->vm_end < addr + HPAGE_PMD_SIZE)
-                       continue;
+               if (addr & ~HPAGE_PMD_MASK ||
+                   vma->vm_end < addr + HPAGE_PMD_SIZE) {
+                       result = SCAN_VMA_CHECK;
+                       goto next;
+               }
                mm = vma->vm_mm;
-               pmd = mm_find_pmd(mm, addr);
-               if (!pmd)
-                       continue;
+               is_target = mm == target_mm && addr == target_addr;
+               result = find_pmd_or_thp_or_none(mm, addr, &pmd);
+               if (result != SCAN_SUCCEED)
+                       goto next;
                /*
                 * We need exclusive mmap_lock to retract page table.
                 *
                 * We use trylock due to lock inversion: we need to acquire
                 * mmap_lock while holding page lock. Fault path does it in
                 * reverse order. Trylock is a way to avoid deadlock.
+                *
+                * Also, it's not MADV_COLLAPSE's job to collapse other
+                * mappings - let khugepaged take care of them later.
                 */
-               if (mmap_write_trylock(mm)) {
+               result = SCAN_PTE_MAPPED_HUGEPAGE;
+               if ((cc->is_khugepaged || is_target) &&
+                   mmap_write_trylock(mm)) {
                        /*
                         * When a vma is registered with uffd-wp, we can't
                         * recycle the pmd pgtable because there can be pte
@@ -1560,25 +1638,48 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                         * it'll always mapped in small page size for uffd-wp
                         * registered ranges.
                         */
-                       if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma))
-                               collapse_and_free_pmd(mm, vma, addr, pmd);
+                       if (hpage_collapse_test_exit(mm)) {
+                               result = SCAN_ANY_PROCESS;
+                               goto unlock_next;
+                       }
+                       if (userfaultfd_wp(vma)) {
+                               result = SCAN_PTE_UFFD_WP;
+                               goto unlock_next;
+                       }
+                       collapse_and_free_pmd(mm, vma, addr, pmd);
+                       if (!cc->is_khugepaged && is_target)
+                               result = set_huge_pmd(vma, addr, pmd, hpage);
+                       else
+                               result = SCAN_SUCCEED;
+
+unlock_next:
                        mmap_write_unlock(mm);
-               } else {
-                       /* Try again later */
+                       goto next;
+               }
+               /*
+                * Calling context will handle target mm/addr. Otherwise, let
+                * khugepaged try again later.
+                */
+               if (!is_target) {
                        khugepaged_add_pte_mapped_thp(mm, addr);
+                       continue;
                }
+next:
+               if (is_target)
+                       target_result = result;
        }
        i_mmap_unlock_write(mapping);
+       return target_result;
 }
 
 /**
  * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
  *
  * @mm: process address space where collapse happens
+ * @addr: virtual collapse start address
  * @file: file that collapse on
  * @start: collapse start address
- * @hpage: new allocated huge page for collapse
- * @node: appointed node the new huge page allocate from
+ * @cc: collapse context and scratchpad
  *
  * Basic scheme is simple, details are more complex:
  *  - allocate and lock a new huge page;
@@ -1595,13 +1696,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
  *    + restore gaps in the page cache;
  *    + unlock and free huge page;
  */
-static void collapse_file(struct mm_struct *mm,
-               struct file *file, pgoff_t start,
-               struct page **hpage, int node)
+static int collapse_file(struct mm_struct *mm, unsigned long addr,
+                        struct file *file, pgoff_t start,
+                        struct collapse_control *cc)
 {
        struct address_space *mapping = file->f_mapping;
-       gfp_t gfp;
-       struct page *new_page;
+       struct page *hpage;
        pgoff_t index, end = start + HPAGE_PMD_NR;
        LIST_HEAD(pagelist);
        XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
@@ -1612,20 +1712,9 @@ static void collapse_file(struct mm_struct *mm,
        VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
        VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
 
-       /* Only allocate from the target node */
-       gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
-
-       new_page = khugepaged_alloc_page(hpage, gfp, node);
-       if (!new_page) {
-               result = SCAN_ALLOC_HUGE_PAGE_FAIL;
-               goto out;
-       }
-
-       if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
-               result = SCAN_CGROUP_CHARGE_FAIL;
+       result = alloc_charge_hpage(&hpage, mm, cc);
+       if (result != SCAN_SUCCEED)
                goto out;
-       }
-       count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
 
        /*
         * Ensure we have slots for all the pages in the range.  This is
@@ -1643,14 +1732,14 @@ static void collapse_file(struct mm_struct *mm,
                }
        } while (1);
 
-       __SetPageLocked(new_page);
+       __SetPageLocked(hpage);
        if (is_shmem)
-               __SetPageSwapBacked(new_page);
-       new_page->index = start;
-       new_page->mapping = mapping;
+               __SetPageSwapBacked(hpage);
+       hpage->index = start;
+       hpage->mapping = mapping;
 
        /*
-        * At this point the new_page is locked and not up-to-date.
+        * At this point the hpage is locked and not up-to-date.
         * It's safe to insert it into the page cache, because nobody would
         * be able to map it or use it in another way until we unlock it.
         */
@@ -1678,19 +1767,22 @@ static void collapse_file(struct mm_struct *mm,
                                        result = SCAN_FAIL;
                                        goto xa_locked;
                                }
-                               xas_store(&xas, new_page);
+                               xas_store(&xas, hpage);
                                nr_none++;
                                continue;
                        }
 
                        if (xa_is_value(page) || !PageUptodate(page)) {
+                               struct folio *folio;
+
                                xas_unlock_irq(&xas);
                                /* swap in or instantiate fallocated page */
-                               if (shmem_getpage(mapping->host, index, &page,
-                                                 SGP_NOALLOC)) {
+                               if (shmem_get_folio(mapping->host, index,
+                                               &folio, SGP_NOALLOC)) {
                                        result = SCAN_FAIL;
                                        goto xa_unlocked;
                                }
+                               page = folio_file_page(folio, index);
                        } else if (trylock_page(page)) {
                                get_page(page);
                                xas_unlock_irq(&xas);
@@ -1757,9 +1849,16 @@ static void collapse_file(struct mm_struct *mm,
                /*
                 * If file was truncated then extended, or hole-punched, before
                 * we locked the first page, then a THP might be there already.
+                * This will be discovered on the first iteration.
                 */
                if (PageTransCompound(page)) {
-                       result = SCAN_PAGE_COMPOUND;
+                       struct page *head = compound_head(page);
+
+                       result = compound_order(head) == HPAGE_PMD_ORDER &&
+                                       head->index == start
+                                       /* Maybe PMD-mapped */
+                                       ? SCAN_PTE_MAPPED_HUGEPAGE
+                                       : SCAN_PAGE_COMPOUND;
                        goto out_unlock;
                }
 
@@ -1820,19 +1919,19 @@ static void collapse_file(struct mm_struct *mm,
                list_add_tail(&page->lru, &pagelist);
 
                /* Finally, replace with the new page. */
-               xas_store(&xas, new_page);
+               xas_store(&xas, hpage);
                continue;
 out_unlock:
                unlock_page(page);
                put_page(page);
                goto xa_unlocked;
        }
-       nr = thp_nr_pages(new_page);
+       nr = thp_nr_pages(hpage);
 
        if (is_shmem)
-               __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
+               __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
        else {
-               __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
+               __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
                filemap_nr_thps_inc(mapping);
                /*
                 * Paired with smp_mb() in do_dentry_open() to ensure
@@ -1843,21 +1942,21 @@ out_unlock:
                smp_mb();
                if (inode_is_open_for_write(mapping->host)) {
                        result = SCAN_FAIL;
-                       __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
+                       __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
                        filemap_nr_thps_dec(mapping);
                        goto xa_locked;
                }
        }
 
        if (nr_none) {
-               __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
+               __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
                /* nr_none is always 0 for non-shmem. */
-               __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
+               __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
        }
 
        /* Join all the small entries into a single multi-index entry */
        xas_set_order(&xas, start, HPAGE_PMD_ORDER);
-       xas_store(&xas, new_page);
+       xas_store(&xas, hpage);
 xa_locked:
        xas_unlock_irq(&xas);
 xa_unlocked:
@@ -1879,11 +1978,11 @@ xa_unlocked:
                index = start;
                list_for_each_entry_safe(page, tmp, &pagelist, lru) {
                        while (index < page->index) {
-                               clear_highpage(new_page + (index % HPAGE_PMD_NR));
+                               clear_highpage(hpage + (index % HPAGE_PMD_NR));
                                index++;
                        }
-                       copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
-                                       page);
+                       copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
+                                     page);
                        list_del(&page->lru);
                        page->mapping = NULL;
                        page_ref_unfreeze(page, 1);
@@ -1894,23 +1993,23 @@ xa_unlocked:
                        index++;
                }
                while (index < end) {
-                       clear_highpage(new_page + (index % HPAGE_PMD_NR));
+                       clear_highpage(hpage + (index % HPAGE_PMD_NR));
                        index++;
                }
 
-               SetPageUptodate(new_page);
-               page_ref_add(new_page, HPAGE_PMD_NR - 1);
+               SetPageUptodate(hpage);
+               page_ref_add(hpage, HPAGE_PMD_NR - 1);
                if (is_shmem)
-                       set_page_dirty(new_page);
-               lru_cache_add(new_page);
+                       set_page_dirty(hpage);
+               lru_cache_add(hpage);
 
                /*
                 * Remove pte page tables, so we can re-fault the page as huge.
                 */
-               retract_page_tables(mapping, start);
-               *hpage = NULL;
-
-               khugepaged_pages_collapsed++;
+               result = retract_page_tables(mapping, start, mm, addr, hpage,
+                                            cc);
+               unlock_page(hpage);
+               hpage = NULL;
        } else {
                struct page *page;
 
@@ -1949,19 +2048,24 @@ xa_unlocked:
                VM_BUG_ON(nr_none);
                xas_unlock_irq(&xas);
 
-               new_page->mapping = NULL;
+               hpage->mapping = NULL;
        }
 
-       unlock_page(new_page);
+       if (hpage)
+               unlock_page(hpage);
 out:
        VM_BUG_ON(!list_empty(&pagelist));
-       if (!IS_ERR_OR_NULL(*hpage))
-               mem_cgroup_uncharge(page_folio(*hpage));
+       if (hpage) {
+               mem_cgroup_uncharge(page_folio(hpage));
+               put_page(hpage);
+       }
        /* TODO: tracepoints */
+       return result;
 }
 
-static void khugepaged_scan_file(struct mm_struct *mm,
-               struct file *file, pgoff_t start, struct page **hpage)
+static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
+                                   struct file *file, pgoff_t start,
+                                   struct collapse_control *cc)
 {
        struct page *page = NULL;
        struct address_space *mapping = file->f_mapping;
@@ -1972,14 +2076,16 @@ static void khugepaged_scan_file(struct mm_struct *mm,
 
        present = 0;
        swap = 0;
-       memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+       memset(cc->node_load, 0, sizeof(cc->node_load));
        rcu_read_lock();
        xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
                if (xas_retry(&xas, page))
                        continue;
 
                if (xa_is_value(page)) {
-                       if (++swap > khugepaged_max_ptes_swap) {
+                       ++swap;
+                       if (cc->is_khugepaged &&
+                           swap > khugepaged_max_ptes_swap) {
                                result = SCAN_EXCEED_SWAP_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
                                break;
@@ -1988,20 +2094,32 @@ static void khugepaged_scan_file(struct mm_struct *mm,
                }
 
                /*
-                * XXX: khugepaged should compact smaller compound pages
+                * TODO: khugepaged should compact smaller compound pages
                 * into a PMD sized page
                 */
                if (PageTransCompound(page)) {
-                       result = SCAN_PAGE_COMPOUND;
+                       struct page *head = compound_head(page);
+
+                       result = compound_order(head) == HPAGE_PMD_ORDER &&
+                                       head->index == start
+                                       /* Maybe PMD-mapped */
+                                       ? SCAN_PTE_MAPPED_HUGEPAGE
+                                       : SCAN_PAGE_COMPOUND;
+                       /*
+                        * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
+                        * by the caller won't touch the page cache, and so
+                        * it's safe to skip LRU and refcount checks before
+                        * returning.
+                        */
                        break;
                }
 
                node = page_to_nid(page);
-               if (khugepaged_scan_abort(node)) {
+               if (hpage_collapse_scan_abort(node, cc)) {
                        result = SCAN_SCAN_ABORT;
                        break;
                }
-               khugepaged_node_load[node]++;
+               cc->node_load[node]++;
 
                if (!PageLRU(page)) {
                        result = SCAN_PAGE_LRU;
@@ -2030,54 +2148,68 @@ static void khugepaged_scan_file(struct mm_struct *mm,
        rcu_read_unlock();
 
        if (result == SCAN_SUCCEED) {
-               if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+               if (cc->is_khugepaged &&
+                   present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
                        result = SCAN_EXCEED_NONE_PTE;
                        count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
                } else {
-                       node = khugepaged_find_target_node();
-                       collapse_file(mm, file, start, hpage, node);
+                       result = collapse_file(mm, addr, file, start, cc);
                }
        }
 
-       /* TODO: tracepoints */
+       trace_mm_khugepaged_scan_file(mm, page, file->f_path.dentry->d_iname,
+                                     present, swap, result);
+       return result;
 }
 #else
-static void khugepaged_scan_file(struct mm_struct *mm,
-               struct file *file, pgoff_t start, struct page **hpage)
+static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
+                                   struct file *file, pgoff_t start,
+                                   struct collapse_control *cc)
 {
        BUILD_BUG();
 }
 
-static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
 {
 }
+
+static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+                                         unsigned long addr)
+{
+       return false;
+}
 #endif
 
-static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
-                                           struct page **hpage)
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
+                                           struct collapse_control *cc)
        __releases(&khugepaged_mm_lock)
        __acquires(&khugepaged_mm_lock)
 {
-       struct mm_slot *mm_slot;
+       struct vma_iterator vmi;
+       struct khugepaged_mm_slot *mm_slot;
+       struct mm_slot *slot;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        int progress = 0;
 
        VM_BUG_ON(!pages);
        lockdep_assert_held(&khugepaged_mm_lock);
+       *result = SCAN_FAIL;
 
-       if (khugepaged_scan.mm_slot)
+       if (khugepaged_scan.mm_slot) {
                mm_slot = khugepaged_scan.mm_slot;
-       else {
-               mm_slot = list_entry(khugepaged_scan.mm_head.next,
+               slot = &mm_slot->slot;
+       } else {
+               slot = list_entry(khugepaged_scan.mm_head.next,
                                     struct mm_slot, mm_node);
+               mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
                khugepaged_scan.address = 0;
                khugepaged_scan.mm_slot = mm_slot;
        }
        spin_unlock(&khugepaged_mm_lock);
        khugepaged_collapse_pte_mapped_thps(mm_slot);
 
-       mm = mm_slot->mm;
+       mm = slot->mm;
        /*
         * Don't wait for semaphore (to avoid long wait times).  Just move to
         * the next mm on the list.
@@ -2085,19 +2217,21 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
        vma = NULL;
        if (unlikely(!mmap_read_trylock(mm)))
                goto breakouterloop_mmap_lock;
-       if (likely(!khugepaged_test_exit(mm)))
-               vma = find_vma(mm, khugepaged_scan.address);
 
        progress++;
-       for (; vma; vma = vma->vm_next) {
+       if (unlikely(hpage_collapse_test_exit(mm)))
+               goto breakouterloop;
+
+       vma_iter_init(&vmi, mm, khugepaged_scan.address);
+       for_each_vma(vmi, vma) {
                unsigned long hstart, hend;
 
                cond_resched();
-               if (unlikely(khugepaged_test_exit(mm))) {
+               if (unlikely(hpage_collapse_test_exit(mm))) {
                        progress++;
                        break;
                }
-               if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) {
+               if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
 skip:
                        progress++;
                        continue;
@@ -2111,9 +2245,10 @@ skip:
                VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
 
                while (khugepaged_scan.address < hend) {
-                       int ret;
+                       bool mmap_locked = true;
+
                        cond_resched();
-                       if (unlikely(khugepaged_test_exit(mm)))
+                       if (unlikely(hpage_collapse_test_exit(mm)))
                                goto breakouterloop;
 
                        VM_BUG_ON(khugepaged_scan.address < hstart ||
@@ -2125,19 +2260,48 @@ skip:
                                                khugepaged_scan.address);
 
                                mmap_read_unlock(mm);
-                               ret = 1;
-                               khugepaged_scan_file(mm, file, pgoff, hpage);
+                               *result = hpage_collapse_scan_file(mm,
+                                                                  khugepaged_scan.address,
+                                                                  file, pgoff, cc);
+                               mmap_locked = false;
                                fput(file);
                        } else {
-                               ret = khugepaged_scan_pmd(mm, vma,
-                                               khugepaged_scan.address,
-                                               hpage);
+                               *result = hpage_collapse_scan_pmd(mm, vma,
+                                                                 khugepaged_scan.address,
+                                                                 &mmap_locked,
+                                                                 cc);
+                       }
+                       switch (*result) {
+                       case SCAN_PTE_MAPPED_HUGEPAGE: {
+                               pmd_t *pmd;
+
+                               *result = find_pmd_or_thp_or_none(mm,
+                                                                 khugepaged_scan.address,
+                                                                 &pmd);
+                               if (*result != SCAN_SUCCEED)
+                                       break;
+                               if (!khugepaged_add_pte_mapped_thp(mm,
+                                                                  khugepaged_scan.address))
+                                       break;
+                       } fallthrough;
+                       case SCAN_SUCCEED:
+                               ++khugepaged_pages_collapsed;
+                               break;
+                       default:
+                               break;
                        }
+
                        /* move to next address */
                        khugepaged_scan.address += HPAGE_PMD_SIZE;
                        progress += HPAGE_PMD_NR;
-                       if (ret)
-                               /* we released mmap_lock so break loop */
+                       if (!mmap_locked)
+                               /*
+                                * We released mmap_lock, so break the loop.  Note
+                                * that we drop mmap_lock before all hugepage
+                                * allocations, so if allocation fails, we are
+                                * guaranteed to break here and report the
+                                * correct result back to the caller.
+                                */
                                goto breakouterloop_mmap_lock;
                        if (progress >= pages)
                                goto breakouterloop;
@@ -2153,16 +2317,17 @@ breakouterloop_mmap_lock:
         * Release the current mm_slot if this mm is about to die, or
         * if we scanned all vmas of this mm.
         */
-       if (khugepaged_test_exit(mm) || !vma) {
+       if (hpage_collapse_test_exit(mm) || !vma) {
                /*
                 * Make sure that if mm_users is reaching zero while
                 * khugepaged runs here, khugepaged_exit will find
                 * mm_slot not pointing to the exiting mm.
                 */
-               if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
-                       khugepaged_scan.mm_slot = list_entry(
-                               mm_slot->mm_node.next,
-                               struct mm_slot, mm_node);
+               if (slot->mm_node.next != &khugepaged_scan.mm_head) {
+                       slot = list_entry(slot->mm_node.next,
+                                         struct mm_slot, mm_node);
+                       khugepaged_scan.mm_slot =
+                               mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
                        khugepaged_scan.address = 0;
                } else {
                        khugepaged_scan.mm_slot = NULL;
@@ -2187,19 +2352,16 @@ static int khugepaged_wait_event(void)
                kthread_should_stop();
 }
 
-static void khugepaged_do_scan(void)
+static void khugepaged_do_scan(struct collapse_control *cc)
 {
-       struct page *hpage = NULL;
        unsigned int progress = 0, pass_through_head = 0;
        unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
        bool wait = true;
+       int result = SCAN_SUCCEED;
 
        lru_add_drain_all();
 
-       while (progress < pages) {
-               if (!khugepaged_prealloc_page(&hpage, &wait))
-                       break;
-
+       while (true) {
                cond_resched();
 
                if (unlikely(kthread_should_stop() || try_to_freeze()))
@@ -2211,14 +2373,25 @@ static void khugepaged_do_scan(void)
                if (khugepaged_has_work() &&
                    pass_through_head < 2)
                        progress += khugepaged_scan_mm_slot(pages - progress,
-                                                           &hpage);
+                                                           &result, cc);
                else
                        progress = pages;
                spin_unlock(&khugepaged_mm_lock);
-       }
 
-       if (!IS_ERR_OR_NULL(hpage))
-               put_page(hpage);
+               if (progress >= pages)
+                       break;
+
+               if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
+                       /*
+                        * If we fail to allocate the first time, sleep for a
+                        * while.  If allocation fails again, cancel the scan.
+                        */
+                       if (!wait)
+                               break;
+                       wait = false;
+                       khugepaged_alloc_sleep();
+               }
+       }
 }
 
 static bool khugepaged_should_wakeup(void)
@@ -2249,13 +2422,13 @@ static void khugepaged_wait_work(void)
 
 static int khugepaged(void *none)
 {
-       struct mm_slot *mm_slot;
+       struct khugepaged_mm_slot *mm_slot;
 
        set_freezable();
        set_user_nice(current, MAX_NICE);
 
        while (!kthread_should_stop()) {
-               khugepaged_do_scan();
+               khugepaged_do_scan(&khugepaged_collapse_control);
                khugepaged_wait_work();
        }
 
@@ -2354,3 +2527,140 @@ void khugepaged_min_free_kbytes_update(void)
                set_recommended_min_free_kbytes();
        mutex_unlock(&khugepaged_mutex);
 }
+
+static int madvise_collapse_errno(enum scan_result r)
+{
+       /*
+        * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
+        * actionable feedback to the caller, which may then take an
+        * appropriate fallback measure depending on the nature of the failure.
+        */
+       switch (r) {
+       case SCAN_ALLOC_HUGE_PAGE_FAIL:
+               return -ENOMEM;
+       case SCAN_CGROUP_CHARGE_FAIL:
+               return -EBUSY;
+       /* Resource temporarily unavailable - trying again might succeed */
+       case SCAN_PAGE_LOCK:
+       case SCAN_PAGE_LRU:
+       case SCAN_DEL_PAGE_LRU:
+               return -EAGAIN;
+       /*
+        * Other: Trying again is unlikely to succeed, or the error is
+        * intrinsic to the specified memory range. khugepaged likely won't be
+        * able to collapse it either.
+        */
+       default:
+               return -EINVAL;
+       }
+}
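
The mapping above is what makes MADV_COLLAPSE failures actionable from userspace: -EAGAIN flags a transient condition that a retry may clear, while the other values mean retrying is unlikely to help and the caller should fall back (to khugepaged, or to running without a THP). A minimal caller-side sketch of that policy, assuming nothing beyond the errno contract documented here (the helper name is made up for illustration):

	/* Illustrative policy after a failed madvise(addr, len, MADV_COLLAPSE). */
	#include <errno.h>
	#include <stdbool.h>

	static bool collapse_retry_makes_sense(int err)
	{
		switch (err) {
		case EAGAIN:	/* page lock / LRU contention; transient */
			return true;
		case ENOMEM:	/* huge page allocation failed */
		case EBUSY:	/* memcg charge failed */
		case EINVAL:	/* error intrinsic to the range */
		default:
			return false;	/* a retry right now is unlikely to help */
		}
	}
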
+
+int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
+                    unsigned long start, unsigned long end)
+{
+       struct collapse_control *cc;
+       struct mm_struct *mm = vma->vm_mm;
+       unsigned long hstart, hend, addr;
+       int thps = 0, last_fail = SCAN_FAIL;
+       bool mmap_locked = true;
+
+       BUG_ON(vma->vm_start > start);
+       BUG_ON(vma->vm_end < end);
+
+       *prev = vma;
+
+       if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+               return -EINVAL;
+
+       cc = kmalloc(sizeof(*cc), GFP_KERNEL);
+       if (!cc)
+               return -ENOMEM;
+       cc->is_khugepaged = false;
+       cc->last_target_node = NUMA_NO_NODE;
+
+       mmgrab(mm);
+       lru_add_drain_all();
+
+       hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+       hend = end & HPAGE_PMD_MASK;
+
+       for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
+               int result = SCAN_FAIL;
+
+               if (!mmap_locked) {
+                       cond_resched();
+                       mmap_read_lock(mm);
+                       mmap_locked = true;
+                       result = hugepage_vma_revalidate(mm, addr, false, &vma,
+                                                        cc);
+                       if (result != SCAN_SUCCEED) {
+                               last_fail = result;
+                               goto out_nolock;
+                       }
+
+                       hend = vma->vm_end & HPAGE_PMD_MASK;
+               }
+               mmap_assert_locked(mm);
+               memset(cc->node_load, 0, sizeof(cc->node_load));
+               if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+                       struct file *file = get_file(vma->vm_file);
+                       pgoff_t pgoff = linear_page_index(vma, addr);
+
+                       mmap_read_unlock(mm);
+                       mmap_locked = false;
+                       result = hpage_collapse_scan_file(mm, addr, file, pgoff,
+                                                         cc);
+                       fput(file);
+               } else {
+                       result = hpage_collapse_scan_pmd(mm, vma, addr,
+                                                        &mmap_locked, cc);
+               }
+               if (!mmap_locked)
+                       *prev = NULL;  /* Tell caller we dropped mmap_lock */
+
+handle_result:
+               switch (result) {
+               case SCAN_SUCCEED:
+               case SCAN_PMD_MAPPED:
+                       ++thps;
+                       break;
+               case SCAN_PTE_MAPPED_HUGEPAGE:
+                       BUG_ON(mmap_locked);
+                       BUG_ON(*prev);
+                       mmap_write_lock(mm);
+                       result = collapse_pte_mapped_thp(mm, addr, true);
+                       mmap_write_unlock(mm);
+                       goto handle_result;
+               /* Whitelisted set of results where continuing is OK */
+               case SCAN_PMD_NULL:
+               case SCAN_PTE_NON_PRESENT:
+               case SCAN_PTE_UFFD_WP:
+               case SCAN_PAGE_RO:
+               case SCAN_LACK_REFERENCED_PAGE:
+               case SCAN_PAGE_NULL:
+               case SCAN_PAGE_COUNT:
+               case SCAN_PAGE_LOCK:
+               case SCAN_PAGE_COMPOUND:
+               case SCAN_PAGE_LRU:
+               case SCAN_DEL_PAGE_LRU:
+                       last_fail = result;
+                       break;
+               default:
+                       last_fail = result;
+                       /* Other error, exit */
+                       goto out_maybelock;
+               }
+       }
+
+out_maybelock:
+       /* Caller expects us to hold mmap_lock on return */
+       if (!mmap_locked)
+               mmap_read_lock(mm);
+out_nolock:
+       mmap_assert_locked(mm);
+       mmdrop(mm);
+       kfree(cc);
+
+       return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
+                       : madvise_collapse_errno(last_fail);
+}
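
End to end, userspace drives this path with an ordinary madvise(2) call on a PMD-aligned, PMD-sized range: madvise_collapse() returns 0 when every hugepage-aligned chunk in the range ends up backed (or was already backed) by a THP, otherwise the errno derived from the last failure. A minimal sketch, assuming an x86-64-style 2MiB PMD, CONFIG_TRANSPARENT_HUGEPAGE, and a 6.1+ kernel (the fallback define of MADV_COLLAPSE as 25 is only for older uapi headers):

	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	#ifndef MADV_COLLAPSE
	#define MADV_COLLAPSE	25	/* fallback for pre-6.1 uapi headers */
	#endif

	#define PMD_SZ	(2UL << 20)	/* assumed 2MiB PMD size */

	int main(void)
	{
		/* Over-allocate so one PMD-aligned, PMD-sized chunk fits inside. */
		char *raw = mmap(NULL, 2 * PMD_SZ, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (raw == MAP_FAILED)
			return 1;

		char *aligned = (char *)(((uintptr_t)raw + PMD_SZ - 1) &
					 ~(PMD_SZ - 1));
		memset(aligned, 0x5a, PMD_SZ);	/* fault in the ptes first */

		/* Ask the kernel to collapse the chunk into one huge page. */
		if (madvise(aligned, PMD_SZ, MADV_COLLAPSE)) {
			fprintf(stderr, "MADV_COLLAPSE: %s\n", strerror(errno));
			return 1;
		}
		return 0;
	}

On failure, errno can be fed into a policy like the retry sketch shown after madvise_collapse_errno() above.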