mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()
author Hugh Dickins <hughd@google.com>
Wed, 12 Jul 2023 04:42:19 +0000 (21:42 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Fri, 18 Aug 2023 17:12:25 +0000 (10:12 -0700)
Bring collapse_and_free_pmd() back into collapse_pte_mapped_thp().  It
does need mmap_read_lock(), but it does not need mmap_write_lock(), nor
vma_start_write() nor i_mmap lock nor anon_vma lock.  All racing paths
rely on pte_offset_map_lock() and pmd_lock(), so use those.
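
Condensed from the hunks below, the serialization now comes from just
those two locks:

        /* steps 1 and 2: check, then clear, the PTEs under the pte lock */
        start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
        ...
        /* step 4: detach the emptied page table under the pmd lock */
        pml = pmd_lock(mm, pmd);
        if (ptl != pml)
                spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);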

Follow the pattern in retract_page_tables(): using pte_free_defer()
removes most of the need for tlb_remove_table_sync_one() here, but call
pmdp_get_lockless_sync() to use it in the PAE case.
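
For reference, the detach-and-free sequence as it appears in the hunk
below:

        pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
        pmdp_get_lockless_sync();       /* tlb_remove_table_sync_one() for PAE */
        ...
        mm_dec_nr_ptes(mm);
        page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
        pte_free_defer(mm, pmd_pgtable(pgt_pmd));       /* deferred, so lockless walkers stay safe */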

First check the VMA, in case page tables are being torn down: suggested
by Jann Horn.  Confirm the preliminary find_pmd_or_thp_or_none() once the
page lock has been acquired and the page looks suitable: from then on its
state is stable.
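
In outline, matching the hunks below:

        /* fast check before taking the page lock */
        result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
        if (result == SCAN_PMD_MAPPED)
                return result;
        ...
        /* page locked and suitable: confirm, its state is now stable */
        result = find_pmd_or_thp_or_none(mm, haddr, &pmd);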

However, collapse_pte_mapped_thp() was doing something others don't:
freeing a page table still containing "valid" entries.  The i_mmap lock
did stop a racing truncate from double-freeing those pages, but we prefer
collapse_pte_mapped_thp() to clear the entries as usual.  Their TLB flush
can wait until the pmdp_collapse_flush() which follows, but the
mmu_notifier_invalidate_range_start() has to be done earlier.
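
So "step 2" now clears each entry like this (condensed from the hunk
below):

        mmu_notifier_invalidate_range_start(&range);    /* before any clearing */
        ...
        /*
         * Must clear entry, or a racing truncate may re-remove it.
         * TLB flush can be left until pmdp_collapse_flush() does it.
         */
        ptep_clear(mm, addr, pte);
        page_remove_rmap(page, vma, false);
        nr_ptes++;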

Do the "step 1" checking loop without mmu_notifier: it wouldn't be good
for khugepaged to keep on repeatedly invalidating a range which is then
found unsuitable e.g.  contains COWs.  "step 2", which does the clearing,
must then be more careful (after dropping ptl to do mmu_notifier), with
abort prepared to correct the accounting like "step 3".  But with those
entries now cleared, "step 4" (after dropping ptl to do pmd_lock) is kept
safe by the huge page lock, which stops new PTEs from being faulted in.
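
The abort path which corrects the accounting, from the hunk below:

abort:
        if (nr_ptes) {
                flush_tlb_mm(mm);
                page_ref_sub(hpage, nr_ptes);
                add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
        }
        if (start_pte)
                pte_unmap_unlock(start_pte, ptl);
        if (notified)
                mmu_notifier_invalidate_range_end(&range);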

[hughd@google.com: don't set mmap_locked = true in madvise_collapse()]
Link: https://lkml.kernel.org/r/d3d9ff14-ef8-8f84-e160-bfa1f5794275@google.com
[hughd@google.com: use ptep_clear() instead of pte_clear()]
Link: https://lkml.kernel.org/r/e0197433-8a47-6a65-534d-eda26eeb78b0@google.com
Link: https://lkml.kernel.org/r/b53be6a4-7715-51f9-aad-f1347dcb7c4@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: SeongJae Park <sj@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zack Rusin <zackr@vmware.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
mm/khugepaged.c

index 8f88fd6..53d1788 100644
@@ -1485,7 +1485,7 @@ static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
        return ret;
 }
 
-/* hpage must be locked, and mmap_lock must be held in write */
+/* hpage must be locked, and mmap_lock must be held */
 static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
                        pmd_t *pmdp, struct page *hpage)
 {
@@ -1497,7 +1497,7 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
        };
 
        VM_BUG_ON(!PageTransHuge(hpage));
-       mmap_assert_write_locked(vma->vm_mm);
+       mmap_assert_locked(vma->vm_mm);
 
        if (do_set_pmd(&vmf, hpage))
                return SCAN_FAIL;
@@ -1506,48 +1506,6 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
        return SCAN_SUCCEED;
 }
 
-/*
- * A note about locking:
- * Trying to take the page table spinlocks would be useless here because those
- * are only used to synchronize:
- *
- *  - modifying terminal entries (ones that point to a data page, not to another
- *    page table)
- *  - installing *new* non-terminal entries
- *
- * Instead, we need roughly the same kind of protection as free_pgtables() or
- * mm_take_all_locks() (but only for a single VMA):
- * The mmap lock together with this VMA's rmap locks covers all paths towards
- * the page table entries we're messing with here, except for hardware page
- * table walks and lockless_pages_from_mm().
- */
-static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
-                                 unsigned long addr, pmd_t *pmdp)
-{
-       pmd_t pmd;
-       struct mmu_notifier_range range;
-
-       mmap_assert_write_locked(mm);
-       if (vma->vm_file)
-               lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
-       /*
-        * All anon_vmas attached to the VMA have the same root and are
-        * therefore locked by the same lock.
-        */
-       if (vma->anon_vma)
-               lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
-
-       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
-                               addr + HPAGE_PMD_SIZE);
-       mmu_notifier_invalidate_range_start(&range);
-       pmd = pmdp_collapse_flush(vma, addr, pmdp);
-       tlb_remove_table_sync_one();
-       mmu_notifier_invalidate_range_end(&range);
-       mm_dec_nr_ptes(mm);
-       page_table_check_pte_clear_range(mm, addr, pmd);
-       pte_free(mm, pmd_pgtable(pmd));
-}
-
 /**
  * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
  * address haddr.
@@ -1563,26 +1521,29 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
 int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
                            bool install_pmd)
 {
+       struct mmu_notifier_range range;
+       bool notified = false;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
        struct vm_area_struct *vma = vma_lookup(mm, haddr);
        struct page *hpage;
        pte_t *start_pte, *pte;
-       pmd_t *pmd;
-       spinlock_t *ptl;
-       int count = 0, result = SCAN_FAIL;
+       pmd_t *pmd, pgt_pmd;
+       spinlock_t *pml, *ptl;
+       int nr_ptes = 0, result = SCAN_FAIL;
        int i;
 
-       mmap_assert_write_locked(mm);
+       mmap_assert_locked(mm);
+
+       /* First check VMA found, in case page tables are being torn down */
+       if (!vma || !vma->vm_file ||
+           !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
+               return SCAN_VMA_CHECK;
 
        /* Fast check before locking page if already PMD-mapped */
        result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
        if (result == SCAN_PMD_MAPPED)
                return result;
 
-       if (!vma || !vma->vm_file ||
-           !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
-               return SCAN_VMA_CHECK;
-
        /*
         * If we are here, we've succeeded in replacing all the native pages
         * in the page cache with a single hugepage. If a mm were to fault-in
@@ -1612,6 +1573,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
                goto drop_hpage;
        }
 
+       result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
        switch (result) {
        case SCAN_SUCCEED:
                break;
@@ -1625,27 +1587,10 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
                goto drop_hpage;
        }
 
-       /* Lock the vma before taking i_mmap and page table locks */
-       vma_start_write(vma);
-
-       /*
-        * We need to lock the mapping so that from here on, only GUP-fast and
-        * hardware page walks can access the parts of the page tables that
-        * we're operating on.
-        * See collapse_and_free_pmd().
-        */
-       i_mmap_lock_write(vma->vm_file->f_mapping);
-
-       /*
-        * This spinlock should be unnecessary: Nobody else should be accessing
-        * the page tables under spinlock protection here, only
-        * lockless_pages_from_mm() and the hardware page walker can access page
-        * tables while all the high-level locks are held in write mode.
-        */
        result = SCAN_FAIL;
        start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
-       if (!start_pte)
-               goto drop_immap;
+       if (!start_pte)         /* mmap_lock + page lock should prevent this */
+               goto drop_hpage;
 
        /* step 1: check all mapped PTEs are to the right huge page */
        for (i = 0, addr = haddr, pte = start_pte;
@@ -1672,10 +1617,18 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
                 */
                if (hpage + i != page)
                        goto abort;
-               count++;
        }
 
-       /* step 2: adjust rmap */
+       pte_unmap_unlock(start_pte, ptl);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+                               haddr, haddr + HPAGE_PMD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
+       notified = true;
+       start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+       if (!start_pte)         /* mmap_lock + page lock should prevent this */
+               goto abort;
+
+       /* step 2: clear page table and adjust rmap */
        for (i = 0, addr = haddr, pte = start_pte;
             i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
                struct page *page;
@@ -1683,47 +1636,76 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 
                if (pte_none(ptent))
                        continue;
+               /*
+                * We dropped ptl after the first scan, to do the mmu_notifier:
+                * page lock stops more PTEs of the hpage being faulted in, but
+                * does not stop write faults COWing anon copies from existing
+                * PTEs; and does not stop those being swapped out or migrated.
+                */
+               if (!pte_present(ptent)) {
+                       result = SCAN_PTE_NON_PRESENT;
+                       goto abort;
+               }
                page = vm_normal_page(vma, addr, ptent);
-               if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+               if (hpage + i != page)
                        goto abort;
+
+               /*
+                * Must clear entry, or a racing truncate may re-remove it.
+                * TLB flush can be left until pmdp_collapse_flush() does it.
+                * PTE dirty? Shmem page is already dirty; file is read-only.
+                */
+               ptep_clear(mm, addr, pte);
                page_remove_rmap(page, vma, false);
+               nr_ptes++;
        }
 
        pte_unmap_unlock(start_pte, ptl);
 
        /* step 3: set proper refcount and mm_counters. */
-       if (count) {
-               page_ref_sub(hpage, count);
-               add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+       if (nr_ptes) {
+               page_ref_sub(hpage, nr_ptes);
+               add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
        }
 
-       /* step 4: remove pte entries */
-       /* we make no change to anon, but protect concurrent anon page lookup */
-       if (vma->anon_vma)
-               anon_vma_lock_write(vma->anon_vma);
+       /* step 4: remove page table */
 
-       collapse_and_free_pmd(mm, vma, haddr, pmd);
+       /* Huge page lock is still held, so page table must remain empty */
+       pml = pmd_lock(mm, pmd);
+       if (ptl != pml)
+               spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+       pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
+       pmdp_get_lockless_sync();
+       if (ptl != pml)
+               spin_unlock(ptl);
+       spin_unlock(pml);
 
-       if (vma->anon_vma)
-               anon_vma_unlock_write(vma->anon_vma);
-       i_mmap_unlock_write(vma->vm_file->f_mapping);
+       mmu_notifier_invalidate_range_end(&range);
+
+       mm_dec_nr_ptes(mm);
+       page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
+       pte_free_defer(mm, pmd_pgtable(pgt_pmd));
 
 maybe_install_pmd:
        /* step 5: install pmd entry */
        result = install_pmd
                        ? set_huge_pmd(vma, haddr, pmd, hpage)
                        : SCAN_SUCCEED;
-
+       goto drop_hpage;
+abort:
+       if (nr_ptes) {
+               flush_tlb_mm(mm);
+               page_ref_sub(hpage, nr_ptes);
+               add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
+       }
+       if (start_pte)
+               pte_unmap_unlock(start_pte, ptl);
+       if (notified)
+               mmu_notifier_invalidate_range_end(&range);
 drop_hpage:
        unlock_page(hpage);
        put_page(hpage);
        return result;
-
-abort:
-       pte_unmap_unlock(start_pte, ptl);
-drop_immap:
-       i_mmap_unlock_write(vma->vm_file->f_mapping);
-       goto drop_hpage;
 }
 
 static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
@@ -2856,9 +2838,9 @@ handle_result:
                case SCAN_PTE_MAPPED_HUGEPAGE:
                        BUG_ON(mmap_locked);
                        BUG_ON(*prev);
-                       mmap_write_lock(mm);
+                       mmap_read_lock(mm);
                        result = collapse_pte_mapped_thp(mm, addr, true);
-                       mmap_write_unlock(mm);
+                       mmap_read_unlock(mm);
                        goto handle_result;
                /* Whitelisted set of results where continuing OK */
                case SCAN_PMD_NULL: