index 93d5a6f..2ec925e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
  * inode->i_rwsem      (while writing or truncating, not reading or faulting)
  *   mm->mmap_lock
  *     mapping->invalidate_lock (in filemap_fault)
- *       page->flags PG_locked (lock_page)   * (see hugetlbfs below)
- *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ *       page->flags PG_locked (lock_page)
+ *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
  *           mapping->i_mmap_rwsem
- *             hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
  *             anon_vma->rwsem
  *               mm->page_table_lock or pte_lock
  *                 swap_lock (in swap_duplicate, swap_info_get)
  *   ->tasklist_lock
  *     pte map lock
  *
- * * hugetlbfs PageHuge() pages take locks in this order:
- *         mapping->i_mmap_rwsem
- *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
- *             page->flags PG_locked (lock_page)
+ * hugetlbfs PageHuge() pages take locks in this order:
+ *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ *     vma_lock (hugetlb specific lock for pmd_sharing)
+ *       mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
+ *         page->flags PG_locked (lock_page)
  */
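
A hypothetical nesting sketch of the hugetlb order documented above (not part
of this patch; hugetlb_fault_mutex_hash(), hugetlb_vma_lock_read() and
i_mmap_lock_read() are assumed from the 6.1 hugetlb API). Fault-path code can
take the locks top-down like this, while rmap walkers such as
try_to_unmap_one() below already hold i_mmap_rwsem and therefore may only
trylock vma_lock:

	static void hugetlb_lock_order_sketch(struct address_space *mapping,
					      pgoff_t idx,
					      struct vm_area_struct *vma,
					      struct page *page)
	{
		u32 hash = hugetlb_fault_mutex_hash(mapping, idx);

		mutex_lock(&hugetlb_fault_mutex_table[hash]);	/* 1: fault mutex */
		hugetlb_vma_lock_read(vma);			/* 2: hugetlb vma_lock */
		i_mmap_lock_read(mapping);			/* 3: i_mmap_rwsem */
		lock_page(page);				/* 4: PG_locked */
	}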
 
 #include <linux/mm.h>
@@ -489,16 +489,16 @@ void __init anon_vma_init(void)
  * if there is a mapcount, we can dereference the anon_vma after observing
  * those.
  */
-struct anon_vma *page_get_anon_vma(struct page *page)
+struct anon_vma *folio_get_anon_vma(struct folio *folio)
 {
        struct anon_vma *anon_vma = NULL;
        unsigned long anon_mapping;
 
        rcu_read_lock();
-       anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+       anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                goto out;
-       if (!page_mapped(page))
+       if (!folio_mapped(folio))
                goto out;
 
        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
@@ -508,13 +508,13 @@ struct anon_vma *page_get_anon_vma(struct page *page)
        }
 
        /*
-        * If this page is still mapped, then its anon_vma cannot have been
+        * If this folio is still mapped, then its anon_vma cannot have been
         * freed.  But if it has been unmapped, we have no security against the
         * anon_vma structure being freed and reused (for another anon_vma:
         * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
         * above cannot corrupt).
         */
-       if (!page_mapped(page)) {
+       if (!folio_mapped(folio)) {
                rcu_read_unlock();
                put_anon_vma(anon_vma);
                return NULL;
@@ -526,11 +526,11 @@ out:
 }
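
A minimal usage sketch (hypothetical, not from this patch): the returned
reference keeps the anon_vma alive across a possible sleep and must be
dropped with put_anon_vma() once the caller is done:

	struct anon_vma *anon_vma = folio_get_anon_vma(folio);

	if (anon_vma) {
		anon_vma_lock_write(anon_vma);
		/* ... operate on the rmap of this folio ... */
		anon_vma_unlock_write(anon_vma);
		put_anon_vma(anon_vma);
	}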
 
 /*
- * Similar to page_get_anon_vma() except it locks the anon_vma.
+ * Similar to folio_get_anon_vma() except it locks the anon_vma.
  *
  * It's a little more complex as it tries to keep the fast path to a single
  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
- * reference like with page_get_anon_vma() and then block on the mutex
+ * reference like with folio_get_anon_vma() and then block on the mutex
  * in the !rwc->try_lock case.
  */
 struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
@@ -602,11 +602,6 @@ out:
        return anon_vma;
 }
 
-void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
-{
-       anon_vma_unlock_read(anon_vma);
-}
-
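
With the trivial wrapper above removed, callers pair folio_lock_anon_vma_read()
with anon_vma_unlock_read() directly. A hypothetical sketch (the rwc->try_lock
handling is elided; a NULL rwc gets plain blocking behaviour):

	struct anon_vma *anon_vma = folio_lock_anon_vma_read(folio, NULL);

	if (anon_vma) {
		/* ... walk the anon_vma interval tree under the read lock ... */
		anon_vma_unlock_read(anon_vma);
	}
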
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 /*
  * Flush TLB entries for recently unmapped pages from remote CPUs. It is
@@ -770,13 +765,17 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
        return vma_address(page, vma);
 }
 
+/*
+ * Return the pmd_t* where 'address' is expected to be mapped, or NULL if
+ * there is none.  No guarantees / checks on what the entry at that pmd_t*
+ * represents.
+ */
 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 {
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd = NULL;
-       pmd_t pmde;
 
        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
@@ -791,15 +790,6 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
                goto out;
 
        pmd = pmd_offset(pud, address);
-       /*
-        * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
-        * without holding anon_vma lock for write.  So when looking for a
-        * genuine pmde (in which to find pte), test present and !THP together.
-        */
-       pmde = *pmd;
-       barrier();
-       if (!pmd_present(pmde) || pmd_trans_huge(pmde))
-               pmd = NULL;
 out:
        return pmd;
 }
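
Because mm_find_pmd() no longer validates the entry, a caller that needs a
pte-mapped pmd must now perform the check that was deleted above itself.
A hypothetical caller sketch, reusing the deleted logic:

	pmd_t *pmd = mm_find_pmd(mm, address);
	pmd_t pmde;

	if (!pmd)
		return;
	/*
	 * THP code may use pmdp_huge_clear_flush()/set_pmd_at() without
	 * holding the anon_vma lock for write, so test present and
	 * !trans_huge on a single snapshot of the entry.
	 */
	pmde = *pmd;
	barrier();
	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
		return;
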
@@ -833,6 +823,12 @@ static bool folio_referenced_one(struct folio *folio,
                }
 
                if (pvmw.pte) {
+                       if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
+                           !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+                               lru_gen_look_around(&pvmw);
+                               referenced++;
+                       }
+
                        if (ptep_clear_flush_young_notify(vma, address,
                                                pvmw.pte)) {
                                /*
@@ -1101,22 +1097,20 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
  */
 void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
 {
-       struct anon_vma *anon_vma = vma->anon_vma;
-       struct page *subpage = page;
-
-       page = compound_head(page);
+       void *anon_vma = vma->anon_vma;
+       struct folio *folio = page_folio(page);
 
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_VMA(!anon_vma, vma);
 
-       anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+       anon_vma += PAGE_MAPPING_ANON;
        /*
         * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
         * simultaneously, so a concurrent reader (eg folio_referenced()'s
         * folio_test_anon()) will not see one without the other.
         */
-       WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
-       SetPageAnonExclusive(subpage);
+       WRITE_ONCE(folio->mapping, anon_vma);
+       SetPageAnonExclusive(page);
 }
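
The reader that the WRITE_ONCE() above pairs with re-reads the same mapping
word; a hypothetical sketch (function name invented here) mirroring
folio_get_anon_vma() earlier in this file:

	static struct anon_vma *folio_anon_vma_sketch(struct folio *folio)
	{
		unsigned long mapping = (unsigned long)READ_ONCE(folio->mapping);

		if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
			return NULL;	/* not anon, or bits not yet visible */
		return (struct anon_vma *)(mapping - PAGE_MAPPING_ANON);
	}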
 
 /**
@@ -1560,33 +1554,45 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                         * To call huge_pmd_unshare, i_mmap_rwsem must be
                         * held in write mode.  Caller needs to explicitly
                         * do this outside rmap routines.
+                        *
+                        * We also must hold hugetlb vma_lock in write mode.
+                        * Lock order dictates acquiring vma_lock BEFORE
+                        * i_mmap_rwsem.  We can only trylock here, and must
+                        * fail the operation if the lock cannot be taken.
                         */
-                       VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-                       if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
-                               flush_tlb_range(vma, range.start, range.end);
-                               mmu_notifier_invalidate_range(mm, range.start,
-                                                             range.end);
-
-                               /*
-                                * The ref count of the PMD page was dropped
-                                * which is part of the way map counting
-                                * is done for shared PMDs.  Return 'true'
-                                * here.  When there is no other sharing,
-                                * huge_pmd_unshare returns false and we will
-                                * unmap the actual page and drop map count
-                                * to zero.
-                                */
-                               page_vma_mapped_walk_done(&pvmw);
-                               break;
+                       if (!anon) {
+                               VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+                               if (!hugetlb_vma_trylock_write(vma)) {
+                                       page_vma_mapped_walk_done(&pvmw);
+                                       ret = false;
+                                       break;
+                               }
+                               if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+                                       hugetlb_vma_unlock_write(vma);
+                                       flush_tlb_range(vma,
+                                               range.start, range.end);
+                                       mmu_notifier_invalidate_range(mm,
+                                               range.start, range.end);
+                                       /*
+                                        * The ref count of the PMD page was
+                                        * dropped which is part of the way map
+                                        * counting is done for shared PMDs.
+                                        * Return 'true' here.  When there is
+                                        * no other sharing, huge_pmd_unshare
+                                        * returns false and we will unmap the
+                                        * actual page and drop map count
+                                        * to zero.
+                                        */
+                                       page_vma_mapped_walk_done(&pvmw);
+                                       break;
+                               }
+                               hugetlb_vma_unlock_write(vma);
                        }
                        pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
                } else {
                        flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
-                       /*
-                        * Nuke the page table entry. When having to clear
-                        * PageAnonExclusive(), we always have to flush.
-                        */
-                       if (should_defer_flush(mm, flags) && !anon_exclusive) {
+                       /* Nuke the page table entry. */
+                       if (should_defer_flush(mm, flags)) {
                                /*
                                 * We clear the PTE but do not flush so potentially
                                 * a remote CPU could still be writing to the folio.
@@ -1717,6 +1723,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
+
+                       /* See page_try_share_anon_rmap(): clear PTE first. */
                        if (anon_exclusive &&
                            page_try_share_anon_rmap(subpage)) {
                                swap_free(entry);
@@ -1936,26 +1944,41 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                         * To call huge_pmd_unshare, i_mmap_rwsem must be
                         * held in write mode.  Caller needs to explicitly
                         * do this outside rmap routines.
+                        *
+                        * We also must hold hugetlb vma_lock in write mode.
+                        * Lock order dictates acquiring vma_lock BEFORE
+                        * i_mmap_rwsem.  We can only trylock here, and must
+                        * fail the operation if the lock cannot be taken.
                         */
-                       VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-                       if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
-                               flush_tlb_range(vma, range.start, range.end);
-                               mmu_notifier_invalidate_range(mm, range.start,
-                                                             range.end);
-
-                               /*
-                                * The ref count of the PMD page was dropped
-                                * which is part of the way map counting
-                                * is done for shared PMDs.  Return 'true'
-                                * here.  When there is no other sharing,
-                                * huge_pmd_unshare returns false and we will
-                                * unmap the actual page and drop map count
-                                * to zero.
-                                */
-                               page_vma_mapped_walk_done(&pvmw);
-                               break;
+                       if (!anon) {
+                               VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+                               if (!hugetlb_vma_trylock_write(vma)) {
+                                       page_vma_mapped_walk_done(&pvmw);
+                                       ret = false;
+                                       break;
+                               }
+                               if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+                                       hugetlb_vma_unlock_write(vma);
+                                       flush_tlb_range(vma,
+                                               range.start, range.end);
+                                       mmu_notifier_invalidate_range(mm,
+                                               range.start, range.end);
+
+                                       /*
+                                        * The ref count of the PMD page was
+                                        * dropped which is part of the way map
+                                        * counting is done for shared PMDs.
+                                        * Return 'true' here.  When there is
+                                        * no other sharing, huge_pmd_unshare
+                                        * returns false and we will unmap the
+                                        * actual page and drop map count
+                                        * to zero.
+                                        */
+                                       page_vma_mapped_walk_done(&pvmw);
+                                       break;
+                               }
+                               hugetlb_vma_unlock_write(vma);
                        }
-
                        /* Nuke the hugetlb page table entry */
                        pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
                } else {
@@ -2048,6 +2071,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        }
                        VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
                                       !anon_exclusive, subpage);
+
+                       /* See page_try_share_anon_rmap(): clear PTE first. */
                        if (anon_exclusive &&
                            page_try_share_anon_rmap(subpage)) {
                                if (folio_test_hugetlb(folio))
@@ -2073,7 +2098,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        else
                                entry = make_readable_migration_entry(
                                                        page_to_pfn(subpage));
-
+                       if (pte_young(pteval))
+                               entry = make_migration_entry_young(entry);
+                       if (pte_dirty(pteval))
+                               entry = make_migration_entry_dirty(entry);
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
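
On the remove side, the young/dirty state stashed by
make_migration_entry_young()/make_migration_entry_dirty() above is folded back
into the new PTE. A hypothetical sketch of that step (helper names from the
6.1 swapops API, modeled on remove_migration_pte()):

	swp_entry_t entry = pte_to_swp_entry(*pvmw.pte);
	pte_t pte = mk_pte(page, vma->vm_page_prot);

	if (is_migration_entry_young(entry))
		pte = pte_mkyoung(pte);
	if (is_migration_entry_dirty(entry))
		pte = pte_mkdirty(pte);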