hugetlb: use new vma_lock for pmd sharing synchronization

author Mike Kravetz <mike.kravetz@oracle.com>

Wed, 14 Sep 2022 22:18:09 +0000 (15:18 -0700)

committer Andrew Morton <akpm@linux-foundation.org>

Mon, 3 Oct 2022 21:03:17 +0000 (14:03 -0700)
author Mike Kravetz <mike.kravetz@oracle.com>
Wed, 14 Sep 2022 22:18:09 +0000 (15:18 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Mon, 3 Oct 2022 21:03:17 +0000 (14:03 -0700)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c

index 3bb1772fce2f72cc08efdaa277b781748cd7b7c2..009ae539b9b245ba599cb7b1f7b465d61f7ef73f 100644 (file)
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -434,6 +434,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
                                         struct folio *folio, pgoff_t index)
  {
         struct rb_root_cached *root = &mapping->i_mmap;
+       struct hugetlb_vma_lock *vma_lock;
         struct page *page = &folio->page;
         struct vm_area_struct *vma;
         unsigned long v_start;
@@ -444,7 +445,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
         end = (index + 1) * pages_per_huge_page(h);
  
         i_mmap_lock_write(mapping);
-
+retry:
+       vma_lock = NULL;
         vma_interval_tree_foreach(vma, root, start, end - 1) {
                 v_start = vma_offset_start(vma, start);
                 v_end = vma_offset_end(vma, end);
@@ -452,11 +454,63 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
                 if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
                         continue;
  
+               if (!hugetlb_vma_trylock_write(vma)) {
+                       vma_lock = vma->vm_private_data;
+                       /*
+                        * If we can not get vma lock, we need to drop
+                        * i_mmap_rwsem and take locks in order.  First,
+                        * take a ref on the vma_lock structure so that
+                        * we can be guaranteed it will not go away when
+                        * dropping i_mmap_rwsem.
+                        */
+                       kref_get(&vma_lock->refs);
+                       break;
+               }
+
                 unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
                                 NULL, ZAP_FLAG_DROP_MARKER);
+               hugetlb_vma_unlock_write(vma);
         }
  
         i_mmap_unlock_write(mapping);
+
+       if (vma_lock) {
+               /*
+                * Wait on vma_lock.  We know it is still valid as we have
+                * a reference.  We must 'open code' vma locking as we do
+                * not know if vma_lock is still attached to vma.
+                */
+               down_write(&vma_lock->rw_sema);
+               i_mmap_lock_write(mapping);
+
+               vma = vma_lock->vma;
+               if (!vma) {
+                       /*
+                        * If lock is no longer attached to vma, then just
+                        * unlock, drop our reference and retry looking for
+                        * other vmas.
+                        */
+                       up_write(&vma_lock->rw_sema);
+                       kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+                       goto retry;
+               }
+
+               /*
+                * vma_lock is still attached to vma.  Check to see if vma
+                * still maps page and if so, unmap.
+                */
+               v_start = vma_offset_start(vma, start);
+               v_end = vma_offset_end(vma, end);
+               if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
+                       unmap_hugepage_range(vma, vma->vm_start + v_start,
+                                               v_end, NULL,
+                                               ZAP_FLAG_DROP_MARKER);
+
+               kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+               hugetlb_vma_unlock_write(vma);
+
+               goto retry;
+       }
  }
  
  static void
@@ -474,11 +528,21 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
                 unsigned long v_start;
                 unsigned long v_end;
  
+               if (!hugetlb_vma_trylock_write(vma))
+                       continue;
+
                 v_start = vma_offset_start(vma, start);
                 v_end = vma_offset_end(vma, end);
  
                 unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
                                      NULL, zap_flags);
+
+               /*
+                * Note that vma lock only exists for shared/non-private
+                * vmas.  Therefore, lock is not held when calling
+                * unmap_hugepage_range for private vmas.
+                */
+               hugetlb_vma_unlock_write(vma);
         }
  }
  
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index f44b79998ac2d29c010fb1219d76cdd6299b5a6e..d78504959df769694b39007ac0c78744643a2991 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4796,6 +4796,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                 mmu_notifier_invalidate_range_start(&range);
                 mmap_assert_write_locked(src);
                 raw_write_seqcount_begin(&src->write_protect_seq);
+       } else {
+               /*
+                * For shared mappings the vma lock must be held before
+                * calling huge_pte_offset in the src vma. Otherwise, the
+                * returned ptep could go away if part of a shared pmd and
+                * another thread calls huge_pmd_unshare.
+                */
+               hugetlb_vma_lock_read(src_vma);
         }
  
         last_addr_mask = hugetlb_mask_last_page(h);
@@ -4942,6 +4950,8 @@ again:
         if (cow) {
                 raw_write_seqcount_end(&src->write_protect_seq);
                 mmu_notifier_invalidate_range_end(&range);
+       } else {
+               hugetlb_vma_unlock_read(src_vma);
         }
  
         return ret;
@@ -5000,6 +5010,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
         mmu_notifier_invalidate_range_start(&range);
         last_addr_mask = hugetlb_mask_last_page(h);
         /* Prevent race with file truncation */
+       hugetlb_vma_lock_write(vma);
         i_mmap_lock_write(mapping);
         for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
                 src_pte = huge_pte_offset(mm, old_addr, sz);
@@ -5031,6 +5042,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
                 flush_tlb_range(vma, old_end - len, old_end);
         mmu_notifier_invalidate_range_end(&range);
         i_mmap_unlock_write(mapping);
+       hugetlb_vma_unlock_write(vma);
  
         return len + old_addr - old_end;
  }
@@ -5350,8 +5362,29 @@ retry_avoidcopy:
                  * may get SIGKILLed if it later faults.
                  */
                 if (outside_reserve) {
+                       struct address_space *mapping = vma->vm_file->f_mapping;
+                       pgoff_t idx;
+                       u32 hash;
+
                         put_page(old_page);
+                       /*
+                        * Drop hugetlb_fault_mutex and vma_lock before
+                        * unmapping.  unmapping needs to hold vma_lock
+                        * in write mode.  Dropping vma_lock in read mode
+                        * here is OK as COW mappings do not interact with
+                        * PMD sharing.
+                        *
+                        * Reacquire both after unmap operation.
+                        */
+                       idx = vma_hugecache_offset(h, vma, haddr);
+                       hash = hugetlb_fault_mutex_hash(mapping, idx);
+                       hugetlb_vma_unlock_read(vma);
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
                         unmap_ref_private(mm, vma, old_page, haddr);
+
+                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+                       hugetlb_vma_lock_read(vma);
                         spin_lock(ptl);
                         ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
                         if (likely(ptep &&
@@ -5500,14 +5533,16 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
         };
  
         /*
-        * hugetlb_fault_mutex and i_mmap_rwsem must be
+        * vma_lock and hugetlb_fault_mutex must be
          * dropped before handling userfault.  Reacquire
          * after handling fault to make calling code simpler.
          */
+       hugetlb_vma_unlock_read(vma);
         hash = hugetlb_fault_mutex_hash(mapping, idx);
         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
         ret = handle_userfault(&vmf, reason);
         mutex_lock(&hugetlb_fault_mutex_table[hash]);
+       hugetlb_vma_lock_read(vma);
  
         return ret;
  }
@@ -5741,6 +5776,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  
         ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
         if (ptep) {
+               /*
+                * Since we hold no locks, ptep could be stale.  That is
+                * OK as we are only making decisions based on content and
+                * not actually modifying content here.
+                */
                 entry = huge_ptep_get(ptep);
                 if (unlikely(is_hugetlb_entry_migration(entry))) {
                         migration_entry_wait_huge(vma, ptep);
@@ -5748,23 +5788,35 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
                         return VM_FAULT_HWPOISON_LARGE |
                                 VM_FAULT_SET_HINDEX(hstate_index(h));
-       } else {
-               ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
-               if (!ptep)
-                       return VM_FAULT_OOM;
         }
  
-       mapping = vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, vma, haddr);
-
         /*
          * Serialize hugepage allocation and instantiation, so that we don't
          * get spurious allocation failures if two CPUs race to instantiate
          * the same page in the page cache.
          */
+       mapping = vma->vm_file->f_mapping;
+       idx = vma_hugecache_offset(h, vma, haddr);
         hash = hugetlb_fault_mutex_hash(mapping, idx);
         mutex_lock(&hugetlb_fault_mutex_table[hash]);
  
+       /*
+        * Acquire vma lock before calling huge_pte_alloc and hold
+        * until finished with ptep.  This prevents huge_pmd_unshare from
+        * being called elsewhere and making the ptep no longer valid.
+        *
+        * ptep could have already been assigned via huge_pte_offset.  That
+        * is OK, as huge_pte_alloc will return the same value unless
+        * something has changed.
+        */
+       hugetlb_vma_lock_read(vma);
+       ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
+       if (!ptep) {
+               hugetlb_vma_unlock_read(vma);
+               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+               return VM_FAULT_OOM;
+       }
+
         entry = huge_ptep_get(ptep);
         /* PTE markers should be handled the same way as none pte */
         if (huge_pte_none_mostly(entry)) {
@@ -5825,6 +5877,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         unlock_page(pagecache_page);
                         put_page(pagecache_page);
                 }
+               hugetlb_vma_unlock_read(vma);
                 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                 return handle_userfault(&vmf, VM_UFFD_WP);
         }
@@ -5868,6 +5921,7 @@ out_ptl:
                 put_page(pagecache_page);
         }
  out_mutex:
+       hugetlb_vma_unlock_read(vma);
         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
         /*
          * Generally it's safe to hold refcount during waiting page lock. But
@@ -6330,8 +6384,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
         flush_cache_range(vma, range.start, range.end);
  
         mmu_notifier_invalidate_range_start(&range);
-       last_addr_mask = hugetlb_mask_last_page(h);
+       hugetlb_vma_lock_write(vma);
         i_mmap_lock_write(vma->vm_file->f_mapping);
+       last_addr_mask = hugetlb_mask_last_page(h);
         for (; address < end; address += psize) {
                 spinlock_t *ptl;
                 ptep = huge_pte_offset(mm, address, psize);
@@ -6430,6 +6485,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
          * See Documentation/mm/mmu_notifier.rst
          */
         i_mmap_unlock_write(vma->vm_file->f_mapping);
+       hugetlb_vma_unlock_write(vma);
         mmu_notifier_invalidate_range_end(&range);
  
         return pages << h->order;
@@ -6931,6 +6987,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
         pud_t *pud = pud_offset(p4d, addr);
  
         i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+       hugetlb_vma_assert_locked(vma);
         BUG_ON(page_count(virt_to_page(ptep)) == 0);
         if (page_count(virt_to_page(ptep)) == 1)
                 return 0;
@@ -6943,6 +7000,31 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
  
  #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
  
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+       return 1;
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
  void hugetlb_vma_lock_release(struct kref *kref)
  {
  }
@@ -7325,6 +7407,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                 start, end);
         mmu_notifier_invalidate_range_start(&range);
+       hugetlb_vma_lock_write(vma);
         i_mmap_lock_write(vma->vm_file->f_mapping);
         for (address = start; address < end; address += PUD_SIZE) {
                 ptep = huge_pte_offset(mm, address, sz);
@@ -7336,6 +7419,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
         }
         flush_hugetlb_tlb_range(vma, start, end);
         i_mmap_unlock_write(vma->vm_file->f_mapping);
+       hugetlb_vma_unlock_write(vma);
         /*
          * No need to call mmu_notifier_invalidate_range(), see
          * Documentation/mm/mmu_notifier.rst.
diff --git a/mm/memory.c b/mm/memory.c

index c01c12500169d82de2c39e40c124cf6ad26f36e6..b3ed17219d772d0558f7d4027ff65407c8dc4576 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1684,10 +1684,12 @@ static void unmap_single_vma(struct mmu_gather *tlb,
                         if (vma->vm_file) {
                                 zap_flags_t zap_flags = details ?
                                     details->zap_flags : 0;
+                               hugetlb_vma_lock_write(vma);
                                 i_mmap_lock_write(vma->vm_file->f_mapping);
                                 __unmap_hugepage_range_final(tlb, vma, start, end,
                                                              NULL, zap_flags);
                                 i_mmap_unlock_write(vma->vm_file->f_mapping);
+                               hugetlb_vma_unlock_write(vma);
                         }
                 } else
                         unmap_page_range(tlb, vma, start, end, details);
diff --git a/mm/rmap.c b/mm/rmap.c

index 0e179c823e0acc300f5efc79cf977eabb19ea2f8..b6743c2b8b5f4b22cffe66dd488f14af7768eba0 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1551,24 +1551,39 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                          * To call huge_pmd_unshare, i_mmap_rwsem must be
                          * held in write mode.  Caller needs to explicitly
                          * do this outside rmap routines.
+                        *
+                        * We also must hold hugetlb vma_lock in write mode.
+                        * Lock order dictates acquiring vma_lock BEFORE
+                        * i_mmap_rwsem.  We can only try lock here and fail
+                        * if unsuccessful.
                          */
-                       VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-                       if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
-                               flush_tlb_range(vma, range.start, range.end);
-                               mmu_notifier_invalidate_range(mm, range.start,
-                                                             range.end);
-
-                               /*
-                                * The ref count of the PMD page was dropped
-                                * which is part of the way map counting
-                                * is done for shared PMDs.  Return 'true'
-                                * here.  When there is no other sharing,
-                                * huge_pmd_unshare returns false and we will
-                                * unmap the actual page and drop map count
-                                * to zero.
-                                */
-                               page_vma_mapped_walk_done(&pvmw);
-                               break;
+                       if (!anon) {
+                               VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+                               if (!hugetlb_vma_trylock_write(vma)) {
+                                       page_vma_mapped_walk_done(&pvmw);
+                                       ret = false;
+                                       break;
+                               }
+                               if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+                                       hugetlb_vma_unlock_write(vma);
+                                       flush_tlb_range(vma,
+                                               range.start, range.end);
+                                       mmu_notifier_invalidate_range(mm,
+                                               range.start, range.end);
+                                       /*
+                                        * The ref count of the PMD page was
+                                        * dropped which is part of the way map
+                                        * counting is done for shared PMDs.
+                                        * Return 'true' here.  When there is
+                                        * no other sharing, huge_pmd_unshare
+                                        * returns false and we will unmap the
+                                        * actual page and drop map count
+                                        * to zero.
+                                        */
+                                       page_vma_mapped_walk_done(&pvmw);
+                                       break;
+                               }
+                               hugetlb_vma_unlock_write(vma);
                         }
                         pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
                 } else {
@@ -1926,26 +1941,41 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                          * To call huge_pmd_unshare, i_mmap_rwsem must be
                          * held in write mode.  Caller needs to explicitly
                          * do this outside rmap routines.
+                        *
+                        * We also must hold hugetlb vma_lock in write mode.
+                        * Lock order dictates acquiring vma_lock BEFORE
+                        * i_mmap_rwsem.  We can only try lock here and
+                        * fail if unsuccessful.
                          */
-                       VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-                       if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
-                               flush_tlb_range(vma, range.start, range.end);
-                               mmu_notifier_invalidate_range(mm, range.start,
-                                                             range.end);
-
-                               /*
-                                * The ref count of the PMD page was dropped
-                                * which is part of the way map counting
-                                * is done for shared PMDs.  Return 'true'
-                                * here.  When there is no other sharing,
-                                * huge_pmd_unshare returns false and we will
-                                * unmap the actual page and drop map count
-                                * to zero.
-                                */
-                               page_vma_mapped_walk_done(&pvmw);
-                               break;
+                       if (!anon) {
+                               VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+                               if (!hugetlb_vma_trylock_write(vma)) {
+                                       page_vma_mapped_walk_done(&pvmw);
+                                       ret = false;
+                                       break;
+                               }
+                               if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+                                       hugetlb_vma_unlock_write(vma);
+                                       flush_tlb_range(vma,
+                                               range.start, range.end);
+                                       mmu_notifier_invalidate_range(mm,
+                                               range.start, range.end);
+
+                                       /*
+                                        * The ref count of the PMD page was
+                                        * dropped which is part of the way map
+                                        * counting is done for shared PMDs.
+                                        * Return 'true' here.  When there is
+                                        * no other sharing, huge_pmd_unshare
+                                        * returns false and we will unmap the
+                                        * actual page and drop map count
+                                        * to zero.
+                                        */
+                                       page_vma_mapped_walk_done(&pvmw);
+                                       break;
+                               }
+                               hugetlb_vma_unlock_write(vma);
                         }
-
                         /* Nuke the hugetlb page table entry */
                         pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
                 } else {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c

index 0fdbd2c05587d6f74924a62d971d3a3389d0f7e6..e24e8a47ce8a2805b8e5513845cd9912c370d02f 100644 (file)
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -379,16 +379,21 @@ retry:
                 BUG_ON(dst_addr >= dst_start + len);
  
                 /*
-                * Serialize via hugetlb_fault_mutex.
+                * Serialize via vma_lock and hugetlb_fault_mutex.
+                * vma_lock ensures the dst_pte remains valid even
+                * in the case of shared pmds.  fault mutex prevents
+                * races with other faulting threads.
                  */
                 idx = linear_page_index(dst_vma, dst_addr);
                 mapping = dst_vma->vm_file->f_mapping;
                 hash = hugetlb_fault_mutex_hash(mapping, idx);
                 mutex_lock(&hugetlb_fault_mutex_table[hash]);
+               hugetlb_vma_lock_read(dst_vma);
  
                 err = -ENOMEM;
                 dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
                 if (!dst_pte) {
+                       hugetlb_vma_unlock_read(dst_vma);
                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                         goto out_unlock;
                 }
@@ -396,6 +401,7 @@ retry:
                 if (mode != MCOPY_ATOMIC_CONTINUE &&
                     !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
                         err = -EEXIST;
+                       hugetlb_vma_unlock_read(dst_vma);
                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                         goto out_unlock;
                 }
@@ -404,6 +410,7 @@ retry:
                                                dst_addr, src_addr, mode, &page,
                                                wp_copy);
  
+               hugetlb_vma_unlock_read(dst_vma);
                 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
  
                 cond_resched();
author	Mike Kravetz <mike.kravetz@oracle.com>
	Wed, 14 Sep 2022 22:18:09 +0000 (15:18 -0700)
committer	Andrew Morton <akpm@linux-foundation.org>
	Mon, 3 Oct 2022 21:03:17 +0000 (14:03 -0700)
fs/hugetlbfs/inode.c		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history
mm/memory.c		patch \| blob \| history
mm/rmap.c		patch \| blob \| history
mm/userfaultfd.c		patch \| blob \| history