hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race

author Mike Kravetz <mike.kravetz@oracle.com>

Wed, 14 Sep 2022 22:18:02 +0000 (15:18 -0700)

committer Andrew Morton <akpm@linux-foundation.org>

Mon, 3 Oct 2022 21:03:16 +0000 (14:03 -0700)
author Mike Kravetz <mike.kravetz@oracle.com>
Wed, 14 Sep 2022 22:18:02 +0000 (15:18 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Mon, 3 Oct 2022 21:03:16 +0000 (14:03 -0700)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c

index f7a5b51..a32031e 100644 (file)
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -419,9 +419,10 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
   *     In this case, we first scan the range and release found pages.
   *     After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
   *     maps and global counts.  Page faults can not race with truncation
- *     in this routine.  hugetlb_no_page() holds i_mmap_rwsem and prevents
- *     page faults in the truncated range by checking i_size.  i_size is
- *     modified while holding i_mmap_rwsem.
+ *     in this routine.  hugetlb_no_page() prevents page faults in the
+ *     truncated range.  It checks i_size before allocation, and again after
+ *     with the page table lock for the page held.  The same lock must be
+ *     acquired to unmap a page.
   * hole punch is indicated if end is not LLONG_MAX
   *     In the hole punch case we scan the range and release found pages.
   *     Only when releasing a page is the associated region/reserve map
@@ -451,16 +452,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                         u32 hash = 0;
  
                         index = folio->index;
-                       if (!truncate_op) {
-                               /*
-                                * Only need to hold the fault mutex in the
-                                * hole punch case.  This prevents races with
-                                * page faults.  Races are not possible in the
-                                * case of truncation.
-                                */
-                               hash = hugetlb_fault_mutex_hash(mapping, index);
-                               mutex_lock(&hugetlb_fault_mutex_table[hash]);
-                       }
+                       hash = hugetlb_fault_mutex_hash(mapping, index);
+                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
  
                         /*
                          * If folio is mapped, it was faulted in after being
@@ -504,8 +497,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                         }
  
                         folio_unlock(folio);
-                       if (!truncate_op)
-                               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                 }
                 folio_batch_release(&fbatch);
                 cond_resched();
@@ -543,8 +535,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
         BUG_ON(offset & ~huge_page_mask(h));
         pgoff = offset >> PAGE_SHIFT;
  
-       i_mmap_lock_write(mapping);
         i_size_write(inode, offset);
+       i_mmap_lock_write(mapping);
         if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
                 hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
                                       ZAP_FLAG_DROP_MARKER);
@@ -703,11 +695,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
                 /* addr is the offset within the file (zero based) */
                 addr = index * hpage_size;
  
-               /*
-                * fault mutex taken here, protects against fault path
-                * and hole punch.  inode_lock previously taken protects
-                * against truncation.
-                */
+               /* mutex taken here, fault path and hole punch */
                 hash = hugetlb_fault_mutex_hash(mapping, index);
                 mutex_lock(&hugetlb_fault_mutex_table[hash]);
  
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index d4347ae..14afb5b 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5560,17 +5560,15 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         }
  
         /*
-        * We can not race with truncation due to holding i_mmap_rwsem.
-        * i_size is modified when holding i_mmap_rwsem, so check here
-        * once for faults beyond end of file.
+        * Use page lock to guard against racing truncation
+        * before we get page_table_lock.
          */
-       size = i_size_read(mapping->host) >> huge_page_shift(h);
-       if (idx >= size)
-               goto out;
-
         new_page = false;
         page = find_lock_page(mapping, idx);
         if (!page) {
+               size = i_size_read(mapping->host) >> huge_page_shift(h);
+               if (idx >= size)
+                       goto out;
                 /* Check for page in userfault range */
                 if (userfaultfd_missing(vma)) {
                         ret = hugetlb_handle_userfault(vma, mapping, idx,
@@ -5666,6 +5664,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         }
  
         ptl = huge_pte_lock(h, mm, ptep);
+       size = i_size_read(mapping->host) >> huge_page_shift(h);
+       if (idx >= size)
+               goto backout;
+
         ret = 0;
         /* If pte changed from under us, retry */
         if (!pte_same(huge_ptep_get(ptep), old_pte))
@@ -5774,10 +5776,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  
         /*
          * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-        * until finished with ptep.  This serves two purposes:
-        * 1) It prevents huge_pmd_unshare from being called elsewhere
-        *    and making the ptep no longer valid.
-        * 2) It synchronizes us with i_size modifications during truncation.
+        * until finished with ptep.  This prevents huge_pmd_unshare from
+        * being called elsewhere and making the ptep no longer valid.
          *
          * ptep could have already be assigned via huge_pte_offset.  That
          * is OK, as huge_pte_alloc will return the same value unless
author	Mike Kravetz <mike.kravetz@oracle.com>
	Wed, 14 Sep 2022 22:18:02 +0000 (15:18 -0700)
committer	Andrew Morton <akpm@linux-foundation.org>
	Mon, 3 Oct 2022 21:03:16 +0000 (14:03 -0700)
fs/hugetlbfs/inode.c		patch | blob | history
mm/hugetlb.c		patch | blob | history