hugetlb: add vma based lock for pmd sharing

author Mike Kravetz <mike.kravetz@oracle.com>

Wed, 14 Sep 2022 22:18:07 +0000 (15:18 -0700)

committer Andrew Morton <akpm@linux-foundation.org>

Mon, 3 Oct 2022 21:03:17 +0000 (14:03 -0700)
author Mike Kravetz <mike.kravetz@oracle.com>
Wed, 14 Sep 2022 22:18:07 +0000 (15:18 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Mon, 3 Oct 2022 21:03:17 +0000 (14:03 -0700)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h

index 4893d6d..7b70aa9 100644 (file)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -115,6 +115,12 @@ struct file_region {
  #endif
  };
  
+struct hugetlb_vma_lock {
+       struct kref refs;
+       struct rw_semaphore rw_sema;
+       struct vm_area_struct *vma;
+};
+
  extern struct resv_map *resv_map_alloc(void);
  void resv_map_release(struct kref *ref);
  
@@ -127,7 +133,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
                                                 long min_hpages);
  void hugepage_put_subpool(struct hugepage_subpool *spool);
  
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
+void hugetlb_dup_vma_private(struct vm_area_struct *vma);
  void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
  int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
  int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
@@ -215,6 +221,14 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
  struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
                              pgd_t *pgd, int flags);
  
+void hugetlb_vma_lock_read(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
+void hugetlb_vma_lock_write(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
+void hugetlb_vma_lock_release(struct kref *kref);
+
  int pmd_huge(pmd_t pmd);
  int pud_huge(pud_t pud);
  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -226,7 +240,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
  
  #else /* !CONFIG_HUGETLB_PAGE */
  
-static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma)
  {
  }
  
@@ -337,6 +351,31 @@ static inline int prepare_hugepage_range(struct file *file,
         return -EINVAL;
  }
  
+static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+       return 1;
+}
+
+static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
  static inline int pmd_huge(pmd_t pmd)
  {
         return 0;
diff --git a/kernel/fork.c b/kernel/fork.c

index 5046033..3d788f7 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -674,12 +674,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
                 }
  
                 /*
-                * Clear hugetlb-related page reserves for children. This only
-                * affects MAP_PRIVATE mappings. Faults generated by the child
-                * are not guaranteed to succeed, even if read-only
+                * Copy/update hugetlb private vma information.
                  */
                 if (is_vm_hugetlb_page(tmp))
-                       reset_vma_resv_huge_pages(tmp);
+                       hugetlb_dup_vma_private(tmp);
  
                 /* Link the vma into the MT */
                 mas.index = tmp->vm_start;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 482f7f3..f44b799 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -91,6 +91,8 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
  
  /* Forward declaration */
  static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
  
  static inline bool subpool_is_free(struct hugepage_subpool *spool)
  {
@@ -859,7 +861,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
   * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
   * is guaranteed to have their future faults succeed.
   *
- * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * With the exception of hugetlb_dup_vma_private() which is called at fork(),
   * the reserve counters are updated with the hugetlb_lock held. It is safe
   * to reset the VMA at fork() time as it is not in use yet and there is no
   * chance of the global counters getting corrupted as a result of the values.
@@ -1006,12 +1008,20 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
         return (get_vma_private_data(vma) & flag) != 0;
  }
  
-/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+void hugetlb_dup_vma_private(struct vm_area_struct *vma)
  {
         VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+       /*
+        * Clear vm_private_data
+        * - For MAP_PRIVATE mappings, this is the reserve map which does
+        *   not apply to children.  Faults generated by the children are
+        *   not guaranteed to succeed, even if read-only.
+        * - For shared mappings this is a per-vma semaphore that may be
+        *   allocated in a subsequent call to hugetlb_vm_op_open.
+        */
+       vma->vm_private_data = (void *)0;
         if (!(vma->vm_flags & VM_MAYSHARE))
-               vma->vm_private_data = (void *)0;
+               return;
  }
  
  /*
@@ -1042,7 +1052,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
                 kref_put(&reservations->refs, resv_map_release);
         }
  
-       reset_vma_resv_huge_pages(vma);
+       hugetlb_dup_vma_private(vma);
  }
  
  /* Returns true if the VMA has associated reserve pages */
@@ -4623,16 +4633,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
                 resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
                 kref_get(&resv->refs);
         }
+
+       hugetlb_vma_lock_alloc(vma);
  }
  
  static void hugetlb_vm_op_close(struct vm_area_struct *vma)
  {
         struct hstate *h = hstate_vma(vma);
-       struct resv_map *resv = vma_resv_map(vma);
+       struct resv_map *resv;
         struct hugepage_subpool *spool = subpool_vma(vma);
         unsigned long reserve, start, end;
         long gbl_reserve;
  
+       hugetlb_vma_lock_free(vma);
+
+       resv = vma_resv_map(vma);
         if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                 return;
  
@@ -6440,6 +6455,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
         }
  
         /*
+        * vma specific semaphore used for pmd sharing synchronization
+        */
+       hugetlb_vma_lock_alloc(vma);
+
+       /*
          * Only apply hugepage reservation if asked. At fault time, an
          * attempt will be made for VM_NORESERVE to allocate a page
          * without using reserves
@@ -6462,12 +6482,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
                 resv_map = inode_resv_map(inode);
  
                 chg = region_chg(resv_map, from, to, &regions_needed);
-
         } else {
                 /* Private mapping. */
                 resv_map = resv_map_alloc();
                 if (!resv_map)
-                       return false;
+                       goto out_err;
  
                 chg = to - from;
  
@@ -6562,6 +6581,7 @@ out_uncharge_cgroup:
         hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
                                             chg * pages_per_huge_page(h), h_cg);
  out_err:
+       hugetlb_vma_lock_free(vma);
         if (!vma || vma->vm_flags & VM_MAYSHARE)
                 /* Only call region_abort if the region_chg succeeded but the
                  * region_add failed or didn't run.
@@ -6641,14 +6661,34 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
  }
  
  static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma,
-                               unsigned long start, unsigned long end)
+                               unsigned long start, unsigned long end,
+                               bool check_vma_lock)
  {
+#ifdef CONFIG_USERFAULTFD
+       if (uffd_disable_huge_pmd_share(vma))
+               return false;
+#endif
         /*
          * check on proper vm_flags and page table alignment
          */
-       if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end))
-               return true;
-       return false;
+       if (!(vma->vm_flags & VM_MAYSHARE))
+               return false;
+       if (check_vma_lock && !vma->vm_private_data)
+               return false;
+       if (!range_in_vma(vma, start, end))
+               return false;
+       return true;
+}
+
+static bool vma_pmd_shareable(struct vm_area_struct *vma)
+{
+       unsigned long start = ALIGN(vma->vm_start, PUD_SIZE),
+                     end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+       if (start >= end)
+               return false;
+
+       return __vma_aligned_range_pmd_shareable(vma, start, end, false);
  }
  
  static bool vma_addr_pmd_shareable(struct vm_area_struct *vma,
@@ -6657,15 +6697,11 @@ static bool vma_addr_pmd_shareable(struct vm_area_struct *vma,
         unsigned long start = addr & PUD_MASK;
         unsigned long end = start + PUD_SIZE;
  
-       return __vma_aligned_range_pmd_shareable(vma, start, end);
+       return __vma_aligned_range_pmd_shareable(vma, start, end, true);
  }
  
  bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
  {
-#ifdef CONFIG_USERFAULTFD
-       if (uffd_disable_huge_pmd_share(vma))
-               return false;
-#endif
         return vma_addr_pmd_shareable(vma, addr);
  }
  
@@ -6696,6 +6732,130 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                 *end = ALIGN(*end, PUD_SIZE);
  }
  
+static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
+               vma->vm_private_data;
+}
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               down_read(&vma_lock->rw_sema);
+       }
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               up_read(&vma_lock->rw_sema);
+       }
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               down_write(&vma_lock->rw_sema);
+       }
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               up_write(&vma_lock->rw_sema);
+       }
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+       struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+       if (!__vma_shareable_flags_pmd(vma))
+               return 1;
+
+       return down_write_trylock(&vma_lock->rw_sema);
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               lockdep_assert_held(&vma_lock->rw_sema);
+       }
+}
+
+void hugetlb_vma_lock_release(struct kref *kref)
+{
+       struct hugetlb_vma_lock *vma_lock = container_of(kref,
+                       struct hugetlb_vma_lock, refs);
+
+       kfree(vma_lock);
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+       /*
+        * Only present in sharable vmas.  See comment in
+        * __unmap_hugepage_range_final about how VM_SHARED could
+        * be set without VM_MAYSHARE.  As a result, we need to
+        * check if either is set in the free path.
+        */
+       if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
+               return;
+
+       if (vma->vm_private_data) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               /*
+                * vma_lock structure may or may not be released, but it
+                * certainly will no longer be attached to vma so clear
+                * pointer.
+                */
+               vma_lock->vma = NULL;
+               kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+               vma->vm_private_data = NULL;
+       }
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+       struct hugetlb_vma_lock *vma_lock;
+
+       /* Only establish in vmas whose flags permit sharing (VM_MAYSHARE) */
+       if (!vma || !(vma->vm_flags & VM_MAYSHARE))
+               return;
+
+       /* Should never get here with non-NULL vm_private_data */
+       if (vma->vm_private_data)
+               return;
+
+       /* Check size/alignment for pmd sharing possible */
+       if (!vma_pmd_shareable(vma))
+               return;
+
+       vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
+       if (!vma_lock)
+               /*
+                * If we cannot allocate the structure, then the vma cannot
+                * participate in pmd sharing.
+                */
+               return;
+
+       kref_init(&vma_lock->refs);
+       init_rwsem(&vma_lock->rw_sema);
+       vma_lock->vma = vma;
+       vma->vm_private_data = vma_lock;
+}
+
  /*
   * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
   * and returns the corresponding pte. While this is not necessary for the
@@ -6782,6 +6942,19 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
  }
  
  #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+void hugetlb_vma_lock_release(struct kref *kref)
+{
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+}
+
  pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                       unsigned long addr, pud_t *pud)
  {
diff --git a/mm/rmap.c b/mm/rmap.c

index 2a08647..0e179c8 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
   *   mm->mmap_lock
   *     mapping->invalidate_lock (in filemap_fault)
   *       page->flags PG_locked (lock_page)
- *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
   *           mapping->i_mmap_rwsem
   *             anon_vma->rwsem
   *               mm->page_table_lock or pte_lock
@@ -44,6 +44,12 @@
   * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
   *   ->tasklist_lock
   *     pte map lock
+ *
+ * hugetlbfs PageHuge() pages take locks in this order:
+ *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ *     vma_lock (hugetlb specific lock for pmd_sharing)
+ *       mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
+ *         page->flags PG_locked (lock_page)
   */
  
  #include <linux/mm.h>
author	Mike Kravetz <mike.kravetz@oracle.com>
	Wed, 14 Sep 2022 22:18:07 +0000 (15:18 -0700)
committer	Andrew Morton <akpm@linux-foundation.org>
	Mon, 3 Oct 2022 21:03:17 +0000 (14:03 -0700)
include/linux/hugetlb.h		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history
mm/rmap.c		patch \| blob \| history