mm: hugetlb: alloc the vmemmap pages associated with each HugeTLB page

author Muchun Song <songmuchun@bytedance.com>

Thu, 1 Jul 2021 01:47:21 +0000 (18:47 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 1 Jul 2021 03:47:25 +0000 (20:47 -0700)
author Muchun Song <songmuchun@bytedance.com>
Thu, 1 Jul 2021 01:47:21 +0000 (18:47 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Jul 2021 03:47:25 +0000 (20:47 -0700)
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst

index f7b1c74..6988895 100644 (file)
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -60,6 +60,10 @@ HugePages_Surp
          the pool above the value in ``/proc/sys/vm/nr_hugepages``. The
          maximum number of surplus huge pages is controlled by
          ``/proc/sys/vm/nr_overcommit_hugepages``.
+       Note: When the feature of freeing unused vmemmap pages associated
+       with each hugetlb page is enabled, the number of surplus huge pages
+       may be temporarily larger than the maximum number of surplus huge
+       pages when the system is under memory pressure.
  Hugepagesize
         is the default hugepage size (in Kb).
  Hugetlb
@@ -80,6 +84,10 @@ returned to the huge page pool when freed by a task.  A user with root
  privileges can dynamically allocate more or free some persistent huge pages
  by increasing or decreasing the value of ``nr_hugepages``.
  
+Note: When the feature of freeing unused vmemmap pages associated with each
+hugetlb page is enabled, we can fail to free the huge pages triggered by
+the user when the system is under memory pressure.  Please try again later.
+
  Pages that are used as huge pages are reserved inside the kernel and cannot
  be used for other purposes.  Huge pages cannot be swapped out under
  memory pressure.
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst

index 05d51d2..c6bae2d 100644 (file)
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -357,6 +357,19 @@ creates ZONE_MOVABLE as following.
     Unfortunately, there is no information to show which memory block belongs
     to ZONE_MOVABLE. This is TBD.
  
+   Memory offlining can fail when dissolving a free huge page on ZONE_MOVABLE
+   and the feature of freeing unused vmemmap pages associated with each hugetlb
+   page is enabled.
+
+   This can happen when we have plenty of ZONE_MOVABLE memory, but not enough
+   kernel memory to allocate vmemmap pages.  We may even be able to migrate
+   huge page contents, but will not be able to dissolve the source huge page.
+   This will prevent an offline operation and is unfortunate as memory offlining
+   is expected to succeed on movable zones.  Users that depend on memory hotplug
+   to succeed for movable zones should carefully consider whether the memory
+   savings gained from this feature are worth the risk of possibly not being
+   able to offline memory in certain situations.
+
  .. note::
     Techniques that rely on long-term pinnings of memory (especially, RDMA and
     vfio) are fundamentally problematic with ZONE_MOVABLE and, therefore, memory
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h

index 0c8c964..3578d9d 100644 (file)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -532,12 +532,14 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
   *     modifications require hugetlb_lock.
   * HPG_freed - Set when page is on the free lists.
   *     Synchronization: hugetlb_lock held for examination and modification.
+ * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
   */
  enum hugetlb_page_flags {
         HPG_restore_reserve = 0,
         HPG_migratable,
         HPG_temporary,
         HPG_freed,
+       HPG_vmemmap_optimized,
         __NR_HPAGEFLAGS,
  };
  
@@ -583,6 +585,7 @@ HPAGEFLAG(RestoreReserve, restore_reserve)
  HPAGEFLAG(Migratable, migratable)
  HPAGEFLAG(Temporary, temporary)
  HPAGEFLAG(Freed, freed)
+HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
  
  #ifdef CONFIG_HUGETLB_PAGE
  
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 3437aa7..706bee9 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3078,6 +3078,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
  
  void vmemmap_remap_free(unsigned long start, unsigned long end,
                         unsigned long reuse);
+int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+                       unsigned long reuse, gfp_t gfp_mask);
  
  void *sparse_buffer_alloc(unsigned long size);
  struct page * __populate_section_memmap(unsigned long pfn,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index e7eb1ab..778db5d 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1376,6 +1376,39 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
         h->nr_huge_pages_node[nid]--;
  }
  
+static void add_hugetlb_page(struct hstate *h, struct page *page,
+                            bool adjust_surplus)
+{
+       int zeroed;
+       int nid = page_to_nid(page);
+
+       VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
+
+       lockdep_assert_held(&hugetlb_lock);
+
+       INIT_LIST_HEAD(&page->lru);
+       h->nr_huge_pages++;
+       h->nr_huge_pages_node[nid]++;
+
+       if (adjust_surplus) {
+               h->surplus_huge_pages++;
+               h->surplus_huge_pages_node[nid]++;
+       }
+
+       set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+       set_page_private(page, 0);
+       SetHPageVmemmapOptimized(page);
+
+       /*
+        * This page is now managed by the hugetlb allocator and has
+        * no users -- drop the last reference.
+        */
+       zeroed = put_page_testzero(page);
+       VM_BUG_ON_PAGE(!zeroed, page);
+       arch_clear_hugepage_flags(page);
+       enqueue_huge_page(h, page);
+}
+
  static void __update_and_free_page(struct hstate *h, struct page *page)
  {
         int i;
@@ -1384,6 +1417,18 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
         if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                 return;
  
+       if (alloc_huge_page_vmemmap(h, page)) {
+               spin_lock_irq(&hugetlb_lock);
+               /*
+                * If we cannot allocate vmemmap pages, just refuse to free the
+                * page and put the page back on the hugetlb free list and treat
+                * as a surplus page.
+                */
+               add_hugetlb_page(h, page, true);
+               spin_unlock_irq(&hugetlb_lock);
+               return;
+       }
+
         for (i = 0; i < pages_per_huge_page(h);
              i++, subpage = mem_map_next(subpage, page, i)) {
                 subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1450,7 +1495,7 @@ static inline void flush_free_hpage_work(struct hstate *h)
  static void update_and_free_page(struct hstate *h, struct page *page,
                                  bool atomic)
  {
-       if (!free_vmemmap_pages_per_hpage(h) || !atomic) {
+       if (!HPageVmemmapOptimized(page) || !atomic) {
                 __update_and_free_page(h, page);
                 return;
         }
@@ -1806,10 +1851,14 @@ static struct page *remove_pool_huge_page(struct hstate *h,
   * nothing for in-use hugepages and non-hugepages.
   * This function returns values like below:
   *
- *  -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
- *          (allocated or reserved.)
- *       0: successfully dissolved free hugepages or the page is not a
- *          hugepage (considered as already dissolved)
+ *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
+ *           when the system is under memory pressure and the feature of
+ *           freeing unused vmemmap pages associated with each hugetlb page
+ *           is enabled.
+ *  -EBUSY:  failed to dissolve free hugepages or the hugepage is in-use
+ *           (allocated or reserved.)
+ *       0:  successfully dissolved free hugepages or the page is not a
+ *           hugepage (considered as already dissolved)
   */
  int dissolve_free_huge_page(struct page *page)
  {
@@ -1851,19 +1900,38 @@ retry:
                         goto retry;
                 }
  
-               /*
-                * Move PageHWPoison flag from head page to the raw error page,
-                * which makes any subpages rather than the error page reusable.
-                */
-               if (PageHWPoison(head) && page != head) {
-                       SetPageHWPoison(page);
-                       ClearPageHWPoison(head);
-               }
                 remove_hugetlb_page(h, head, false);
                 h->max_huge_pages--;
                 spin_unlock_irq(&hugetlb_lock);
-               update_and_free_page(h, head, false);
-               return 0;
+
+               /*
+                * Normally update_and_free_page will allocate required vmemmap
+                * before freeing the page.  update_and_free_page will fail to
+                * free the page if it can not allocate required vmemmap.  We
+                * need to adjust max_huge_pages if the page is not freed.
+                * Attempt to allocate vmemmap here so that we can take
+                * appropriate action on failure.
+                */
+               rc = alloc_huge_page_vmemmap(h, head);
+               if (!rc) {
+                       /*
+                        * Move PageHWPoison flag from head page to the raw
+                        * error page, which makes any subpages rather than
+                        * the error page reusable.
+                        */
+                       if (PageHWPoison(head) && page != head) {
+                               SetPageHWPoison(page);
+                               ClearPageHWPoison(head);
+                       }
+                       update_and_free_page(h, head, false);
+               } else {
+                       spin_lock_irq(&hugetlb_lock);
+                       add_hugetlb_page(h, head, false);
+                       h->max_huge_pages++;
+                       spin_unlock_irq(&hugetlb_lock);
+               }
+
+               return rc;
         }
  out:
         spin_unlock_irq(&hugetlb_lock);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c

index cb28c5b..a897c77 100644 (file)
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -185,6 +185,38 @@ static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
         return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
  }
  
+/*
+ * Previously discarded vmemmap pages have been allocated and remapped
+ * when this function returns zero.
+ */
+int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+       int ret;
+       unsigned long vmemmap_addr = (unsigned long)head;
+       unsigned long vmemmap_end, vmemmap_reuse;
+
+       if (!HPageVmemmapOptimized(head))
+               return 0;
+
+       vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+       vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
+       vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+       /*
+        * The pages which the vmemmap virtual address range [@vmemmap_addr,
+        * @vmemmap_end) are mapped to are freed to the buddy allocator, and
+        * the range is mapped to the page which @vmemmap_reuse is mapped to.
+        * When a HugeTLB page is freed to the buddy allocator, previously
+        * discarded vmemmap pages must be allocated and remapped.
+        */
+       ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
+                                 GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
+
+       if (!ret)
+               ClearHPageVmemmapOptimized(head);
+
+       return ret;
+}
+
  void free_huge_page_vmemmap(struct hstate *h, struct page *head)
  {
         unsigned long vmemmap_addr = (unsigned long)head;
@@ -203,4 +235,6 @@ void free_huge_page_vmemmap(struct hstate *h, struct page *head)
          * which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
          */
         vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse);
+
+       SetHPageVmemmapOptimized(head);
  }
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h

index 01f8637..a37771b 100644 (file)
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -11,6 +11,7 @@
  #include <linux/hugetlb.h>
  
  #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+int alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
  void free_huge_page_vmemmap(struct hstate *h, struct page *head);
  
  /*
@@ -25,6 +26,11 @@ static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
         return 0;
  }
  #else
+static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+       return 0;
+}
+
  static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
  {
  }
diff --git a/mm/migrate.c b/mm/migrate.c

index 380ca57..cc4d6af 100644 (file)
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -626,7 +626,10 @@ void migrate_page_states(struct page *newpage, struct page *page)
         if (PageSwapCache(page))
                 ClearPageSwapCache(page);
         ClearPagePrivate(page);
-       set_page_private(page, 0);
+
+       /* page->private contains hugetlb specific flags */
+       if (!PageHuge(page))
+               set_page_private(page, 0);
  
         /*
          * If any waiters have accumulated on the new page then
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c

index 3ec5488..a3aa275 100644 (file)
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,7 +40,8 @@
   * @remap_pte:         called for each lowest-level entry (PTE).
   * @reuse_page:                the page which is reused for the tail vmemmap pages.
   * @reuse_addr:                the virtual address of the @reuse_page page.
- * @vmemmap_pages:     the list head of the vmemmap pages that can be freed.
+ * @vmemmap_pages:     the list head of the vmemmap pages that can be freed
+ *                     or are mapped from.
   */
  struct vmemmap_remap_walk {
         void (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -224,6 +225,78 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
         free_vmemmap_page_list(&vmemmap_pages);
  }
  
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+                               struct vmemmap_remap_walk *walk)
+{
+       pgprot_t pgprot = PAGE_KERNEL;
+       struct page *page;
+       void *to;
+
+       BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+       page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+       list_del(&page->lru);
+       to = page_to_virt(page);
+       copy_page(to, (void *)walk->reuse_addr);
+
+       set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
+                                  gfp_t gfp_mask, struct list_head *list)
+{
+       unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
+       int nid = page_to_nid((struct page *)start);
+       struct page *page, *next;
+
+       while (nr_pages--) {
+               page = alloc_pages_node(nid, gfp_mask, 0);
+               if (!page)
+                       goto out;
+               list_add_tail(&page->lru, list);
+       }
+
+       return 0;
+out:
+       list_for_each_entry_safe(page, next, list, lru)
+               __free_pages(page, 0);
+       return -ENOMEM;
+}
+
+/**
+ * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
+ *                      to the pages which are from the @vmemmap_pages
+ *                      respectively.
+ * @start:     start address of the vmemmap virtual address range that we want
+ *             to remap.
+ * @end:       end address of the vmemmap virtual address range that we want to
+ *             remap.
+ * @reuse:     reuse address.
+ * @gfp_mask:  GFP flag for allocating vmemmap pages.
+ */
+int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+                       unsigned long reuse, gfp_t gfp_mask)
+{
+       LIST_HEAD(vmemmap_pages);
+       struct vmemmap_remap_walk walk = {
+               .remap_pte      = vmemmap_restore_pte,
+               .reuse_addr     = reuse,
+               .vmemmap_pages  = &vmemmap_pages,
+       };
+
+       /* See the comment in the vmemmap_remap_free(). */
+       BUG_ON(start - reuse != PAGE_SIZE);
+
+       might_sleep_if(gfpflags_allow_blocking(gfp_mask));
+
+       if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+               return -ENOMEM;
+
+       vmemmap_remap_range(reuse, end, &walk);
+
+       return 0;
+}
+
  /*
   * Allocate a block of memory to be used to back the virtual memory map
   * or to back the page tables that are used to create the mapping.
author	Muchun Song <songmuchun@bytedance.com>
	Thu, 1 Jul 2021 01:47:21 +0000 (18:47 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 1 Jul 2021 03:47:25 +0000 (20:47 -0700)
Documentation/admin-guide/mm/hugetlbpage.rst		patch \| blob \| history
Documentation/admin-guide/mm/memory-hotplug.rst		patch \| blob \| history
include/linux/hugetlb.h		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history
mm/hugetlb_vmemmap.c		patch \| blob \| history
mm/hugetlb_vmemmap.h		patch \| blob \| history
mm/migrate.c		patch \| blob \| history
mm/sparse-vmemmap.c		patch \| blob \| history