hugetlb: Try to grow hugetlb pool for MAP_PRIVATE mappings

author Adam Litke <agl@us.ibm.com>

Tue, 16 Oct 2007 08:26:18 +0000 (01:26 -0700)

committer Linus Torvalds <torvalds@woody.linux-foundation.org>

Tue, 16 Oct 2007 16:43:02 +0000 (09:43 -0700)
author Adam Litke <agl@us.ibm.com>
Tue, 16 Oct 2007 08:26:18 +0000 (01:26 -0700)
committer Linus Torvalds <torvalds@woody.linux-foundation.org>
Tue, 16 Oct 2007 16:43:02 +0000 (09:43 -0700)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index ba029d6..8768e52 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -23,10 +23,12 @@
  
  const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
  static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
+static unsigned long surplus_huge_pages;
  unsigned long max_huge_pages;
  static struct list_head hugepage_freelists[MAX_NUMNODES];
  static unsigned int nr_huge_pages_node[MAX_NUMNODES];
  static unsigned int free_huge_pages_node[MAX_NUMNODES];
+static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
  static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
  unsigned long hugepages_treat_as_movable;
  
@@ -109,15 +111,57 @@ static void update_and_free_page(struct page *page)
  
  static void free_huge_page(struct page *page)
  {
-       BUG_ON(page_count(page));
+       int nid = page_to_nid(page);
  
+       BUG_ON(page_count(page));
         INIT_LIST_HEAD(&page->lru);
  
         spin_lock(&hugetlb_lock);
-       enqueue_huge_page(page);
+       if (surplus_huge_pages_node[nid]) {
+               update_and_free_page(page);
+               surplus_huge_pages--;
+               surplus_huge_pages_node[nid]--;
+       } else {
+               enqueue_huge_page(page);
+       }
         spin_unlock(&hugetlb_lock);
  }
  
+/*
+ * Increment or decrement surplus_huge_pages.  Keep node-specific counters
+ * balanced by operating on them in a round-robin fashion.
+ * Returns 1 if an adjustment was made, 0 if no node could be adjusted.
+ */
+static int adjust_pool_surplus(int delta)
+{
+       static int prev_nid;
+       int nid = prev_nid;
+       int ret = 0;
+
+       VM_BUG_ON(delta != -1 && delta != 1);
+       do {
+               nid = next_node(nid, node_online_map);
+               if (nid == MAX_NUMNODES)
+                       nid = first_node(node_online_map);
+
+               /* To shrink on this node, there must be a surplus page */
+               if (delta < 0 && !surplus_huge_pages_node[nid])
+                       continue;
+               /* Surplus cannot exceed the total number of pages */
+               if (delta > 0 && surplus_huge_pages_node[nid] >=
+                                               nr_huge_pages_node[nid])
+                       continue;
+
+               surplus_huge_pages += delta;
+               surplus_huge_pages_node[nid] += delta;
+               ret = 1;
+               break;
+       } while (nid != prev_nid);
+
+       prev_nid = nid;
+       return ret;
+}
+
  static int alloc_fresh_huge_page(void)
  {
         static int prev_nid;
@@ -150,10 +194,30 @@ static int alloc_fresh_huge_page(void)
         return 0;
  }
  
+static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
+                                               unsigned long address)
+{
+       struct page *page;
+
+       page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
+                                       HUGETLB_PAGE_ORDER);
+       if (page) {
+               set_compound_page_dtor(page, free_huge_page);
+               spin_lock(&hugetlb_lock);
+               nr_huge_pages++;
+               nr_huge_pages_node[page_to_nid(page)]++;
+               surplus_huge_pages++;
+               surplus_huge_pages_node[page_to_nid(page)]++;
+               spin_unlock(&hugetlb_lock);
+       }
+
+       return page;
+}
+
  static struct page *alloc_huge_page(struct vm_area_struct *vma,
                                     unsigned long addr)
  {
-       struct page *page;
+       struct page *page = NULL;
  
         spin_lock(&hugetlb_lock);
         if (vma->vm_flags & VM_MAYSHARE)
@@ -173,7 +237,16 @@ fail:
         if (vma->vm_flags & VM_MAYSHARE)
                 resv_huge_pages++;
         spin_unlock(&hugetlb_lock);
-       return NULL;
+
+       /*
+        * Private mappings do not use reserved huge pages so the allocation
+        * may have failed due to an undersized hugetlb pool.  Try to grab a
+        * surplus huge page from the buddy allocator.
+        */
+       if (!(vma->vm_flags & VM_MAYSHARE))
+               page = alloc_buddy_huge_page(vma, addr);
+
+       return page;
  }
  
  static int __init hugetlb_init(void)
@@ -241,26 +314,62 @@ static inline void try_to_free_low(unsigned long count)
  }
  #endif
  
+#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
  static unsigned long set_max_huge_pages(unsigned long count)
  {
-       while (count > nr_huge_pages) {
-               if (!alloc_fresh_huge_page())
-                       return nr_huge_pages;
-       }
-       if (count >= nr_huge_pages)
-               return nr_huge_pages;
+       unsigned long min_count, ret;
  
+       /*
+        * Increase the pool size
+        * First take pages out of surplus state.  Then make up the
+        * remaining difference by allocating fresh huge pages.
+        */
         spin_lock(&hugetlb_lock);
-       count = max(count, resv_huge_pages);
-       try_to_free_low(count);
-       while (count < nr_huge_pages) {
+       while (surplus_huge_pages && count > persistent_huge_pages) {
+               if (!adjust_pool_surplus(-1))
+                       break;
+       }
+
+       while (count > persistent_huge_pages) {
+               int ret;
+               /*
+                * If this allocation races such that we no longer need the
+                * page, free_huge_page will handle it by freeing the page
+                * and reducing the surplus.
+                */
+               spin_unlock(&hugetlb_lock);
+               ret = alloc_fresh_huge_page();
+               spin_lock(&hugetlb_lock);
+               if (!ret)
+                       goto out;
+
+       }
+       if (count >= persistent_huge_pages)
+               goto out;
+
+       /*
+        * Decrease the pool size
+        * First return free pages to the buddy allocator (being careful
+        * to keep enough around to satisfy reservations).  Then place
+        * pages into surplus state as needed so the pool will shrink
+        * to the desired size as pages become free.
+        */
+       min_count = max(count, resv_huge_pages);
+       try_to_free_low(min_count);
+       while (min_count < persistent_huge_pages) {
                 struct page *page = dequeue_huge_page(NULL, 0);
                 if (!page)
                         break;
                 update_and_free_page(page);
         }
+       while (count < persistent_huge_pages) {
+               if (!adjust_pool_surplus(1))
+                       break;
+       }
+out:
+       ret = persistent_huge_pages;
         spin_unlock(&hugetlb_lock);
-       return nr_huge_pages;
+       return ret;
  }
  
  int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -292,10 +401,12 @@ int hugetlb_report_meminfo(char *buf)
                         "HugePages_Total: %5lu\n"
                         "HugePages_Free:  %5lu\n"
                         "HugePages_Rsvd:  %5lu\n"
+                       "HugePages_Surp:  %5lu\n"
                         "Hugepagesize:    %5lu kB\n",
                         nr_huge_pages,
                         free_huge_pages,
                         resv_huge_pages,
+                       surplus_huge_pages,
                         HPAGE_SIZE/1024);
  }
author	Adam Litke <agl@us.ibm.com>
	Tue, 16 Oct 2007 08:26:18 +0000 (01:26 -0700)
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>
	Tue, 16 Oct 2007 16:43:02 +0000 (09:43 -0700)