page allocator: break up the allocator entry point into fast and slow paths

author Mel Gorman <mel@csn.ul.ie>

Tue, 16 Jun 2009 22:31:57 +0000 (15:31 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 17 Jun 2009 02:47:32 +0000 (19:47 -0700)
author Mel Gorman <mel@csn.ul.ie>
Tue, 16 Jun 2009 22:31:57 +0000 (15:31 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 17 Jun 2009 02:47:32 +0000 (19:47 -0700)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 6be8fcb..512bf9a 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1457,47 +1457,171 @@ try_next_zone:
         return page;
  }
  
-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-                       struct zonelist *zonelist, nodemask_t *nodemask)
+static inline int
+should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+                               unsigned long pages_reclaimed)
  {
-       const gfp_t wait = gfp_mask & __GFP_WAIT;
-       enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-       struct zoneref *z;
-       struct zone *zone;
-       struct page *page;
-       struct reclaim_state reclaim_state;
-       struct task_struct *p = current;
-       int do_retry;
-       int alloc_flags;
-       unsigned long did_some_progress;
-       unsigned long pages_reclaimed = 0;
+       /* Do not loop if specifically requested */
+       if (gfp_mask & __GFP_NORETRY)
+               return 0;
  
-       lockdep_trace_alloc(gfp_mask);
+       /*
+        * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+        * means __GFP_NOFAIL, but that may not be true in other
+        * implementations.
+        */
+       if (order <= PAGE_ALLOC_COSTLY_ORDER)
+               return 1;
  
-       might_sleep_if(wait);
+       /*
+        * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+        * specified, then we retry until we no longer reclaim any pages
+        * (above), or we've reclaimed an order of pages at least as
+        * large as the allocation's order. In both cases, if the
+        * allocation still fails, we stop retrying.
+        */
+       if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
+               return 1;
  
-       if (should_fail_alloc_page(gfp_mask, order))
-               return NULL;
+       /*
+        * Don't let big-order allocations loop unless the caller
+        * explicitly requests that.
+        */
+       if (gfp_mask & __GFP_NOFAIL)
+               return 1;
  
-       /* the list of zones suitable for gfp_mask */
-       z = zonelist->_zonerefs;
-       if (unlikely(!z->zone)) {
-               /*
-                * Happens if we have an empty zonelist as a result of
-                * GFP_THISNODE being used on a memoryless node
-                */
+       return 0;
+}
+
+static inline struct page *
+__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+       struct zonelist *zonelist, enum zone_type high_zoneidx,
+       nodemask_t *nodemask)
+{
+       struct page *page;
+
+       /* Acquire the OOM killer lock for the zones in zonelist */
+       if (!try_set_zone_oom(zonelist, gfp_mask)) {
+               schedule_timeout_uninterruptible(1);
                 return NULL;
         }
  
-restart:
-       page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
-                       zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+       /*
+        * Go through the zonelist yet one more time, keep very high watermark
+        * here, this is only to catch a parallel oom killing, we must fail if
+        * we're still under heavy pressure.
+        */
+       page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+               order, zonelist, high_zoneidx,
+               ALLOC_WMARK_HIGH|ALLOC_CPUSET);
         if (page)
-               goto got_pg;
+               goto out;
+
+       /* The OOM killer will not help higher order allocs */
+       if (order > PAGE_ALLOC_COSTLY_ORDER)
+               goto out;
+
+       /* Exhausted what can be done so it's blamo time */
+       out_of_memory(zonelist, gfp_mask, order);
+
+out:
+       clear_zonelist_oom(zonelist, gfp_mask);
+       return page;
+}
+
+/* The really slow allocator path where we enter direct reclaim */
+static inline struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+       struct zonelist *zonelist, enum zone_type high_zoneidx,
+       nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress)
+{
+       struct page *page = NULL;
+       struct reclaim_state reclaim_state;
+       struct task_struct *p = current;
+
+       cond_resched();
+
+       /* We now go into synchronous reclaim */
+       cpuset_memory_pressure_bump();
+
+       /*
+        * The task's cpuset might have expanded its set of allowable nodes
+        */
+       p->flags |= PF_MEMALLOC;
+       lockdep_set_current_reclaim_state(gfp_mask);
+       reclaim_state.reclaimed_slab = 0;
+       p->reclaim_state = &reclaim_state;
+
+       *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+
+       p->reclaim_state = NULL;
+       lockdep_clear_current_reclaim_state();
+       p->flags &= ~PF_MEMALLOC;
+
+       cond_resched();
+
+       if (order != 0)
+               drain_all_pages();
+
+       if (likely(*did_some_progress))
+               page = get_page_from_freelist(gfp_mask, nodemask, order,
+                                       zonelist, high_zoneidx, alloc_flags);
+       return page;
+}
+
+static inline int
+is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask)
+{
+       if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+                       && !in_interrupt())
+               return 1;
+       return 0;
+}
+
+/*
+ * This is called in the allocator slow-path if the allocation request is of
+ * sufficient urgency to ignore watermarks and take other desperate measures
+ */
+static inline struct page *
+__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+       struct zonelist *zonelist, enum zone_type high_zoneidx,
+       nodemask_t *nodemask)
+{
+       struct page *page;
+
+       do {
+               page = get_page_from_freelist(gfp_mask, nodemask, order,
+                       zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+
+               if (!page && gfp_mask & __GFP_NOFAIL)
+                       congestion_wait(WRITE, HZ/50);
+       } while (!page && (gfp_mask & __GFP_NOFAIL));
+
+       return page;
+}
+
+static inline
+void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
+                                               enum zone_type high_zoneidx)
+{
+       struct zoneref *z;
+       struct zone *zone;
+
+       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+               wakeup_kswapd(zone, order);
+}
+
+static inline struct page *
+__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+       struct zonelist *zonelist, enum zone_type high_zoneidx,
+       nodemask_t *nodemask)
+{
+       const gfp_t wait = gfp_mask & __GFP_WAIT;
+       struct page *page = NULL;
+       int alloc_flags;
+       unsigned long pages_reclaimed = 0;
+       unsigned long did_some_progress;
+       struct task_struct *p = current;
  
         /*
          * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1510,8 +1634,7 @@ restart:
         if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
                 goto nopage;
  
-       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-               wakeup_kswapd(zone, order);
+       wake_all_kswapd(order, zonelist, high_zoneidx);
  
         /*
          * OK, we're below the kswapd watermark and have kicked background
@@ -1531,6 +1654,7 @@ restart:
         if (wait)
                 alloc_flags |= ALLOC_CPUSET;
  
+restart:
         /*
          * Go through the zonelist again. Let __GFP_HIGH and allocations
          * coming from realtime tasks go deeper into reserves.
@@ -1544,23 +1668,18 @@ restart:
         if (page)
                 goto got_pg;
  
-       /* This allocation should allow future memory freeing. */
-
  rebalance:
-       if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
-                       && !in_interrupt()) {
+       /* Allocate without watermarks if the context allows */
+       if (is_allocation_high_priority(p, gfp_mask)) {
+               /* Do not dip into emergency reserves if specified */
                 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
-nofail_alloc:
-                       /* go through the zonelist yet again, ignoring mins */
-                       page = get_page_from_freelist(gfp_mask, nodemask, order,
-                               zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+                       page = __alloc_pages_high_priority(gfp_mask, order,
+                               zonelist, high_zoneidx, nodemask);
                         if (page)
                                 goto got_pg;
-                       if (gfp_mask & __GFP_NOFAIL) {
-                               congestion_wait(WRITE, HZ/50);
-                               goto nofail_alloc;
-                       }
                 }
+
+               /* Ensure no recursion into the allocator */
                 goto nopage;
         }
  
@@ -1568,93 +1687,42 @@ nofail_alloc:
         if (!wait)
                 goto nopage;
  
-       cond_resched();
-
-       /* We now go into synchronous reclaim */
-       cpuset_memory_pressure_bump();
-
-       p->flags |= PF_MEMALLOC;
-
-       lockdep_set_current_reclaim_state(gfp_mask);
-       reclaim_state.reclaimed_slab = 0;
-       p->reclaim_state = &reclaim_state;
-
-       did_some_progress = try_to_free_pages(zonelist, order,
-                                               gfp_mask, nodemask);
-
-       p->reclaim_state = NULL;
-       lockdep_clear_current_reclaim_state();
-       p->flags &= ~PF_MEMALLOC;
+       /* Try direct reclaim and then allocating */
+       page = __alloc_pages_direct_reclaim(gfp_mask, order,
+                                       zonelist, high_zoneidx,
+                                       nodemask,
+                                       alloc_flags, &did_some_progress);
+       if (page)
+               goto got_pg;
  
-       cond_resched();
+       /*
+        * If we failed to make any progress reclaiming, then we are
+        * running out of options and have to consider going OOM
+        */
+       if (!did_some_progress) {
+               if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+                       page = __alloc_pages_may_oom(gfp_mask, order,
+                                       zonelist, high_zoneidx,
+                                       nodemask);
+                       if (page)
+                               goto got_pg;
  
-       if (order != 0)
-               drain_all_pages();
+                       /*
+                        * The OOM killer does not trigger for high-order allocations
+                        * but if no progress is being made, there are no other
+                        * options and retrying is unlikely to help
+                        */
+                       if (order > PAGE_ALLOC_COSTLY_ORDER)
+                               goto nopage;
  
-       if (likely(did_some_progress)) {
-               page = get_page_from_freelist(gfp_mask, nodemask, order,
-                                       zonelist, high_zoneidx, alloc_flags);
-               if (page)
-                       goto got_pg;
-       } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
-               if (!try_set_zone_oom(zonelist, gfp_mask)) {
-                       schedule_timeout_uninterruptible(1);
                         goto restart;
                 }
-
-               /*
-                * Go through the zonelist yet one more time, keep
-                * very high watermark here, this is only to catch
-                * a parallel oom killing, we must fail if we're still
-                * under heavy pressure.
-                */
-               page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
-                       order, zonelist, high_zoneidx,
-                       ALLOC_WMARK_HIGH|ALLOC_CPUSET);
-               if (page) {
-                       clear_zonelist_oom(zonelist, gfp_mask);
-                       goto got_pg;
-               }
-
-               /* The OOM killer will not help higher order allocs so fail */
-               if (order > PAGE_ALLOC_COSTLY_ORDER) {
-                       clear_zonelist_oom(zonelist, gfp_mask);
-                       goto nopage;
-               }
-
-               out_of_memory(zonelist, gfp_mask, order);
-               clear_zonelist_oom(zonelist, gfp_mask);
-               goto restart;
         }
  
-       /*
-        * Don't let big-order allocations loop unless the caller explicitly
-        * requests that.  Wait for some write requests to complete then retry.
-        *
-        * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
-        * means __GFP_NOFAIL, but that may not be true in other
-        * implementations.
-        *
-        * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
-        * specified, then we retry until we no longer reclaim any pages
-        * (above), or we've reclaimed an order of pages at least as
-        * large as the allocation's order. In both cases, if the
-        * allocation still fails, we stop retrying.
-        */
+       /* Check if we should retry the allocation */
         pages_reclaimed += did_some_progress;
-       do_retry = 0;
-       if (!(gfp_mask & __GFP_NORETRY)) {
-               if (order <= PAGE_ALLOC_COSTLY_ORDER) {
-                       do_retry = 1;
-               } else {
-                       if (gfp_mask & __GFP_REPEAT &&
-                               pages_reclaimed < (1 << order))
-                                       do_retry = 1;
-               }
-               if (gfp_mask & __GFP_NOFAIL)
-                       do_retry = 1;
-       }
-       if (do_retry) {
+       if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+               /* Wait for some write requests to complete then retry */
                 congestion_wait(WRITE, HZ/50);
                 goto rebalance;
         }
@@ -1669,6 +1737,41 @@ nopage:
         }
  got_pg:
         return page;
+
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+                       struct zonelist *zonelist, nodemask_t *nodemask)
+{
+       enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+       struct page *page;
+
+       lockdep_trace_alloc(gfp_mask);
+
+       might_sleep_if(gfp_mask & __GFP_WAIT);
+
+       if (should_fail_alloc_page(gfp_mask, order))
+               return NULL;
+
+       /*
+        * Check the zones suitable for the gfp_mask contain at least one
+        * valid zone. It's possible to have an empty zonelist as a result
+        * of GFP_THISNODE and a memoryless node
+        */
+       if (unlikely(!zonelist->_zonerefs->zone))
+               return NULL;
+
+       page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+                       zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+       if (unlikely(!page))
+               page = __alloc_pages_slowpath(gfp_mask, order,
+                               zonelist, high_zoneidx, nodemask);
+
+       return page;
  }
  EXPORT_SYMBOL(__alloc_pages_nodemask);
author	Mel Gorman <mel@csn.ul.ie>
	Tue, 16 Jun 2009 22:31:57 +0000 (15:31 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 17 Jun 2009 02:47:32 +0000 (19:47 -0700)