mm: fix off-by-two in __zone_watermark_ok()
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bdc804c..59153da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/memcontrol.h>
 #include <linux/prefetch.h>
+#include <linux/page-debug-flags.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states);
 
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory.  This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void)
        saved_gfp_mask = gfp_allowed_mask;
        gfp_allowed_mask &= ~GFP_IOFS;
 }
+
+bool pm_suspended_storage(void)
+{
+       if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+               return false;
+       return true;
+}
 #endif /* CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -311,8 +327,8 @@ out:
  *
  * The remaining PAGE_SIZE pages are called "tail pages".
  *
- * All pages have PG_compound set.  All pages have their ->private pointing at
- * the head page (even the head page has this).
+ * All pages have PG_compound set.  All tail pages have their ->first_page
+ * pointing at the head page.
  *
  * The first tail page's ->lru.next holds the address of the compound page's
  * put_page() function.  Its ->lru.prev holds the order of allocation.
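
With every tail page's ->first_page pointing at the head, finding the head of a
compound page no longer goes through ->private. A minimal sketch of the lookup
this layout enables (compound_head() is defined in the mm headers, not in this
hunk; it is shown here only to illustrate the pointer chase):

    static inline struct page *compound_head(struct page *page)
    {
            if (unlikely(PageTail(page)))
                    return page->first_page;
            return page;
    }
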
@@ -381,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
                clear_highpage(page + i);
 }
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+       unsigned long res;
+
+       if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+               printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+               return 0;
+       }
+       _debug_guardpage_minorder = res;
+       printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+       return 0;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
+{
+       __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+
+static inline void clear_page_guard_flag(struct page *page)
+{
+       __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
 static inline void set_page_order(struct page *page, int order)
 {
        set_page_private(page, order);
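
The guard-page checks below also rely on two helpers that are not part of this
hunk, debug_guardpage_minorder() and page_is_guard(). A hedged sketch of what
they presumably look like in the companion header, mirroring the flag helpers
added above (an assumption, not taken from this patch):

    #ifdef CONFIG_DEBUG_PAGEALLOC
    extern unsigned int _debug_guardpage_minorder;

    static inline unsigned int debug_guardpage_minorder(void)
    {
            return _debug_guardpage_minorder;
    }

    static inline bool page_is_guard(struct page *page)
    {
            return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
    }
    #else
    static inline unsigned int debug_guardpage_minorder(void) { return 0; }
    static inline bool page_is_guard(struct page *page) { return false; }
    #endif
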
@@ -438,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
        if (page_zone_id(page) != page_zone_id(buddy))
                return 0;
 
+       if (page_is_guard(buddy) && page_order(buddy) == order) {
+               VM_BUG_ON(page_count(buddy) != 0);
+               return 1;
+       }
+
        if (PageBuddy(buddy) && page_order(buddy) == order) {
                VM_BUG_ON(page_count(buddy) != 0);
                return 1;
@@ -494,11 +546,19 @@ static inline void __free_one_page(struct page *page,
                buddy = page + (buddy_idx - page_idx);
                if (!page_is_buddy(page, buddy, order))
                        break;
-
-               /* Our buddy is free, merge with it and move up one order. */
-               list_del(&buddy->lru);
-               zone->free_area[order].nr_free--;
-               rmv_page_order(buddy);
+               /*
+                * Our buddy is free or it is a CONFIG_DEBUG_PAGEALLOC guard
+                * page; merge with it and move up one order.
+                */
+               if (page_is_guard(buddy)) {
+                       clear_page_guard_flag(buddy);
+                       set_page_private(page, 0);
+                       __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+               } else {
+                       list_del(&buddy->lru);
+                       zone->free_area[order].nr_free--;
+                       rmv_page_order(buddy);
+               }
                combined_idx = buddy_idx & page_idx;
                page = page + (combined_idx - page_idx);
                page_idx = combined_idx;
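
The merge loop's index arithmetic depends only on power-of-two alignment: a
block's buddy differs from it in exactly bit 'order', and the combined block
starts at the lower of the two indexes. A standalone worked example of the same
arithmetic (plain user-space C, purely illustrative):

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_idx = 8;
            unsigned int order = 3;
            unsigned long buddy_idx = page_idx ^ (1UL << order);   /* 0 */
            unsigned long combined_idx = buddy_idx & page_idx;     /* 0 */

            /* Merging index 8 with its order-3 buddy at index 0 yields
               an order-4 block that starts at index 0. */
            printf("buddy=%lu combined=%lu\n", buddy_idx, combined_idx);
            return 0;
    }
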
@@ -632,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
        int i;
        int bad = 0;
 
-       trace_mm_page_free_direct(page, order);
+       trace_mm_page_free(page, order);
        kmemcheck_free_shadow(page, order);
 
        if (PageAnon(page))
@@ -724,6 +784,23 @@ static inline void expand(struct zone *zone, struct page *page,
                high--;
                size >>= 1;
                VM_BUG_ON(bad_range(zone, &page[size]));
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+               if (high < debug_guardpage_minorder()) {
+                       /*
+                        * Mark as guard pages (or page), so they can be merged
+                        * back into the allocator when the buddy is freed.
+                        * The page table entries are not touched; the pages
+                        * stay not present in the virtual address space.
+                        */
+                       INIT_LIST_HEAD(&page[size].lru);
+                       set_page_guard_flag(&page[size]);
+                       set_page_private(&page[size], high);
+                       /* Guard pages are not available for any usage */
+                       __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+                       continue;
+               }
+#endif
                list_add(&page[size].lru, &area->free_list[migratetype]);
                area->nr_free++;
                set_page_order(&page[size], high);
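
For a concrete feel of the new branch: carving an order-0 page out of an
order-3 block makes expand() visit high = 2, 1, 0; with debug_guardpage_minorder
set to 2, only the order-2 half goes back on the free list, while the smaller
halves become guard pages and are subtracted from NR_FREE_PAGES. A tiny
user-space sketch of that decision (the boot parameter value is an assumption
chosen for the example):

    #include <stdio.h>

    int main(void)
    {
            unsigned int low = 0, high = 3;    /* order-0 from an order-3 block */
            unsigned int minorder = 2;         /* debug_guardpage_minorder=2 */

            while (high > low) {
                    high--;
                    printf("order-%u half -> %s\n", high,
                           high < minorder ? "guard pages" : "free list");
            }
            return 0;
    }
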
@@ -1189,6 +1266,19 @@ out:
 }
 
 /*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+       struct page *page, *next;
+
+       list_for_each_entry_safe(page, next, list, lru) {
+               trace_mm_page_free_batched(page, cold);
+               free_hot_cold_page(page, cold);
+       }
+}
+
+/*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
  * Each sub-page must be freed individually.
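
free_hot_cold_page_list() becomes the batched free path that replaces the
pagevec-based __pagevec_free() removed further down. A brief usage sketch (the
surrounding list construction is illustrative, not taken from this patch):

    LIST_HEAD(pages_to_free);

    /* collect order-0 pages whose last reference has been dropped */
    list_add(&page->lru, &pages_to_free);

    /* free them in one traced, batched pass; 0 selects the "hot" path */
    free_hot_cold_page_list(&pages_to_free, 0);
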
@@ -1386,7 +1476,7 @@ static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 
 static int __init fail_page_alloc_debugfs(void)
 {
-       mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+       umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
        struct dentry *dir;
 
        dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
@@ -1435,7 +1525,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
        long min = mark;
        int o;
 
-       free_pages -= (1 << order) + 1;
+       free_pages -= (1 << order) - 1;
        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
        if (alloc_flags & ALLOC_HARDER)
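
The corrected subtraction asks for exactly 'mark' free pages to remain after
the request, with the first page of the request accounted for by the strict
comparison in the function. A standalone worked example of that arithmetic,
ignoring lowmem_reserve and the per-order loop:

    #include <stdio.h>

    int main(void)
    {
            long mark = 32, free_pages = 40;
            unsigned int order = 3;                 /* an 8-page request */

            /* __zone_watermark_ok() effectively requires
               free_pages - ((1 << order) - 1) > mark. */
            long adjusted = free_pages - ((1L << order) - 1);
            printf("adjusted=%ld -> %s\n", adjusted,
                   adjusted > mark ? "ok" : "fail"); /* 33 > 32: ok */

            /* The old "+ 1" form would compute 31 and wrongly fail even
               though taking 8 pages leaves 40 - 8 = 32 >= mark. */
            return 0;
    }
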
@@ -1645,6 +1735,35 @@ zonelist_scan:
                if ((alloc_flags & ALLOC_CPUSET) &&
                        !cpuset_zone_allowed_softwall(zone, gfp_mask))
                                continue;
+               /*
+                * When allocating a page cache page for writing, we
+                * want to get it from a zone that is within its dirty
+                * limit, such that no single zone holds more than its
+                * proportional share of globally allowed dirty pages.
+                * The dirty limits take into account the zone's
+                * lowmem reserves and high watermark so that kswapd
+                * should be able to balance it without having to
+                * write pages from its LRU list.
+                *
+                * This may look like it could increase pressure on
+                * lower zones by failing allocations in higher zones
+                * before they are full.  But the pages that do spill
+                * over are limited as the lower zones are protected
+                * by this very same mechanism.  It should not become
+                * a practical burden to them.
+                *
+                * XXX: For now, allow allocations to potentially
+                * exceed the per-zone dirty limit in the slowpath
+                * (ALLOC_WMARK_LOW unset) before going into reclaim,
+                * which is important when on a NUMA setup the allowed
+                * zones are together not big enough to reach the
+                * global limit.  The proper fix for these situations
+                * will require awareness of zones in the
+                * dirty-throttling and the flusher threads.
+                */
+               if ((alloc_flags & ALLOC_WMARK_LOW) &&
+                   (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+                       goto this_zone_full;
 
                BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
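
zone_dirty_ok() itself lives in the writeback code rather than in this file. A
hedged sketch of the comparison it presumably makes, following the
dirtyable-memory reasoning in the comment above (the zone_dirty_limit() helper
and the exact set of counters are assumptions):

    static bool zone_dirty_ok_sketch(struct zone *zone)
    {
            /* share of the zone's dirtyable memory this zone may dirty */
            unsigned long limit = zone_dirty_limit(zone);   /* assumed helper */

            return zone_page_state(zone, NR_FILE_DIRTY) +
                   zone_page_state(zone, NR_WRITEBACK) <= limit;
    }
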
@@ -1734,7 +1853,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 {
        unsigned int filter = SHOW_MEM_FILTER_NODES;
 
-       if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+       if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+           debug_guardpage_minorder() > 0)
                return;
 
        /*
@@ -1773,12 +1893,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 
 static inline int
 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+                               unsigned long did_some_progress,
                                unsigned long pages_reclaimed)
 {
        /* Do not loop if specifically requested */
        if (gfp_mask & __GFP_NORETRY)
                return 0;
 
+       /* Always retry if specifically requested */
+       if (gfp_mask & __GFP_NOFAIL)
+               return 1;
+
+       /*
+        * Suspend converts GFP_KERNEL to __GFP_WAIT, which can prevent reclaim
+        * from making forward progress without invoking OOM. Suspend also
+        * disables storage devices, so kswapd will not help. Bail if suspending.
+        */
+       if (!did_some_progress && pm_suspended_storage())
+               return 0;
+
        /*
         * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
         * means __GFP_NOFAIL, but that may not be true in other
@@ -1797,13 +1930,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
        if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
                return 1;
 
-       /*
-        * Don't let big-order allocations loop unless the caller
-        * explicitly requests that.
-        */
-       if (gfp_mask & __GFP_NOFAIL)
-               return 1;
-
        return 0;
 }
 
@@ -2196,7 +2322,8 @@ rebalance:
 
        /* Check if we should retry the allocation */
        pages_reclaimed += did_some_progress;
-       if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+       if (should_alloc_retry(gfp_mask, order, did_some_progress,
+                                               pages_reclaimed)) {
                /* Wait for some write requests to complete then retry */
                wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto rebalance;
@@ -2306,16 +2433,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __pagevec_free(struct pagevec *pvec)
-{
-       int i = pagevec_count(pvec);
-
-       while (--i >= 0) {
-               trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
-               free_hot_cold_page(pvec->pages[i], pvec->cold);
-       }
-}
-
 void __free_pages(struct page *page, unsigned int order)
 {
        if (put_page_testzero(page)) {
@@ -3385,25 +3502,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
                if (page_to_nid(page) != zone_to_nid(zone))
                        continue;
 
-               /* Blocks with reserved pages will never free, skip them. */
-               block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-               if (pageblock_is_reserved(pfn, block_end_pfn))
-                       continue;
-
                block_migratetype = get_pageblock_migratetype(page);
 
-               /* If this block is reserved, account for it */
-               if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
-                       reserve--;
-                       continue;
-               }
+               /* Only test what is necessary when the reserves are not met */
+               if (reserve > 0) {
+                       /*
+                        * Blocks with reserved pages will never free, skip
+                        * them.
+                        */
+                       block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+                       if (pageblock_is_reserved(pfn, block_end_pfn))
+                               continue;
 
-               /* Suitable for reserving if this block is movable */
-               if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
-                       set_pageblock_migratetype(page, MIGRATE_RESERVE);
-                       move_freepages_block(zone, page, MIGRATE_RESERVE);
-                       reserve--;
-                       continue;
+                       /* If this block is reserved, account for it */
+                       if (block_migratetype == MIGRATE_RESERVE) {
+                               reserve--;
+                               continue;
+                       }
+
+                       /* Suitable for reserving if this block is movable */
+                       if (block_migratetype == MIGRATE_MOVABLE) {
+                               set_pageblock_migratetype(page,
+                                                       MIGRATE_RESERVE);
+                               move_freepages_block(zone, page,
+                                                       MIGRATE_RESERVE);
+                               reserve--;
+                               continue;
+                       }
                }
 
                /*
@@ -4734,8 +4859,19 @@ static void calculate_totalreserve_pages(void)
                        if (max > zone->present_pages)
                                max = zone->present_pages;
                        reserve_pages += max;
+                       /*
+                        * Lowmem reserves are not available to
+                        * GFP_HIGHUSER page cache allocations and
+                        * kswapd tries to balance zones to their high
+                        * watermark.  As a result, neither should be
+                        * regarded as dirtyable memory, to prevent a
+                        * situation where reclaim has to clean pages
+                        * in order to balance the zones.
+                        */
+                       zone->dirty_balance_reserve = max;
                }
        }
+       dirty_balance_reserve = reserve_pages;
        totalreserve_pages = reserve_pages;
 }