mm: vmscan: respect NUMA policy mask when shrinking slab on direct reclaim
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eea668d..47905d7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_control *sc)
 }
 #endif
 
-unsigned long zone_reclaimable_pages(struct zone *zone)
+static unsigned long zone_reclaimable_pages(struct zone *zone)
 {
        int nr;
 
@@ -281,17 +281,34 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
                                nr_pages_scanned, lru_pages,
                                max_pass, delta, total_scan);
 
-       while (total_scan >= batch_size) {
+       /*
+        * Normally, we should not scan less than batch_size objects in one
+        * pass to avoid too frequent shrinker calls, but if the slab has less
+        * than batch_size objects in total and we are really tight on memory,
+        * we will try to reclaim all available objects, otherwise we can end
+        * up failing allocations although there are plenty of reclaimable
+        * objects spread over several slabs with usage less than the
+        * batch_size.
+        *
+        * We detect the "tight on memory" situations by looking at the total
+        * number of objects we want to scan (total_scan). If it is greater
+        * than the total number of objects on slab (max_pass), we must be
+        * scanning at high prio and therefore should try to reclaim as much as
+        * possible.
+        */
+       while (total_scan >= batch_size ||
+              total_scan >= max_pass) {
                unsigned long ret;
+               unsigned long nr_to_scan = min(batch_size, total_scan);
 
-               shrinkctl->nr_to_scan = batch_size;
+               shrinkctl->nr_to_scan = nr_to_scan;
                ret = shrinker->scan_objects(shrinker, shrinkctl);
                if (ret == SHRINK_STOP)
                        break;
                freed += ret;
 
-               count_vm_events(SLABS_SCANNED, batch_size);
-               total_scan -= batch_size;
+               count_vm_events(SLABS_SCANNED, nr_to_scan);
+               total_scan -= nr_to_scan;
 
                cond_resched();
        }
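
The comment above captures the key idea: scanning proceeds in batch_size chunks, but once total_scan covers the whole slab (total_scan >= max_pass), a final sub-batch pass is allowed so small slabs can be drained under pressure. A minimal userspace sketch of the new loop (do_scan() is a hypothetical stand-in for shrinker->scan_objects(); the numbers in main() are made up):

    #include <stdio.h>

    #define SHRINK_STOP (~0UL)

    /* Hypothetical stand-in for shrinker->scan_objects(). */
    static unsigned long do_scan(unsigned long nr_to_scan)
    {
        return nr_to_scan / 2; /* pretend half the scanned objects were freed */
    }

    static unsigned long scan_loop(long total_scan, long max_pass, long batch_size)
    {
        unsigned long freed = 0;

        /* Drop below batch_size once total_scan covers the whole slab. */
        while (total_scan >= batch_size || total_scan >= max_pass) {
            long nr = total_scan < batch_size ? total_scan : batch_size;
            unsigned long ret = do_scan(nr);

            if (ret == SHRINK_STOP)
                break;
            freed += ret;
            total_scan -= nr;
        }
        return freed;
    }

    int main(void)
    {
        /* A 100-object slab with batch 128: the old loop would never scan it. */
        printf("freed %lu\n", scan_loop(100, 100, 128));
        return 0;
    }
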
@@ -352,16 +369,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
        }
 
        list_for_each_entry(shrinker, &shrinker_list, list) {
-               for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
-                       if (!node_online(shrinkctl->nid))
-                               continue;
-
-                       if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
-                           (shrinkctl->nid != 0))
-                               break;
-
+               if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
+                       shrinkctl->nid = 0;
                        freed += shrink_slab_node(shrinkctl, shrinker,
-                                nr_pages_scanned, lru_pages);
+                                       nr_pages_scanned, lru_pages);
+                       continue;
+               }
+
+               for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+                       if (node_online(shrinkctl->nid))
+                               freed += shrink_slab_node(shrinkctl, shrinker,
+                                               nr_pages_scanned, lru_pages);
 
                }
        }
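
The restructuring also fixes an omission in the old loop: a NUMA-unaware shrinker was only invoked when node 0 happened to be both set in nodes_to_scan and online, because the `nid != 0` check broke out of the loop before the first call otherwise. The new shape calls it exactly once with nid pinned to 0, regardless of the mask. A toy sketch of the dispatch rule (simulated node state and bitmask, not the kernel API):

    #include <stdbool.h>
    #include <stdio.h>

    #define SHRINKER_NUMA_AWARE 0x1
    #define MAX_NODES 4

    /* Simulated node state: node 1 is offline. */
    static const bool node_online_sim[MAX_NODES] = { true, false, true, true };

    static void shrink_node(int nid)
    {
        printf("  shrink_slab_node(nid=%d)\n", nid);
    }

    static void dispatch(unsigned int flags, unsigned int nodes_to_scan)
    {
        if (!(flags & SHRINKER_NUMA_AWARE)) {
            shrink_node(0); /* exactly one call, nid pinned to 0 */
            return;
        }
        for (int nid = 0; nid < MAX_NODES; nid++)
            if ((nodes_to_scan & (1u << nid)) && node_online_sim[nid])
                shrink_node(nid);
    }

    int main(void)
    {
        /* Mask {2,3}: the old loop would have skipped the unaware shrinker. */
        printf("NUMA-unaware:\n");
        dispatch(0, 0xc);
        printf("NUMA-aware:\n");
        dispatch(SHRINKER_NUMA_AWARE, 0xc);
        return 0;
    }
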
@@ -603,7 +621,7 @@ void putback_lru_page(struct page *page)
        bool is_unevictable;
        int was_unevictable = PageUnevictable(page);
 
-       VM_BUG_ON(PageLRU(page));
+       VM_BUG_ON_PAGE(PageLRU(page), page);
 
 redo:
        ClearPageUnevictable(page);
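
The VM_BUG_ON() to VM_BUG_ON_PAGE() conversions here and in the hunks below change only the failure output: the _PAGE variant dumps the offending page's state (flags, refcount, mapping) before panicking, which makes these assertions debuggable from a crash log. Roughly, per include/linux/mmdebug.h of this era (shown for context, details may differ by tree):

    #ifdef CONFIG_DEBUG_VM
    #define VM_BUG_ON_PAGE(cond, page)              \
        do {                                        \
            if (unlikely(cond)) {                   \
                dump_page(page);                    \
                BUG();                              \
            }                                       \
        } while (0)
    #else
    #define VM_BUG_ON_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond)
    #endif
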
@@ -794,8 +812,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                if (!trylock_page(page))
                        goto keep;
 
-               VM_BUG_ON(PageActive(page));
-               VM_BUG_ON(page_zone(page) != zone);
+               VM_BUG_ON_PAGE(PageActive(page), page);
+               VM_BUG_ON_PAGE(page_zone(page) != zone, page);
 
                sc->nr_scanned++;
 
@@ -1079,14 +1097,14 @@ activate_locked:
                /* Not a candidate for swapping, so reclaim swap space. */
                if (PageSwapCache(page) && vm_swap_full())
                        try_to_free_swap(page);
-               VM_BUG_ON(PageActive(page));
+               VM_BUG_ON_PAGE(PageActive(page), page);
                SetPageActive(page);
                pgactivate++;
 keep_locked:
                unlock_page(page);
 keep:
                list_add(&page->lru, &ret_pages);
-               VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
+               VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
        }
 
        free_hot_cold_page_list(&free_pages, 1);
@@ -1240,7 +1258,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                page = lru_to_page(src);
                prefetchw_prev_lru_page(page, src, flags);
 
-               VM_BUG_ON(!PageLRU(page));
+               VM_BUG_ON_PAGE(!PageLRU(page), page);
 
                switch (__isolate_lru_page(page, mode)) {
                case 0:
@@ -1295,7 +1313,7 @@ int isolate_lru_page(struct page *page)
 {
        int ret = -EBUSY;
 
-       VM_BUG_ON(!page_count(page));
+       VM_BUG_ON_PAGE(!page_count(page), page);
 
        if (PageLRU(page)) {
                struct zone *zone = page_zone(page);
@@ -1366,7 +1384,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
                struct page *page = lru_to_page(page_list);
                int lru;
 
-               VM_BUG_ON(PageLRU(page));
+               VM_BUG_ON_PAGE(PageLRU(page), page);
                list_del(&page->lru);
                if (unlikely(!page_evictable(page))) {
                        spin_unlock_irq(&zone->lru_lock);
@@ -1522,19 +1540,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                 * If dirty pages are scanned that are not queued for IO, it
                 * implies that flushers are not keeping up. In this case, flag
                 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
-                * pages from reclaim context. It will forcibly stall in the
-                * next check.
+                * pages from reclaim context.
                 */
                if (nr_unqueued_dirty == nr_taken)
                        zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
 
                /*
-                * In addition, if kswapd scans pages marked marked for
-                * immediate reclaim and under writeback (nr_immediate), it
-                * implies that pages are cycling through the LRU faster than
+                * If kswapd scans pages marked for immediate
+                * reclaim and under writeback (nr_immediate), it implies
+                * that pages are cycling through the LRU faster than
                 * they are written so also forcibly stall.
                 */
-               if (nr_unqueued_dirty == nr_taken || nr_immediate)
+               if (nr_immediate)
                        congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
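
The net effect of this hunk: dirty-but-not-queued pages no longer trigger a direct congestion_wait() by themselves; they only flag the zone so kswapd starts writeback, while pages under writeback already marked for immediate reclaim (nr_immediate) still force a stall. A toy of the revised decision, illustrative only:

    #include <stdbool.h>
    #include <stdio.h>

    static bool should_stall(unsigned long nr_unqueued_dirty,
                             unsigned long nr_taken,
                             unsigned long nr_immediate,
                             bool *flag_zone_dirty)
    {
        /* All taken pages dirty and unqueued: wake kswapd writeback. */
        *flag_zone_dirty = (nr_unqueued_dirty == nr_taken);
        /* Was: unqueued-dirty OR immediate; now only immediate stalls. */
        return nr_immediate != 0;
    }

    int main(void)
    {
        bool flag;

        printf("stall=%d\n", should_stall(32, 32, 0, &flag)); /* flag only */
        printf("stall=%d\n", should_stall(0, 32, 4, &flag));  /* stall */
        return 0;
    }
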
 
@@ -1586,7 +1603,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
                page = lru_to_page(list);
                lruvec = mem_cgroup_page_lruvec(page, zone);
 
-               VM_BUG_ON(PageLRU(page));
+               VM_BUG_ON_PAGE(PageLRU(page), page);
                SetPageLRU(page);
 
                nr_pages = hpage_nr_pages(page);
@@ -2407,8 +2424,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                        unsigned long lru_pages = 0;
 
                        nodes_clear(shrink->nodes_to_scan);
-                       for_each_zone_zonelist(zone, z, zonelist,
-                                       gfp_zone(sc->gfp_mask)) {
+                       for_each_zone_zonelist_nodemask(zone, z, zonelist,
+                                       gfp_zone(sc->gfp_mask), sc->nodemask) {
                                if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                        continue;
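
This is the core of the patch title: lru_pages feeds shrink_slab() as the denominator that scales slab pressure against LRU pressure, so it must be summed over the same nodes that page reclaim is allowed to touch (sc->nodemask), mirroring nodes_to_scan. A toy of that scaling (formula paraphrased from shrink_slab_node() above; the exact kernel arithmetic may differ), showing how an inflated denominator understates slab pressure:

    #include <stdio.h>

    static unsigned long slab_delta(unsigned long nr_pages_scanned,
                                    unsigned long lru_pages,
                                    unsigned long max_pass, int seeks)
    {
        unsigned long long delta = (4ULL * nr_pages_scanned) / seeks;

        delta *= max_pass;
        delta /= (lru_pages + 1);
        return (unsigned long)delta;
    }

    int main(void)
    {
        /* Same scan effort; LRU pages counted with and without the mask. */
        printf("masked:   %lu\n", slab_delta(1024, 1UL << 18, 1UL << 16, 2));
        printf("unmasked: %lu\n", slab_delta(1024, 1UL << 20, 1UL << 16, 2));
        return 0;
    }
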
 
@@ -2484,10 +2501,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 
        for (i = 0; i <= ZONE_NORMAL; i++) {
                zone = &pgdat->node_zones[i];
+               if (!populated_zone(zone))
+                       continue;
+
                pfmemalloc_reserve += min_wmark_pages(zone);
                free_pages += zone_page_state(zone, NR_FREE_PAGES);
        }
 
+       /* If there are no reserves (unexpected config) then do not throttle */
+       if (!pfmemalloc_reserve)
+               return true;
+
        wmark_ok = free_pages > pfmemalloc_reserve / 2;
 
        /* kswapd must be awake if processes are being throttled */
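
The two added checks guard against throttling on nodes whose zones at or below ZONE_NORMAL are unpopulated or contribute no reserves (e.g. memoryless nodes): without them, such a node computes reserve == free == 0, the watermark test can never pass, and allocators throttle indefinitely. A minimal sketch of the guarded check (simulated zone struct, not kernel types):

    #include <stdbool.h>
    #include <stdio.h>

    struct zone_sim { unsigned long present_pages, min_wmark, free_pages; };

    static bool pfmemalloc_ok(const struct zone_sim *zones, int nr)
    {
        unsigned long reserve = 0, free = 0;

        for (int i = 0; i < nr; i++) {
            if (!zones[i].present_pages) /* !populated_zone() */
                continue;
            reserve += zones[i].min_wmark;
            free += zones[i].free_pages;
        }
        if (!reserve) /* no usable reserves: do not throttle */
            return true;
        return free > reserve / 2;
    }

    int main(void)
    {
        struct zone_sim none[] = { { 0, 0, 0 } };
        struct zone_sim some[] = { { 1024, 100, 40 } };

        printf("%d %d\n", pfmemalloc_ok(none, 1), pfmemalloc_ok(some, 1));
        return 0;
    }
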
@@ -2512,9 +2536,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
                                        nodemask_t *nodemask)
 {
+       struct zoneref *z;
        struct zone *zone;
-       int high_zoneidx = gfp_zone(gfp_mask);
-       pg_data_t *pgdat;
+       pg_data_t *pgdat = NULL;
 
        /*
         * Kernel threads should not be throttled as they may be indirectly
@@ -2533,10 +2557,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
        if (fatal_signal_pending(current))
                goto out;
 
-       /* Check if the pfmemalloc reserves are ok */
-       first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
-       pgdat = zone->zone_pgdat;
-       if (pfmemalloc_watermark_ok(pgdat))
+       /*
+        * Check if the pfmemalloc reserves are ok by finding the first node
+        * with a usable ZONE_NORMAL or lower zone. The expectation is that
+        * GFP_KERNEL will be required for allocating network buffers when
+        * swapping over the network so ZONE_HIGHMEM is unusable.
+        *
+        * Throttling is based on the first usable node and throttled processes
+        * wait on a queue until kswapd makes progress and wakes them. There
+        * is an affinity then between processes waking up and where reclaim
+        * progress has been made assuming the process wakes on the same node.
+        * More importantly, processes running on remote nodes will not compete
+        * for remote pfmemalloc reserves and processes on different nodes
+        * should make reasonable progress.
+        */
+       for_each_zone_zonelist_nodemask(zone, z, zonelist,
+                                       gfp_zone(gfp_mask), nodemask) {
+               if (zone_idx(zone) > ZONE_NORMAL)
+                       continue;
+
+               /* Throttle based on the first usable node */
+               pgdat = zone->zone_pgdat;
+               if (pfmemalloc_watermark_ok(pgdat))
+                       goto out;
+               break;
+       }
+
+       /* If no zone was usable by the allocation flags then do not throttle */
+       if (!pgdat)
                goto out;
 
        /* Account for the throttling */
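
The rewritten walk honours both the zonelist order and the caller's nodemask, throttles against the first node that offers any zone at or below ZONE_NORMAL, and skips throttling entirely when no such zone is allowed (pgdat stays NULL). Sketched in miniature with a simulated zonelist:

    #include <stdio.h>

    enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

    struct zsim { int zone_idx; int node; };

    static int pick_throttle_node(const struct zsim *zl, int nr)
    {
        for (int i = 0; i < nr; i++) {
            if (zl[i].zone_idx > ZONE_NORMAL)
                continue;
            return zl[i].node; /* first usable node wins */
        }
        return -1; /* no usable zone: do not throttle */
    }

    int main(void)
    {
        struct zsim zl[] = {
            { ZONE_HIGHMEM, 1 }, { ZONE_NORMAL, 1 }, { ZONE_NORMAL, 0 }
        };

        printf("throttle on node %d\n", pick_throttle_node(zl, 3));
        return 0;
    }
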
@@ -3267,7 +3315,10 @@ static int kswapd(void *p)
                }
        }
 
+       tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
        current->reclaim_state = NULL;
+       lockdep_clear_current_reclaim_state();
+
        return 0;
 }
 
@@ -3297,27 +3348,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
        wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
-/*
- * The reclaimable count would be mostly accurate.
- * The less reclaimable pages may be
- * - mlocked pages, which will be moved to unevictable list when encountered
- * - mapped pages, which may require several travels to be reclaimed
- * - dirty pages, which is not "instantly" reclaimable
- */
-unsigned long global_reclaimable_pages(void)
-{
-       int nr;
-
-       nr = global_page_state(NR_ACTIVE_FILE) +
-            global_page_state(NR_INACTIVE_FILE);
-
-       if (get_nr_swap_pages() > 0)
-               nr += global_page_state(NR_ACTIVE_ANON) +
-                     global_page_state(NR_INACTIVE_ANON);
-
-       return nr;
-}
-
 #ifdef CONFIG_HIBERNATION
 /*
  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3701,7 +3731,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
                if (page_evictable(page)) {
                        enum lru_list lru = page_lru_base_type(page);
 
-                       VM_BUG_ON(PageActive(page));
+                       VM_BUG_ON_PAGE(PageActive(page), page);
                        ClearPageUnevictable(page);
                        del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
                        add_page_to_lru_list(page, lruvec, lru);