mm, page_alloc: disable pcplists during memory offline

author Vlastimil Babka <vbabka@suse.cz>
Tue, 15 Dec 2020 03:10:59 +0000 (19:10 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 15 Dec 2020 20:13:43 +0000 (12:13 -0800)
diff --git a/mm/internal.h b/mm/internal.h

index fd6734be9c85c704f56cc92917e77d6369052563..25d2b2439f19f27d788944c2f18bd563e94b601a 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -204,6 +204,8 @@ extern void free_unref_page_list(struct list_head *list);
  
  extern void zone_pcp_update(struct zone *zone);
  extern void zone_pcp_reset(struct zone *zone);
+extern void zone_pcp_disable(struct zone *zone);
+extern void zone_pcp_enable(struct zone *zone);
  
  #if defined CONFIG_COMPACTION || defined CONFIG_CMA
  
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c

index 3c494ab0d075742e3779eb6bb770a0c80ff4f1c1..e0a561c550b3ac629a3fe4e4791f9af043c07d6d 100644 (file)
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1491,17 +1491,21 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
         }
         node = zone_to_nid(zone);
  
+       /*
+        * Disable pcplists so that page isolation cannot race with freeing
+        * in a way that pages from isolated pageblock are left on pcplists.
+        */
+       zone_pcp_disable(zone);
+
         /* set above range as isolated */
         ret = start_isolate_page_range(start_pfn, end_pfn,
                                        MIGRATE_MOVABLE,
                                        MEMORY_OFFLINE | REPORT_FAILURE);
         if (ret) {
                 reason = "failure to isolate range";
-               goto failed_removal;
+               goto failed_removal_pcplists_disabled;
         }
  
-       drain_all_pages(zone);
-
         arg.start_pfn = start_pfn;
         arg.nr_pages = nr_pages;
         node_states_check_changes_offline(nr_pages, zone, &arg);
@@ -1551,20 +1555,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
                         goto failed_removal_isolated;
                 }
  
-               /*
-                * per-cpu pages are drained after start_isolate_page_range, but
-                * if there are still pages that are not free, make sure that we
-                * drain again, because when we isolated range we might have
-                * raced with another thread that was adding pages to pcp list.
-                *
-                * Forward progress should be still guaranteed because
-                * pages on the pcp list can only belong to MOVABLE_ZONE
-                * because has_unmovable_pages explicitly checks for
-                * PageBuddy on freed pages on other zones.
-                */
                 ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
-               if (ret)
-                       drain_all_pages(zone);
+
         } while (ret);
  
         /* Mark all sections offline and remove free pages from the buddy. */
@@ -1580,6 +1572,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
         zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
         spin_unlock_irqrestore(&zone->lock, flags);
  
+       zone_pcp_enable(zone);
+
         /* removal success */
         adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
         zone->present_pages -= nr_pages;
@@ -1612,6 +1606,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
  failed_removal_isolated:
         undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
         memory_notify(MEM_CANCEL_OFFLINE, &arg);
+failed_removal_pcplists_disabled:
+       zone_pcp_enable(zone);
  failed_removal:
         pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
                  (unsigned long long) start_pfn << PAGE_SHIFT,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index a259c22e46093f27f8d9cb5b7e42142a77f65c6b..40baa24211363dcfa7778dc4e3742bbdb61b4e6c 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3026,13 +3026,16 @@ static void drain_local_pages_wq(struct work_struct *work)
  }
  
  /*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
- *
- * When zone parameter is non-NULL, spill just the single zone's pages.
+ * The implementation of drain_all_pages(), exposing an extra parameter to
+ * drain on all cpus.
   *
- * Note that this can be extremely slow as the draining happens in a workqueue.
+ * drain_all_pages() is optimized to only execute on cpus where pcplists are
+ * not empty. The check for non-emptiness can however race with a free to
+ * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
+ * that need the guarantee that every CPU has drained can disable the
+ * optimizing racy check.
   */
-void drain_all_pages(struct zone *zone)
+void __drain_all_pages(struct zone *zone, bool force_all_cpus)
  {
         int cpu;
  
@@ -3071,7 +3074,13 @@ void drain_all_pages(struct zone *zone)
                 struct zone *z;
                 bool has_pcps = false;
  
-               if (zone) {
+               if (force_all_cpus) {
+                       /*
+                        * The pcp.count check is racy, some callers need a
+                        * guarantee that no cpu is missed.
+                        */
+                       has_pcps = true;
+               } else if (zone) {
                         pcp = per_cpu_ptr(zone->pageset, cpu);
                         if (pcp->pcp.count)
                                 has_pcps = true;
@@ -3104,6 +3113,18 @@ void drain_all_pages(struct zone *zone)
         mutex_unlock(&pcpu_drain_mutex);
  }
  
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * When zone parameter is non-NULL, spill just the single zone's pages.
+ *
+ * Note that this can be extremely slow as the draining happens in a workqueue.
+ */
+void drain_all_pages(struct zone *zone)
+{
+       __drain_all_pages(zone, false);
+}
+
  #ifdef CONFIG_HIBERNATION
  
  /*
@@ -6316,6 +6337,18 @@ static void pageset_init(struct per_cpu_pageset *p)
         pcp->batch = BOOT_PAGESET_BATCH;
  }
  
+void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
+               unsigned long batch)
+{
+       struct per_cpu_pageset *p;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               p = per_cpu_ptr(zone->pageset, cpu);
+               pageset_update(&p->pcp, high, batch);
+       }
+}
+
  /*
   * Calculate and set new high and batch values for all per-cpu pagesets of a
   * zone, based on the zone's size and the percpu_pagelist_fraction sysctl.
@@ -6323,8 +6356,6 @@ static void pageset_init(struct per_cpu_pageset *p)
  static void zone_set_pageset_high_and_batch(struct zone *zone)
  {
         unsigned long new_high, new_batch;
-       struct per_cpu_pageset *p;
-       int cpu;
  
         if (percpu_pagelist_fraction) {
                 new_high = zone_managed_pages(zone) / percpu_pagelist_fraction;
@@ -6344,10 +6375,7 @@ static void zone_set_pageset_high_and_batch(struct zone *zone)
         zone->pageset_high = new_high;
         zone->pageset_batch = new_batch;
  
-       for_each_possible_cpu(cpu) {
-               p = per_cpu_ptr(zone->pageset, cpu);
-               pageset_update(&p->pcp, new_high, new_batch);
-       }
+       __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
  }
  
  void __meminit setup_zone_pageset(struct zone *zone)
@@ -8742,6 +8770,27 @@ void __meminit zone_pcp_update(struct zone *zone)
         mutex_unlock(&pcp_batch_high_lock);
  }
  
+/*
+ * Effectively disable pcplists for the zone by setting the high limit to 0
+ * and draining all cpus. A concurrent page freeing on another CPU that's about
+ * to put the page on pcplist will either finish before the drain and the page
+ * will be drained, or observe the new high limit and skip the pcplist.
+ *
+ * Must be paired with a call to zone_pcp_enable().
+ */
+void zone_pcp_disable(struct zone *zone)
+{
+       mutex_lock(&pcp_batch_high_lock);
+       __zone_set_pageset_high_and_batch(zone, 0, 1);
+       __drain_all_pages(zone, true);
+}
+
+void zone_pcp_enable(struct zone *zone)
+{
+       __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
+       mutex_unlock(&pcp_batch_high_lock);
+}
+
  void zone_pcp_reset(struct zone *zone)
  {
         unsigned long flags;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c

index feab446d198282f81f086a746f70320a6f411141..a254e1f370a3ca79bc27093d5695b4b357a8250d 100644 (file)
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -174,9 +174,9 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
   * A call to drain_all_pages() after isolation can flush most of them. However
   * in some cases pages might still end up on pcp lists and that would allow
   * for their allocation even when they are in fact isolated already. Depending
- * on how strong of a guarantee the caller needs, further drain_all_pages()
- * might be needed (e.g. __offline_pages will need to call it after check for
- * isolated range for a next retry).
+ * on how strong of a guarantee the caller needs, zone_pcp_disable/enable()
+ * might be used to flush and disable pcplist before isolation and enable after
+ * unisolation.
   *
   * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
   */
author	Vlastimil Babka <vbabka@suse.cz>
	Tue, 15 Dec 2020 03:10:59 +0000 (19:10 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 15 Dec 2020 20:13:43 +0000 (12:13 -0800)
mm/internal.h		patch \| blob \| history
mm/memory_hotplug.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/page_isolation.c		patch \| blob \| history