}
node = zone_to_nid(zone);
+ /*
+ * Disable pcplists so that page isolation cannot race with freeing
+ * in a way that pages from the isolated pageblock are left on pcplists.
+ */
+ zone_pcp_disable(zone);
+
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
MEMORY_OFFLINE | REPORT_FAILURE);
if (ret) {
reason = "failure to isolate range";
- goto failed_removal;
+ goto failed_removal_pcplists_disabled;
}
- drain_all_pages(zone);
-
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
node_states_check_changes_offline(nr_pages, zone, &arg);
goto failed_removal_isolated;
}
- /*
- * per-cpu pages are drained after start_isolate_page_range, but
- * if there are still pages that are not free, make sure that we
- * drain again, because when we isolated range we might have
- * raced with another thread that was adding pages to pcp list.
- *
- * Forward progress should be still guaranteed because
- * pages on the pcp list can only belong to MOVABLE_ZONE
- * because has_unmovable_pages explicitly checks for
- * PageBuddy on freed pages on other zones.
- */
ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
- if (ret)
- drain_all_pages(zone);
+
} while (ret);
/* Mark all sections offline and remove free pages from the buddy. */
zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
+ zone_pcp_enable(zone);
+
/* removal success */
adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
zone->present_pages -= nr_pages;
failed_removal_isolated:
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
+failed_removal_pcplists_disabled:
+ zone_pcp_enable(zone);
failed_removal:
pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
(unsigned long long) start_pfn << PAGE_SHIFT,
}
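
Taken together, the memory_hotplug.c hunks above bracket the whole isolation-and-offline sequence with the new pcplist disable/enable pair. The following is a condensed sketch of the resulting control flow (a hypothetical, heavily elided outline for illustration only; error handling and the migration loop are omitted, and offline_pages_sketch is not a real function name):

	static int offline_pages_sketch(unsigned long start_pfn,
					unsigned long end_pfn, struct zone *zone)
	{
		int ret;

		/* high = 0, batch = 1, then a forced drain of every online CPU */
		zone_pcp_disable(zone);

		ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
					       MEMORY_OFFLINE | REPORT_FAILURE);
		if (ret)
			goto out;	/* corresponds to failed_removal_pcplists_disabled */

		/* ... migrate pages away, test_pages_isolated() loop, section offlining ... */

	out:
		/* restores zone->pageset_high / zone->pageset_batch on every path */
		zone_pcp_enable(zone);
		return ret;
	}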
/*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
- *
- * When zone parameter is non-NULL, spill just the single zone's pages.
+ * The implementation of drain_all_pages(), exposing an extra parameter to
+ * drain on all cpus.
*
- * Note that this can be extremely slow as the draining happens in a workqueue.
+ * drain_all_pages() is optimized to only execute on cpus where pcplists are
+ * not empty. The check for non-emptiness can, however, race with a free to a
+ * pcplist that has not yet increased pcp->count from 0 to 1. Callers
+ * that need the guarantee that every CPU has drained can disable the
+ * optimizing racy check.
*/
-void drain_all_pages(struct zone *zone)
+void __drain_all_pages(struct zone *zone, bool force_all_cpus)
{
int cpu;
struct zone *z;
bool has_pcps = false;
- if (zone) {
+ if (force_all_cpus) {
+ /*
+ * The pcp.count check is racy, some callers need a
+ * guarantee that no cpu is missed.
+ */
+ has_pcps = true;
+ } else if (zone) {
pcp = per_cpu_ptr(zone->pageset, cpu);
if (pcp->pcp.count)
has_pcps = true;
mutex_unlock(&pcpu_drain_mutex);
}
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * When zone parameter is non-NULL, spill just the single zone's pages.
+ *
+ * Note that this can be extremely slow as the draining happens in a workqueue.
+ */
+void drain_all_pages(struct zone *zone)
+{
+ __drain_all_pages(zone, false);
+}
+
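
The comment on __drain_all_pages() above describes the race that force_all_cpus closes; the following illustrative interleaving (not part of the patch) makes it concrete:

	/*
	 * Illustrative interleaving: with force_all_cpus == false, the
	 * pcp->count check can miss a CPU whose pcplist is just becoming
	 * non-empty:
	 *
	 *   CPU A: __drain_all_pages(zone, false)     CPU B: freeing a page
	 *   -------------------------------------     ---------------------
	 *   reads pcp->count == 0, skips CPU B
	 *                                             adds page to its pcplist
	 *                                             pcp->count = 1
	 *
	 * The page freed by CPU B is never drained. Callers such as
	 * zone_pcp_disable() below therefore pass force_all_cpus == true and
	 * queue the drain work on every online CPU unconditionally.
	 */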
#ifdef CONFIG_HIBERNATION
/*
pcp->batch = BOOT_PAGESET_BATCH;
}
+void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
+ unsigned long batch)
+{
+ struct per_cpu_pageset *p;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ p = per_cpu_ptr(zone->pageset, cpu);
+ pageset_update(&p->pcp, high, batch);
+ }
+}
+
/*
* Calculate and set new high and batch values for all per-cpu pagesets of a
* zone, based on the zone's size and the percpu_pagelist_fraction sysctl.
static void zone_set_pageset_high_and_batch(struct zone *zone)
{
unsigned long new_high, new_batch;
- struct per_cpu_pageset *p;
- int cpu;
if (percpu_pagelist_fraction) {
new_high = zone_managed_pages(zone) / percpu_pagelist_fraction;
zone->pageset_high = new_high;
zone->pageset_batch = new_batch;
- for_each_possible_cpu(cpu) {
- p = per_cpu_ptr(zone->pageset, cpu);
- pageset_update(&p->pcp, new_high, new_batch);
- }
+ __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
}
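
For a concrete sense of the sysctl-driven branch above, here is a worked example with hypothetical numbers (the non-sysctl branch and the clamping of new_batch are not part of this excerpt):

	/*
	 * Worked example (hypothetical numbers): a zone with 1 GiB of managed
	 * memory has zone_managed_pages(zone) == 262144 (4 KiB pages). With
	 * vm.percpu_pagelist_fraction set to 8:
	 *
	 *   new_high = 262144 / 8 = 32768 pages allowed on each CPU's pcplist
	 *
	 * __zone_set_pageset_high_and_batch() then writes that limit into
	 * every possible CPU's pageset via pageset_update(), so a sysctl
	 * change takes effect zone-wide in one call.
	 */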
void __meminit setup_zone_pageset(struct zone *zone)
mutex_unlock(&pcp_batch_high_lock);
}
+/*
+ * Effectively disable pcplists for the zone by setting the high limit to 0
+ * and draining all cpus. A concurrent page freeing on another CPU that's about
+ * to put the page on pcplist will either finish before the drain and the page
+ * will be drained, or observe the new high limit and skip the pcplist.
+ *
+ * Must be paired with a call to zone_pcp_enable().
+ */
+void zone_pcp_disable(struct zone *zone)
+{
+ mutex_lock(&pcp_batch_high_lock);
+ __zone_set_pageset_high_and_batch(zone, 0, 1);
+ __drain_all_pages(zone, true);
+}
+
+void zone_pcp_enable(struct zone *zone)
+{
+ __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
+ mutex_unlock(&pcp_batch_high_lock);
+}
+
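
Note that zone_pcp_disable() returns with pcp_batch_high_lock held and zone_pcp_enable() is what releases it, so the two calls must bracket the critical section on every path. A minimal usage sketch under that assumption (the wrapper function is hypothetical):

	static void with_pcplists_disabled(struct zone *zone)
	{
		/* takes pcp_batch_high_lock, sets high/batch to (0, 1), drains */
		zone_pcp_disable(zone);

		/*
		 * ... work that must not race with pcplist caching for this
		 * zone; freed pages do not accumulate on pcplists while the
		 * high limit is 0, as described in the comment above ...
		 */

		/* restores zone->pageset_high/batch and drops the lock */
		zone_pcp_enable(zone);
	}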
void zone_pcp_reset(struct zone *zone)
{
unsigned long flags;