/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function.  Initially it is generic_online_page().  If required, it can be
 * changed by calling set_online_page_callback() for callback registration
 * and restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;

DEFINE_MUTEX(mem_hotplug_mutex);

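/*
 * lock_memory_hotplug()/unlock_memory_hotplug() serialize memory hotplug
 * operations against each other and keep hibernation from running while a
 * hotplug operation is in progress.
 */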
void lock_memory_hotplug(void)
{
        mutex_lock(&mem_hotplug_mutex);

        /* for exclusive hibernation if CONFIG_HIBERNATION=y */
        lock_system_sleep();
}

void unlock_memory_hotplug(void)
{
        unlock_system_sleep();
        mutex_unlock(&mem_hotplug_mutex);
}


/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
        struct resource *res;
        res = kzalloc(sizeof(struct resource), GFP_KERNEL);
        BUG_ON(!res);

        res->name = "System RAM";
        res->start = start;
        res->end = start + size - 1;
        res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
        if (request_resource(&iomem_resource, res) < 0) {
                printk("System RAM resource %pR cannot be added\n", res);
                kfree(res);
                res = NULL;
        }
        return res;
}

static void release_memory_resource(struct resource *res)
{
        if (!res)
                return;
        release_resource(res);
        kfree(res);
        return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
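/*
 * Pin a bootmem page and tag it with the given section/node number and
 * bootmem type so that put_page_bootmem() can recognize and release it
 * later during hot-remove.
 */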
static void get_page_bootmem(unsigned long info,  struct page *page,
                             unsigned long type)
{
        page->lru.next = (struct list_head *) type;
        SetPagePrivate(page);
        set_page_private(page, info);
        atomic_inc(&page->_count);
}

/* reference to __meminit __free_pages_bootmem is valid
 * so use __ref to tell modpost not to generate a warning */
void __ref put_page_bootmem(struct page *page)
{
        unsigned long type;
        struct zone *zone;

        type = (unsigned long) page->lru.next;
        BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
               type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

        if (atomic_dec_return(&page->_count) == 1) {
                ClearPagePrivate(page);
                set_page_private(page, 0);
                INIT_LIST_HEAD(&page->lru);
                __free_pages_bootmem(page, 0);

                zone = page_zone(page);
                zone_span_writelock(zone);
                zone->present_pages++;
                zone_span_writeunlock(zone);
                totalram_pages++;
        }

}

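/*
 * Tag every page backing this section's memmap (SECTION_INFO) and usemap
 * (MIX_SECTION_INFO) with bootmem info via get_page_bootmem().
 */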
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
        unsigned long *usemap, mapsize, section_nr, i;
        struct mem_section *ms;
        struct page *page, *memmap;

        section_nr = pfn_to_section_nr(start_pfn);
        ms = __nr_to_section(section_nr);

        /* Get section's memmap address */
        memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

        /*
         * Get page for the memmap's phys address
         * XXX: need more consideration for sparse_vmemmap...
         */
        page = virt_to_page(memmap);
        mapsize = sizeof(struct page) * PAGES_PER_SECTION;
        mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

        /* remember memmap's page */
        for (i = 0; i < mapsize; i++, page++)
                get_page_bootmem(section_nr, page, SECTION_INFO);

        usemap = __nr_to_section(section_nr)->pageblock_flags;
        page = virt_to_page(usemap);

        mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

        for (i = 0; i < mapsize; i++, page++)
                get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}

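/*
 * Register bootmem info for everything that backs a node: the pgdat
 * structure itself, the zones' wait tables, and every present section
 * that actually belongs to this node.
 */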
void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
        unsigned long i, pfn, end_pfn, nr_pages;
        int node = pgdat->node_id;
        struct page *page;
        struct zone *zone;

        nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
        page = virt_to_page(pgdat);

        for (i = 0; i < nr_pages; i++, page++)
                get_page_bootmem(node, page, NODE_INFO);

        zone = &pgdat->node_zones[0];
        for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
                if (zone->wait_table) {
                        nr_pages = zone->wait_table_hash_nr_entries
                                * sizeof(wait_queue_head_t);
                        nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
                        page = virt_to_page(zone->wait_table);

                        for (i = 0; i < nr_pages; i++, page++)
                                get_page_bootmem(node, page, NODE_INFO);
                }
        }

        pfn = pgdat->node_start_pfn;
        end_pfn = pfn + pgdat->node_spanned_pages;

        /* register section info */
        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                /*
                 * Some platforms can assign the same pfn to multiple nodes - on
                 * node0 as well as nodeN.  To avoid registering a pfn against
                 * multiple nodes we check that this pfn does not already
                 * reside in some other node.
                 */
                if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
                        register_page_bootmem_info_section(pfn);
        }
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

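/*
 * Extend zone_start_pfn/spanned_pages so the zone spans at least
 * [start_pfn, end_pfn); the update is done under the zone span seqlock.
 */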
static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
                           unsigned long end_pfn)
{
        unsigned long old_zone_end_pfn;

        zone_span_writelock(zone);

        old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
        if (start_pfn < zone->zone_start_pfn)
                zone->zone_start_pfn = start_pfn;

        zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
                                zone->zone_start_pfn;

        zone_span_writeunlock(zone);
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
                            unsigned long end_pfn)
{
        unsigned long old_pgdat_end_pfn =
                pgdat->node_start_pfn + pgdat->node_spanned_pages;

        if (start_pfn < pgdat->node_start_pfn)
                pgdat->node_start_pfn = start_pfn;

        pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
                                        pgdat->node_start_pfn;
}

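/*
 * Hook one new section into @zone: initialize the zone if it is still
 * empty, grow the zone and pgdat spans to cover the section, and
 * initialize the section's memmap.
 */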
static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
        struct pglist_data *pgdat = zone->zone_pgdat;
        int nr_pages = PAGES_PER_SECTION;
        int nid = pgdat->node_id;
        int zone_type;
        unsigned long flags;

        zone_type = zone - pgdat->node_zones;
        if (!zone->wait_table) {
                int ret;

                ret = init_currently_empty_zone(zone, phys_start_pfn,
                                                nr_pages, MEMMAP_HOTPLUG);
                if (ret)
                        return ret;
        }
        pgdat_resize_lock(zone->zone_pgdat, &flags);
        grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
        grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
                        phys_start_pfn + nr_pages);
        pgdat_resize_unlock(zone->zone_pgdat, &flags);
        memmap_init_zone(nr_pages, nid, zone_type,
                         phys_start_pfn, MEMMAP_HOTPLUG);
        return 0;
}

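/*
 * Add one sparsemem section's worth of pages: allocate the section's
 * memmap, hook it into the zone, and register the new memory section
 * with the memory sysfs subsystem.
 */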
static int __meminit __add_section(int nid, struct zone *zone,
                                        unsigned long phys_start_pfn)
{
        int nr_pages = PAGES_PER_SECTION;
        int ret;

        if (pfn_valid(phys_start_pfn))
                return -EEXIST;

        ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

        if (ret < 0)
                return ret;

        ret = __add_zone(zone, phys_start_pfn);

        if (ret < 0)
                return ret;

        return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
        /*
         * XXX: Freeing the memmap with vmemmap is not implemented yet.
         *      This should be removed later.
         */
        return -EBUSY;
}
#else
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
        unsigned long flags;
        struct pglist_data *pgdat = zone->zone_pgdat;
        int ret = -EINVAL;

        if (!valid_section(ms))
                return ret;

        ret = unregister_memory_section(ms);
        if (ret)
                return ret;

        pgdat_resize_lock(pgdat, &flags);
        sparse_remove_one_section(zone, ms);
        pgdat_resize_unlock(pgdat, &flags);
        return 0;
}
#endif

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
                        unsigned long nr_pages)
{
        unsigned long i;
        int err = 0;
        int start_sec, end_sec;
        /* during mem_map initialization, align the hot-added range to sections */
        start_sec = pfn_to_section_nr(phys_start_pfn);
        end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

        for (i = start_sec; i <= end_sec; i++) {
                err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

                /*
                 * EEXIST is finally dealt with by the ioresource collision
                 * check; see add_memory() => register_memory_resource().
                 * A warning will be printed if there is a collision.
                 */
                if (err && (err != -EEXIST))
                        break;
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL_GPL(__add_pages);

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be a multiple of the section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. The caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
                 unsigned long nr_pages)
{
        unsigned long i, ret = 0;
        int sections_to_remove;

        /*
         * We can only remove entire sections
         */
        BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
        BUG_ON(nr_pages % PAGES_PER_SECTION);

        release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);

        sections_to_remove = nr_pages / PAGES_PER_SECTION;
        for (i = 0; i < sections_to_remove; i++) {
                unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
                ret = __remove_section(zone, __pfn_to_section(pfn));
                if (ret)
                        break;
        }
        return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);

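/*
 * Install a driver-specific page onlining callback.  Fails with -EINVAL
 * if a non-generic callback is already registered.
 */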
int set_online_page_callback(online_page_callback_t callback)
{
        int rc = -EINVAL;

        lock_memory_hotplug();

        if (online_page_callback == generic_online_page) {
                online_page_callback = callback;
                rc = 0;
        }

        unlock_memory_hotplug();

        return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
        int rc = -EINVAL;

        lock_memory_hotplug();

        if (online_page_callback == callback) {
                online_page_callback = generic_online_page;
                rc = 0;
        }

        unlock_memory_hotplug();

        return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);

        if (pfn >= num_physpages)
                num_physpages = pfn + 1;
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
        totalram_pages++;

#ifdef CONFIG_HIGHMEM
        if (PageHighMem(page))
                totalhigh_pages++;
#endif
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
        __online_page_set_limits(page);
        __online_page_increment_counters(page);
        __online_page_free(page);
}

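/*
 * walk_system_ram_range() callback: hand each page in the range to the
 * registered onlining callback and accumulate the number of onlined
 * pages in *arg.
 */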
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
                        void *arg)
{
        unsigned long i;
        unsigned long onlined_pages = *(unsigned long *)arg;
        struct page *page;
        if (PageReserved(pfn_to_page(start_pfn)))
                for (i = 0; i < nr_pages; i++) {
                        page = pfn_to_page(start_pfn + i);
                        (*online_page_callback)(page);
                        onlined_pages++;
                }
        *(unsigned long *)arg = onlined_pages;
        return 0;
}


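/*
 * Online nr_pages starting at pfn: notify MEM_GOING_ONLINE, hand the
 * pages to the page allocator, update zone/node counters and zonelists,
 * and start kswapd for the node if pages were actually onlined.
 */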
int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
{
        unsigned long onlined_pages = 0;
        struct zone *zone;
        int need_zonelists_rebuild = 0;
        int nid;
        int ret;
        struct memory_notify arg;

        lock_memory_hotplug();
        arg.start_pfn = pfn;
        arg.nr_pages = nr_pages;
        arg.status_change_nid = -1;

        nid = page_to_nid(pfn_to_page(pfn));
        if (node_present_pages(nid) == 0)
                arg.status_change_nid = nid;

        ret = memory_notify(MEM_GOING_ONLINE, &arg);
        ret = notifier_to_errno(ret);
        if (ret) {
                memory_notify(MEM_CANCEL_ONLINE, &arg);
                unlock_memory_hotplug();
                return ret;
        }
        /*
         * This doesn't need a lock to do pfn_to_page().
         * The section can't be removed here because of the
         * memory_block->state_mutex.
         */
        zone = page_zone(pfn_to_page(pfn));
        /*
         * If this zone is not populated, then it is not in the zonelist.
         * This means the page allocator ignores this zone.
         * So, the zonelist must be updated after onlining.
         */
        mutex_lock(&zonelists_mutex);
        if (!populated_zone(zone))
                need_zonelists_rebuild = 1;

        ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
                online_pages_range);
        if (ret) {
                mutex_unlock(&zonelists_mutex);
                printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
                       (unsigned long long) pfn << PAGE_SHIFT,
                       (((unsigned long long) pfn + nr_pages)
                            << PAGE_SHIFT) - 1);
                memory_notify(MEM_CANCEL_ONLINE, &arg);
                unlock_memory_hotplug();
                return ret;
        }

        zone->present_pages += onlined_pages;
        zone->zone_pgdat->node_present_pages += onlined_pages;
        if (onlined_pages) {
                node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
                if (need_zonelists_rebuild)
                        build_all_zonelists(NULL, zone);
                else
                        zone_pcp_update(zone);
        }

        mutex_unlock(&zonelists_mutex);

        init_per_zone_wmark_min();

        if (onlined_pages)
                kswapd_run(zone_to_nid(zone));

        vm_total_pages = nr_free_pagecache_pages();

        writeback_set_ratelimit();

        if (onlined_pages)
                memory_notify(MEM_ONLINE, &arg);
        unlock_memory_hotplug();

        return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
        struct pglist_data *pgdat;
        unsigned long zones_size[MAX_NR_ZONES] = {0};
        unsigned long zholes_size[MAX_NR_ZONES] = {0};
        unsigned long start_pfn = start >> PAGE_SHIFT;

        pgdat = arch_alloc_nodedata(nid);
        if (!pgdat)
                return NULL;

        arch_refresh_nodedata(nid, pgdat);

        /* we can use NODE_DATA(nid) from here */

        /* init the node's zones as empty zones, we don't have any present pages. */
        free_area_init_node(nid, zones_size, start_pfn, zholes_size);

        /*
         * The node we allocated has no zone fallback lists. To avoid
         * accessing a not-initialized zonelist, build one here.
         */
        mutex_lock(&zonelists_mutex);
        build_all_zonelists(pgdat, NULL);
        mutex_unlock(&zonelists_mutex);

        return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
        arch_refresh_nodedata(nid, NULL);
        arch_free_nodedata(pgdat);
        return;
}


/*
 * called by cpu_up() to online a node without onlined memory.
 */
int mem_online_node(int nid)
{
        pg_data_t       *pgdat;
        int     ret;

        lock_memory_hotplug();
        pgdat = hotadd_new_pgdat(nid, 0);
        if (!pgdat) {
                ret = -ENOMEM;
                goto out;
        }
        node_set_online(nid);
        ret = register_one_node(nid);
        BUG_ON(ret);

out:
        unlock_memory_hotplug();
        return ret;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
        pg_data_t *pgdat = NULL;
        int new_pgdat = 0;
        struct resource *res;
        int ret;

        lock_memory_hotplug();

        res = register_memory_resource(start, size);
        ret = -EEXIST;
        if (!res)
                goto out;

        if (!node_online(nid)) {
                pgdat = hotadd_new_pgdat(nid, start);
                ret = -ENOMEM;
                if (!pgdat)
                        goto error;
                new_pgdat = 1;
        }

        /* call arch's memory hotadd */
        ret = arch_add_memory(nid, start, size);

        if (ret < 0)
                goto error;

        /* we online the node here. we can't roll back from here. */
        node_set_online(nid);

        if (new_pgdat) {
                ret = register_one_node(nid);
                /*
                 * If the sysfs file of the new node can't be created, CPUs
                 * on the node can't be hot-added. There is no way to roll
                 * back now, so catch it with BUG_ON(), reluctantly.
                 */
                BUG_ON(ret);
        }

        /* create new memmap entry */
        firmware_map_add_hotplug(start, start + size, "System RAM");

        goto out;

error:
        /* rollback pgdat allocation and others */
        if (new_pgdat)
                rollback_node_hotadd(nid, pgdat);
        if (res)
                release_memory_resource(res);

out:
        unlock_memory_hotplug();
        return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
        return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
        /* Ensure the starting page is pageblock-aligned */
        BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

        /* If the entire pageblock is free, move to the end of the free page */
        if (pageblock_free(page)) {
                int order;
                /* Be careful: we don't hold locks, so page_order() can change. */
                order = page_order(page);
                if ((order < MAX_ORDER) && (order >= pageblock_order))
                        return page + (1 << order);
        }

        return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
        struct page *page = pfn_to_page(start_pfn);
        struct page *end_page = page + nr_pages;

        /* Check the starting page of each pageblock within the range */
        for (; page < end_page; page = next_active_pageblock(page)) {
                if (!is_pageblock_removable_nolock(page))
                        return 0;
                cond_resched();
        }

        /* All pageblocks in the memory block are likely to be hot-removable */
        return 1;
}

/*
 * Confirm that all pages in the range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;
        struct zone *zone = NULL;
        struct page *page;
        int i;
        for (pfn = start_pfn;
             pfn < end_pfn;
             pfn += MAX_ORDER_NR_PAGES) {
                i = 0;
                /* This is just a CONFIG_HOLES_IN_ZONE check.*/
                while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
                        i++;
                if (i == MAX_ORDER_NR_PAGES)
                        continue;
                page = pfn_to_page(pfn + i);
                if (zone && page_zone(page) != zone)
                        return 0;
                zone = page_zone(page);
        }
        return 1;
}

/*
 * Scanning pfns is much easier than scanning the LRU list.
 * Scan pfns from start to end and return the first pfn of an LRU page.
 */
static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
        unsigned long pfn;
        struct page *page;
        for (pfn = start; pfn < end; pfn++) {
                if (pfn_valid(pfn)) {
                        page = pfn_to_page(pfn);
                        if (PageLRU(page))
                                return pfn;
                }
        }
        return 0;
}

#define NR_OFFLINE_AT_ONCE_PAGES        (256)
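/*
 * Isolate up to NR_OFFLINE_AT_ONCE_PAGES LRU pages in [start_pfn, end_pfn)
 * and migrate them off the range that is being offlined.
 */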
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;
        struct page *page;
        int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
        int not_managed = 0;
        int ret = 0;
        LIST_HEAD(source);

        for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
                if (!pfn_valid(pfn))
                        continue;
                page = pfn_to_page(pfn);
                if (!get_page_unless_zero(page))
                        continue;
                /*
                 * We can skip free pages. And we can only deal with pages on
                 * the LRU.
                 */
                ret = isolate_lru_page(page);
                if (!ret) { /* Success */
                        put_page(page);
                        list_add_tail(&page->lru, &source);
                        move_pages--;
                        inc_zone_page_state(page, NR_ISOLATED_ANON +
                                            page_is_file_cache(page));

                } else {
#ifdef CONFIG_DEBUG_VM
                        printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
                               pfn);
                        dump_page(page);
#endif
                        put_page(page);
                        /* Because we don't hold the big zone->lock, we should
                           check this again here. */
                        if (page_count(page)) {
                                not_managed++;
                                ret = -EBUSY;
                                break;
                        }
                }
        }
        if (!list_empty(&source)) {
                if (not_managed) {
                        putback_lru_pages(&source);
                        goto out;
                }

                /*
                 * alloc_migrate_target should be improved!!
                 * migrate_pages returns # of failed pages.
                 */
                ret = migrate_pages(&source, alloc_migrate_target, 0,
                                                        true, MIGRATE_SYNC);
                if (ret)
                        putback_lru_pages(&source);
        }
out:
        return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
                        void *data)
{
        __offline_isolated_pages(start, start + nr_pages);
        return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
        walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
                                offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are
 * isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
                        void *data)
{
        int ret;
        long offlined = *(long *)data;
        ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
        offlined = nr_pages;
        if (!ret)
                *(long *)data += offlined;
        return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
        long offlined = 0;
        int ret;

        ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
                        check_pages_isolated_cb);
        if (ret < 0)
                offlined = (long)ret;
        return offlined;
}

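/*
 * Core of memory offlining: isolate the pageblocks in [start_pfn, end_pfn),
 * migrate the pages away, mark the range reserved and pull it out of the
 * allocator, then update zone/node accounting.  Gives up after @timeout.
 */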
static int __ref __offline_pages(unsigned long start_pfn,
                  unsigned long end_pfn, unsigned long timeout)
{
        unsigned long pfn, nr_pages, expire;
        long offlined_pages;
        int ret, drain, retry_max, node;
        struct zone *zone;
        struct memory_notify arg;

        BUG_ON(start_pfn >= end_pfn);
        /* at least, alignment against pageblock is necessary */
        if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
                return -EINVAL;
        if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
                return -EINVAL;
        /* This makes hotplug much easier... and readable.
           We assume this for now. */
        if (!test_pages_in_a_zone(start_pfn, end_pfn))
                return -EINVAL;

        lock_memory_hotplug();

        zone = page_zone(pfn_to_page(start_pfn));
        node = zone_to_nid(zone);
        nr_pages = end_pfn - start_pfn;

        /* set the above range as isolated */
        ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
        if (ret)
                goto out;

        arg.start_pfn = start_pfn;
        arg.nr_pages = nr_pages;
        arg.status_change_nid = -1;
        if (nr_pages >= node_present_pages(node))
                arg.status_change_nid = node;

        ret = memory_notify(MEM_GOING_OFFLINE, &arg);
        ret = notifier_to_errno(ret);
        if (ret)
                goto failed_removal;

        pfn = start_pfn;
        expire = jiffies + timeout;
        drain = 0;
        retry_max = 5;
repeat:
        /* start memory hot removal */
        ret = -EAGAIN;
        if (time_after(jiffies, expire))
                goto failed_removal;
        ret = -EINTR;
        if (signal_pending(current))
                goto failed_removal;
        ret = 0;
        if (drain) {
                lru_add_drain_all();
                cond_resched();
                drain_all_pages();
        }

        pfn = scan_lru_pages(start_pfn, end_pfn);
        if (pfn) { /* We have pages on the LRU */
                ret = do_migrate_range(pfn, end_pfn);
                if (!ret) {
                        drain = 1;
                        goto repeat;
                } else {
                        if (ret < 0)
                                if (--retry_max == 0)
                                        goto failed_removal;
                        yield();
                        drain = 1;
                        goto repeat;
                }
        }
        /* drain all zones' LRU pagevecs; this is asynchronous... */
        lru_add_drain_all();
        yield();
        /* drain pcp pages; this is synchronous. */
        drain_all_pages();
        /* check again */
        offlined_pages = check_pages_isolated(start_pfn, end_pfn);
        if (offlined_pages < 0) {
                ret = -EBUSY;
                goto failed_removal;
        }
        printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
        /* Ok, all of our target is isolated.
           We cannot do rollback at this point. */
        offline_isolated_pages(start_pfn, end_pfn);
        /* reset pageblock flags and make the migrate type MOVABLE */
        undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
        /* removal success */
        zone->present_pages -= offlined_pages;
        zone->zone_pgdat->node_present_pages -= offlined_pages;
        totalram_pages -= offlined_pages;

        init_per_zone_wmark_min();

        if (!populated_zone(zone)) {
                zone_pcp_reset(zone);
                mutex_lock(&zonelists_mutex);
                build_all_zonelists(NULL, NULL);
                mutex_unlock(&zonelists_mutex);
        } else
                zone_pcp_update(zone);

        if (!node_present_pages(node)) {
                node_clear_state(node, N_HIGH_MEMORY);
                kswapd_stop(node);
        }

        vm_total_pages = nr_free_pagecache_pages();
        writeback_set_ratelimit();

        memory_notify(MEM_OFFLINE, &arg);
        unlock_memory_hotplug();
        return 0;

failed_removal:
        printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
               (unsigned long long) start_pfn << PAGE_SHIFT,
               ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
        memory_notify(MEM_CANCEL_OFFLINE, &arg);
        /* pushback to free area */
        undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);

out:
        unlock_memory_hotplug();
        return ret;
}

int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
        return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}

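/*
 * Offline every memory block that intersects the range
 * [start, start + size); stops and returns the error if offlining any
 * block fails.
 */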
int remove_memory(u64 start, u64 size)
{
        struct memory_block *mem = NULL;
        struct mem_section *section;
        unsigned long start_pfn, end_pfn;
        unsigned long pfn, section_nr;
        int ret;

        start_pfn = PFN_DOWN(start);
        end_pfn = start_pfn + PFN_DOWN(size);

        for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                section_nr = pfn_to_section_nr(pfn);
                if (!present_section_nr(section_nr))
                        continue;

                section = __nr_to_section(section_nr);
                /* same memblock? */
                if (mem)
                        if ((section_nr >= mem->start_section_nr) &&
                            (section_nr <= mem->end_section_nr))
                                continue;

                mem = find_memory_block_hinted(section, mem);
                if (!mem)
                        continue;

                ret = offline_memory_block(mem);
                if (ret) {
                        kobject_put(&mem->dev.kobj);
                        return ret;
                }
        }

        if (mem)
                kobject_put(&mem->dev.kobj);

        return 0;
}
#else
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
        return -EINVAL;
}
int remove_memory(u64 start, u64 size)
{
        return -EINVAL;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
EXPORT_SYMBOL_GPL(remove_memory);