+
+/**
+ * offline_memory_block_cb - try to offline one memory block
+ * @mem: the memory block to be offlined
+ * @arg: pointer to an int that records the first error code seen
+ *
+ * Calls offline_memory_block() on @mem.  If that fails and the int behind
+ * @arg is still 0, the error code is stored there; an error that has already
+ * been recorded is never overwritten.
+ *
+ * Always returns 0; failures are reported only through @arg.
+ */
+static int offline_memory_block_cb(struct memory_block *mem, void *arg)
+{
+	int *first_err = arg;
+	int rc;
+
+	rc = offline_memory_block(mem);
+	if (rc && !*first_err)
+		*first_err = rc;
+
+	return 0;
+}
+
+/*
+ * is_memblock_offlined_cb - report whether a memory block is still online
+ * @mem: the memory block to check
+ * @arg: unused
+ *
+ * Returns 0 when is_memblock_offlined() reports @mem as offline.  Otherwise
+ * prints a warning with the address range spanned by the block's sections
+ * (start_section_nr through end_section_nr, inclusive) and returns 1.
+ */
+static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
+{
+	int ret = !is_memblock_offlined(mem);
+
+	if (unlikely(ret)) {
+		/*
+		 * PFN_PHYS() is not guaranteed to produce an unsigned long
+		 * long (in the kernel it yields phys_addr_t, which can be
+		 * 32 bits wide), so widen the values explicitly to match
+		 * the %llx conversions below.
+		 */
+		unsigned long long begin, end;
+
+		begin = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
+		end = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1)) - 1;
+
+		pr_warn("removing memory fails, because memory "
+			"[%#010llx-%#010llx] is onlined\n",
+			begin, end);
+	}
+
+	return ret;
+}
+
+/*
+ * check_cpu_on_node - test whether any present CPU still belongs to a node
+ * @data: the node's struct pglist_data, passed as void *
+ *
+ * Returns -EBUSY if cpu_to_node() of some present CPU equals the node's id,
+ * 0 otherwise.
+ */
+static int check_cpu_on_node(void *data)
+{
+	struct pglist_data *node = data;
+	int busy = 0;
+	int cpu;
+
+	for_each_present_cpu(cpu) {
+		if (cpu_to_node(cpu) != node->node_id)
+			continue;
+
+		/*
+		 * A cpu on this node has not been removed yet, so the node
+		 * cannot be offlined.
+		 */
+		busy = 1;
+		break;
+	}
+
+	return busy ? -EBUSY : 0;
+}
+
+/*
+ * unmap_cpu_on_node - clear the node mapping of every CPU on a node
+ * @data: the node's struct pglist_data, passed as void *
+ *
+ * With CONFIG_ACPI_NUMA, calls numa_clear_node() for each possible CPU whose
+ * cpu_to_node() equals the node's id.  Without it, this is a no-op.
+ */
+static void unmap_cpu_on_node(void *data)
+{
+#ifdef CONFIG_ACPI_NUMA
+	struct pglist_data *node = data;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu_to_node(cpu) != node->node_id)
+			continue;
+
+		numa_clear_node(cpu);
+	}
+#endif
+}
+
+/*
+ * check_and_unmap_cpu_on_node - verify a node has no CPUs, then unmap them
+ * @data: the node's struct pglist_data, passed as void *
+ *
+ * Returns the non-zero result of check_cpu_on_node() unchanged when that
+ * check fails.  Otherwise runs unmap_cpu_on_node() and returns 0.
+ */
+static int check_and_unmap_cpu_on_node(void *data)
+{
+	int busy = check_cpu_on_node(data);
+
+	if (!busy) {
+		/*
+		 * The node is going to be offlined once we get here, so its
+		 * cpu_to_node() mappings can be cleared now.
+		 */
+		unmap_cpu_on_node(data);
+	}
+
+	return busy;
+}
+
+/*
+ * try_offline_node - offline a node once all of its memory and CPUs are gone
+ * @nid: id of the node to try to offline
+ *
+ * Returns without doing anything if a present memory section inside the
+ * node's spanned pfn range still belongs to @nid, or if
+ * check_and_unmap_cpu_on_node(), run through stop_machine(), reports failure.
+ * Otherwise the node is marked offline and unregistered.  When the pgdat page
+ * is a slab or compound page, vmalloc'ed zone wait tables are freed and the
+ * pgdat contents are zeroed as well.
+ */
+void try_offline_node(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	unsigned long first_pfn = pgdat->node_start_pfn;
+	unsigned long limit_pfn = first_pfn + pgdat->node_spanned_pages;
+	struct page *pgdat_page = virt_to_page(pgdat);
+	unsigned long pfn;
+	int zid;
+
+	for (pfn = first_pfn; pfn < limit_pfn; pfn += PAGES_PER_SECTION) {
+		/*
+		 * A present section that belongs to this node means some of
+		 * the node's memory has not been removed, so the node cannot
+		 * be offlined now.
+		 */
+		if (present_section_nr(pfn_to_section_nr(pfn)) &&
+		    pfn_to_nid(pfn) == nid)
+			return;
+	}
+
+	if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
+		return;
+
+	/*
+	 * All memory and cpus of this node have been removed, so the node
+	 * can be offlined now.
+	 */
+	node_set_offline(nid);
+	unregister_one_node(nid);
+
+	/* node data is allocated from boot memory: nothing more to do */
+	if (!(PageSlab(pgdat_page) || PageCompound(pgdat_page)))
+		return;
+
+	/* free the wait table of each zone */
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		struct zone *zone = &pgdat->node_zones[zid];
+
+		/*
+		 * wait_table may be allocated from boot memory; only free it
+		 * here if it was allocated by vmalloc.
+		 */
+		if (is_vmalloc_addr(zone->wait_table))
+			vfree(zone->wait_table);
+	}
+
+	/*
+	 * There is no way to guarantee that the address of pgdat/zone is not
+	 * on the stack of some kernel thread or used by other kernel objects
+	 * without reference counting or another synchronizing method, so do
+	 * not reset node_data and do not free pgdat here.  Just zero its
+	 * contents and reuse the memory when the node is onlined again.
+	 */
+	memset(pgdat, 0, sizeof(*pgdat));
+}
+EXPORT_SYMBOL(try_offline_node);
+
+/*
+ * remove_memory - offline and remove a range of memory
+ * @nid: node id handed to try_offline_node() once the range is removed
+ * @start: start address of the range (converted to a pfn with PFN_DOWN())
+ * @size: size of the range, in the same units as @start
+ *
+ * Returns 0 on success.  Returns the first offlining error if the second
+ * offlining pass still fails, or the non-zero result of the
+ * is_memblock_offlined_cb() walk if a block in the range is still online.
+ */
+int __ref remove_memory(int nid, u64 start, u64 size)
+{
+	unsigned long start_pfn = PFN_DOWN(start);
+	unsigned long end_pfn = PFN_UP(start + size - 1);
+	int pass;
+	int ret;
+
+	/*
+	 * When CONFIG_MEMCG is on, one memory block may be used by other
+	 * blocks to store page cgroup when onlining pages, and we don't know
+	 * in what order pages were onlined.  So offlining is attempted at
+	 * most twice:
+	 * 1st pass: offline every non primary memory block.
+	 * 2nd pass: offline primary (i.e. first added) memory block.
+	 */
+	for (pass = 0; ; pass++) {
+		ret = 0;
+		walk_memory_range(start_pfn, end_pfn, &ret,
+				  offline_memory_block_cb);
+		if (!ret)
+			break;
+
+		/* the second pass failed as well: give up */
+		if (pass > 0)
+			return ret;
+	}
+
+	lock_memory_hotplug();
+
+	/*
+	 * The memory blocks above were offlined one at a time like this:
+	 * 1. lock memory hotplug
+	 * 2. offline a memory block
+	 * 3. unlock memory hotplug
+	 *
+	 * All memory blocks must be offlined before removing memory, but the
+	 * lock was not held across the whole operation.  So check again,
+	 * under the lock, that every memory block really is offlined.
+	 */
+	ret = walk_memory_range(start_pfn, end_pfn, NULL,
+				is_memblock_offlined_cb);
+	if (ret)
+		goto out_unlock;
+
+	/* remove memmap entry */
+	firmware_map_remove(start, start + size, "System RAM");
+
+	arch_remove_memory(start, size);
+
+	try_offline_node(nid);
+
+out_unlock:
+	unlock_memory_hotplug();
+
+	return ret;
+}