// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"
static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};
int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}
#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}
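
/*
 * Worked example (illustrative, not part of the original file): with a
 * memory block size of 128 MiB -- a common value on x86-64, see
 * memory_block_size_bytes() -- physical address 0x80000000 (2 GiB) lies
 * in PFN 0x80000, section 0x80000000 >> 27 = 16, and thus block id 16,
 * i.e. /sys/devices/system/memory/memory16. The exact block size is
 * architecture and configuration dependent.
 */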
static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};
/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the memory bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1
static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
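
/*
 * A minimal sketch of a consumer of this notifier chain (illustrative
 * only; the callback and variable names below are made up). The callback
 * receives a struct memory_notify describing the affected range:
 *
 *	static int example_mem_cb(struct notifier_block *nb,
 *				  unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		if (action == MEM_GOING_OFFLINE)
 *			pr_info("offlining %lu pages at PFN %#lx\n",
 *				mn->nr_pages, mn->start_pfn);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_mem_nb = {
 *		.notifier_call = example_mem_cb,
 *	};
 *
 *	register_memory_notifier(&example_mem_nb);
 */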
static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	/* Verify that the altmap is freed */
	WARN_ON(mem->altmap);
	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);
/* Show the memory block ID, relative to the memory block size */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);

	return sysfs_emit(buf, "%08lx\n", memory_block_id(mem->start_section_nr));
}
/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}
/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}
int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
static unsigned long memblk_nr_poison(struct memory_block *mem);
#else
static inline unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return 0;
}
#endif
/*
 * Must acquire mem_hotplug_lock in write mode.
 */
static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	struct zone *zone;
	int ret;

	if (memblk_nr_poison(mem))
		return -EHWPOISON;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g. vmemmaps
	 * belong to the same zone as the memory they back.
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	mem_hotplug_begin();
	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
		if (ret)
			goto out;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		goto out;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	mem->zone = zone;
out:
	mem_hotplug_done();
	return ret;
}
/*
 * Must acquire mem_hotplug_lock in write mode.
 */
static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	int ret;

	if (!mem->zone)
		return -EINVAL;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	mem_hotplug_begin();
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		goto out;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	mem->zone = NULL;
out:
	mem_hotplug_done();
	return ret;
}
/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}
static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}
/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}
static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}
static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
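
/*
 * From userspace, the state attribute is the documented way to online or
 * offline a block (the block number below is made up):
 *
 *	# echo online_movable > /sys/devices/system/memory/memory42/state
 *	# echo offline > /sys/devices/system/memory/memory42/state
 *
 * Writes are parsed by mhp_online_type_from_str() above; reads go through
 * state_show().
 */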
/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}
#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}
static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len = 0;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes otherwise the page_zone is not reliable
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * If !mem->zone, the memory block spans multiple zones and
		 * cannot get offlined.
		 */
		default_zone = mem->zone;
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif
static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);
/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);
/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);
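
/*
 * The policy applied to newly added memory blocks can be changed at
 * runtime, e.g. (one of "offline", "online", "online_kernel",
 * "online_movable"):
 *
 *	# echo online_movable > /sys/devices/system/memory/auto_online_blocks
 */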
#ifdef CONFIG_CRASH_HOTPLUG
#include <linux/kexec.h>
static ssize_t crash_hotplug_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", crash_hotplug_memory_support());
}
static DEVICE_ATTR_RO(crash_hotplug);
#endif
/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif
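
/*
 * On architectures that select CONFIG_ARCH_MEMORY_PROBE, userspace can
 * request addition of a memory block by writing its block-aligned
 * physical start address (the address below is made up):
 *
 *	# echo 0x100000000 > /sys/devices/system/memory/probe
 */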
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, MF_SW_SIMULATED);
	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif
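
/*
 * Both attributes take the physical address of a page (not a PFN); the
 * value is shifted right by PAGE_SHIFT above. For example (address made
 * up):
 *
 *	# echo 0x3fa55000 > /sys/devices/system/memory/soft_offline_page
 *	# echo 0x3fa55000 > /sys/devices/system/memory/hard_offline_page
 */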
/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}
/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
	unsigned long block_id = memory_block_id(section_nr);

	return find_memory_block_by_id(block_id);
}
static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};
static int __add_memory_block(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret)
		device_unregister(&memory->dev);

	return ret;
}
static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
						     int nid)
{
	const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *zone, *matching_zone = NULL;
	pg_data_t *pgdat = NODE_DATA(nid);
	int i;

	/*
	 * This logic only works for early memory, when the applicable zones
	 * already span the memory block. We don't expect overlapping zones on
	 * a single node for early memory. So if we're told that some PFNs
	 * of a node fall into this memory block, we can assume that all node
	 * zones that intersect with the memory block are actually applicable.
	 * No need to look at the memmap.
	 */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		zone = pgdat->node_zones + i;
		if (!populated_zone(zone))
			continue;
		if (!zone_intersects(zone, start_pfn, nr_pages))
			continue;
		if (!matching_zone) {
			matching_zone = zone;
			continue;
		}
		/* Spans multiple zones ... */
		matching_zone = NULL;
		break;
	}
	return matching_zone;
}
#ifdef CONFIG_NUMA
/**
 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 *			    block device (partially) belongs to the given node.
 * @mem: The memory block device.
 * @nid: The node id.
 * @context: The memory initialization context.
 *
 * Indicate that system RAM falling into this memory block (partially) belongs
 * to the given node. If the context indicates ("early") that we are adding the
 * node during node device subsystem initialization, this will also properly
 * set/adjust mem->zone based on the zone ranges of the given node.
 */
void memory_block_add_nid(struct memory_block *mem, int nid,
			  enum meminit_context context)
{
	if (context == MEMINIT_EARLY && mem->nid != nid) {
		/*
		 * For early memory we have to determine the zone when setting
		 * the node id and handle multiple nodes spanning a single
		 * memory block by indicating via zone == NULL that we're not
		 * dealing with a single zone. So if we're setting the node id
		 * the first time, determine if there is a single zone. If we're
		 * setting the node id a second time to a different node,
		 * invalidate the single detected zone.
		 */
		if (mem->nid == NUMA_NO_NODE)
			mem->zone = early_node_zone_for_memory_block(mem, nid);
		else
			mem->zone = NULL;
	}

	/*
	 * If this memory block spans multiple nodes, we only indicate
	 * the last processed node. If we span multiple nodes (not applicable
	 * to hotplugged memory), zone == NULL will prohibit memory offlining
	 * and consequently unplug.
	 */
	mem->nid = nid;
}
#endif
static int add_memory_block(unsigned long block_id, unsigned long state,
			    struct vmem_altmap *altmap,
			    struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->altmap = altmap;
	INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
	if (state == MEM_ONLINE)
		/*
		 * MEM_ONLINE at this point implies early memory. With NUMA,
		 * we'll determine the zone when setting the node id via
		 * memory_block_add_nid(). Memory hotplug updates the zone
		 * manually when memory onlining/offlining succeeds.
		 */
		mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
#endif /* CONFIG_NUMA */

	ret = __add_memory_block(mem);
	if (ret)
		return ret;

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	return 0;
}
static int __init add_boot_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return add_memory_block(memory_block_id(base_section_nr),
				MEM_ONLINE, NULL, NULL);
}

static int add_hotplug_memory_block(unsigned long block_id,
				    struct vmem_altmap *altmap,
				    struct memory_group *group)
{
	return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
}
static void remove_memory_block(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}
/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				struct vmem_altmap *altmap,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = add_hotplug_memory_block(block_id, altmap, group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			remove_memory_block(mem);
		}
	}
	return ret;
}
/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
		unregister_memory_block_under_nodes(mem);
		remove_memory_block(mem);
	}
}
static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
#ifdef CONFIG_CRASH_HOTPLUG
	&dev_attr_crash_hotplug.attr,
#endif
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};
/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_boot_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}
/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}
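
/*
 * A minimal usage sketch (illustrative only; the callback name is made
 * up): count the memory blocks overlapping a physical range.
 *
 *	static int count_blocks_cb(struct memory_block *mem, void *arg)
 *	{
 *		(*(unsigned long *)arg)++;
 *		return 0;
 *	}
 *
 *	unsigned long nr_blocks = 0;
 *
 *	walk_memory_blocks(start, size, &nr_blocks, count_blocks_cb);
 */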
struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}
/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}
/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	} else if (group.is_dynamic) {
		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
	}
	return mgid;
}
/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);
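
/*
 * A hedged usage sketch (illustrative only; the caller's names and error
 * handling are assumptions): a driver hot-adding a DIMM-like range would
 * register a static group once and then pass the group id as the nid
 * argument together with MHP_NID_IS_MGID when adding the memory:
 *
 *	mgid = memory_group_register_static(nid, PFN_UP(size));
 *	if (mgid < 0)
 *		return mgid;
 *	rc = add_memory_driver_managed(mgid, start, size, resource_name,
 *				       MHP_NID_IS_MGID);
 */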
/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);
/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory is managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}
/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg)
{
	struct memory_group *group;
	unsigned long index;
	int ret = 0;

	xa_for_each_marked(&memory_groups, index, group,
			   MEMORY_GROUP_MARK_DYNAMIC) {
		if (group == excluded)
			continue;
#ifdef CONFIG_NUMA
		if (nid != NUMA_NO_NODE && group->nid != nid)
			continue;
#endif /* CONFIG_NUMA */
		ret = func(group, arg);
		if (ret)
			break;
	}
	return ret;
}
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
void memblk_nr_poison_inc(unsigned long pfn)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_inc(&mem->nr_hwpoison);
}

void memblk_nr_poison_sub(unsigned long pfn, long i)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_sub(i, &mem->nr_hwpoison);
}

static unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return atomic_long_read(&mem->nr_hwpoison);
}
#endif