mm/page_ext.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/mm.h>
   3 #include <linux/mmzone.h>
   4 #include <linux/memblock.h>
   5 #include <linux/page_ext.h>
   6 #include <linux/memory.h>
   7 #include <linux/vmalloc.h>
   8 #include <linux/kmemleak.h>
   9 #include <linux/page_owner.h>
  10 #include <linux/page_idle.h>
  11 #include <linux/page_table_check.h>
  12
  13 /*
  14  * struct page extension
  15  *
  16  * This is the feature to manage memory for extended data per page.
  17  *
  18  * Until now, we had to modify struct page itself to store extra data per page.
  19  * This requires rebuilding the kernel, which is a really time-consuming process.
  20  * Sometimes a rebuild is impossible due to third-party module dependencies.
  21  * Finally, enlarging struct page could cause unwanted changes in system behaviour.
  22  *
  23  * This feature is intended to overcome the above-mentioned problems. It
  24  * allocates memory for extended per-page data in a separate place rather than
  25  * in struct page itself. This memory can be accessed by the accessor
  26  * functions provided by this code. During the boot process, it checks whether
  27  * allocation of a huge chunk of memory is needed or not. If not, it avoids
  28  * allocating memory at all. Thanks to this, we can include this feature
  29  * in the kernel by default, avoiding rebuilds and the related problems.
  30  *
  31  * To help these things to work well, there are two callbacks for clients. One
  32  * is the need callback which is mandatory if user wants to avoid useless
  33  * memory allocation at boot-time. The other is optional, init callback, which
  34  * is used to do proper initialization after memory is allocated.
  35  *
  36  * The need callback is used to decide whether extended memory allocation is
  37  * needed or not. Sometimes users want to deactivate some features in this
  38  * boot and extra memory would be unnecessary. In this case, to avoid
  39  * allocating a huge chunk of memory, each client expresses its need for
  40  * extra memory through the need callback. If any of the need callbacks
  41  * returns true, it means that someone needs extra memory, so the
  42  * page extension core should allocate memory for page extension. If
  43  * none of the need callbacks returns true, memory isn't needed at all in this
  44  * boot and the page extension core can skip allocating memory. As a result,
  45  * no memory is wasted.
  46  *
  47  * When need callback returns true, page_ext checks if there is a request for
  48  * extra memory through size in struct page_ext_operations. If it is non-zero,
  49  * extra space is allocated for each page_ext entry and offset is returned to
  50  * user through offset in struct page_ext_operations.
  51  *
  52  * The init callback is used to do proper initialization after page extension
  53  * is completely initialized. In a sparse memory system, extra memory is
  54  * allocated some time after the memmap is allocated. In other words, the
  55  * lifetime of the memory for page extension isn't the same as that of the
  56  * memmap for struct page. Therefore, clients can't store extra data until
  57  * page extension is initialized, even if pages are allocated and used freely.
  58  * This could leave the extra per-page data in an inconsistent state, so, to
  59  * prevent it, a client can use this callback to initialize its state correctly.
  60  */
  61
  62 #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
  63 static bool need_page_idle(void)
  64 {
  65         return true;
  66 }
  67 static struct page_ext_operations page_idle_ops __initdata = {
  68         .need = need_page_idle,
  69 };
  70 #endif
  71
  72 static struct page_ext_operations *page_ext_ops[] __initdata = {
  73 #ifdef CONFIG_PAGE_OWNER
  74         &page_owner_ops,
  75 #endif
  76 #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
  77         &page_idle_ops,
  78 #endif
  79 #ifdef CONFIG_PAGE_TABLE_CHECK
  80         &page_table_check_ops,
  81 #endif
  82 };
  83
  84 unsigned long page_ext_size = sizeof(struct page_ext);
  85
  86 static unsigned long total_usage;
  87
  88 static bool __init invoke_need_callbacks(void)
  89 {
  90         int i;
  91         int entries = ARRAY_SIZE(page_ext_ops);
  92         bool need = false;
  93
  94         for (i = 0; i < entries; i++) {
  95                 if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
  96                         page_ext_ops[i]->offset = page_ext_size;
  97                         page_ext_size += page_ext_ops[i]->size;
  98                         need = true;
  99                 }
 100         }
 101
 102         return need;
 103 }
 104
 105 static void __init invoke_init_callbacks(void)
 106 {
 107         int i;
 108         int entries = ARRAY_SIZE(page_ext_ops);
 109
 110         for (i = 0; i < entries; i++) {
 111                 if (page_ext_ops[i]->init)
 112                         page_ext_ops[i]->init();
 113         }
 114 }
 115
 116 #ifndef CONFIG_SPARSEMEM
 117 void __init page_ext_init_flatmem_late(void)
 118 {
 119         invoke_init_callbacks();
 120 }
 121 #endif
 122
 123 static inline struct page_ext *get_entry(void *base, unsigned long index)
 124 {
 125         return base + page_ext_size * index;
 126 }
 127
 128 #ifndef CONFIG_SPARSEMEM
 129
 130
 131 void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
 132 {
 133         pgdat->node_page_ext = NULL;
 134 }
 135
 136 struct page_ext *lookup_page_ext(const struct page *page)
 137 {
 138         unsigned long pfn = page_to_pfn(page);
 139         unsigned long index;
 140         struct page_ext *base;
 141
 142         base = NODE_DATA(page_to_nid(page))->node_page_ext;
 143         /*
 144          * The sanity checks the page allocator does upon freeing a
 145          * page can reach here before the page_ext arrays are
 146          * allocated when feeding a range of pages to the allocator
 147          * for the first time during bootup or memory hotplug.
 148          */
 149         if (unlikely(!base))
 150                 return NULL;
 151         index = pfn - round_down(node_start_pfn(page_to_nid(page)),
 152                                         MAX_ORDER_NR_PAGES);
 153         return get_entry(base, index);
 154 }
 155
 156 static int __init alloc_node_page_ext(int nid)
 157 {
 158         struct page_ext *base;
 159         unsigned long table_size;
 160         unsigned long nr_pages;
 161
 162         nr_pages = NODE_DATA(nid)->node_spanned_pages;
 163         if (!nr_pages)
 164                 return 0;
 165
 166         /*
 167          * Need extra space if node range is not aligned with
 168          * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
 169          * checks buddy's status, range could be out of exact node range.
 170          */
 171         if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
 172                 !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
 173                 nr_pages += MAX_ORDER_NR_PAGES;
 174
 175         table_size = page_ext_size * nr_pages;
 176
 177         base = memblock_alloc_try_nid(
 178                         table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
 179                         MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 180         if (!base)
 181                 return -ENOMEM;
 182         NODE_DATA(nid)->node_page_ext = base;
 183         total_usage += table_size;
 184         return 0;
 185 }
 186
 187 void __init page_ext_init_flatmem(void)
 188 {
 189
 190         int nid, fail;
 191
 192         if (!invoke_need_callbacks())
 193                 return;
 194
 195         for_each_online_node(nid)  {
 196                 fail = alloc_node_page_ext(nid);
 197                 if (fail)
 198                         goto fail;
 199         }
 200         pr_info("allocated %ld bytes of page_ext\n", total_usage);
 201         return;
 202
 203 fail:
 204         pr_crit("allocation of page_ext failed.\n");
 205         panic("Out of memory");
 206 }
 207
 208 #else /* CONFIG_SPARSEMEM */
 209
 210 struct page_ext *lookup_page_ext(const struct page *page)
 211 {
 212         unsigned long pfn = page_to_pfn(page);
 213         struct mem_section *section = __pfn_to_section(pfn);
 214         /*
 215          * The sanity checks the page allocator does upon freeing a
 216          * page can reach here before the page_ext arrays are
 217          * allocated when feeding a range of pages to the allocator
 218          * for the first time during bootup or memory hotplug.
 219          */
 220         if (!section->page_ext)
 221                 return NULL;
 222         return get_entry(section->page_ext, pfn);
 223 }
 224
 225 static void *__meminit alloc_page_ext(size_t size, int nid)
 226 {
 227         gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
 228         void *addr = NULL;
 229
 230         addr = alloc_pages_exact_nid(nid, size, flags);
 231         if (addr) {
 232                 kmemleak_alloc(addr, size, 1, flags);
 233                 return addr;
 234         }
 235
 236         addr = vzalloc_node(size, nid);
 237
 238         return addr;
 239 }
 240
 241 static int __meminit init_section_page_ext(unsigned long pfn, int nid)
 242 {
 243         struct mem_section *section;
 244         struct page_ext *base;
 245         unsigned long table_size;
 246
 247         section = __pfn_to_section(pfn);
 248
 249         if (section->page_ext)
 250                 return 0;
 251
 252         table_size = page_ext_size * PAGES_PER_SECTION;
 253         base = alloc_page_ext(table_size, nid);
 254
 255         /*
 256          * The value stored in section->page_ext is (base - pfn)
 257          * and it does not point to the memory block allocated above,
 258          * causing kmemleak false positives.
 259          */
 260         kmemleak_not_leak(base);
 261
 262         if (!base) {
 263                 pr_err("page ext allocation failure\n");
 264                 return -ENOMEM;
 265         }
 266
 267         /*
 268          * The passed "pfn" may not be aligned to SECTION.  For the calculation
 269          * we need to apply a mask.
 270          */
 271         pfn &= PAGE_SECTION_MASK;
 272         section->page_ext = (void *)base - page_ext_size * pfn;
 273         total_usage += table_size;
 274         return 0;
 275 }
 276
 277 static void free_page_ext(void *addr)
 278 {
 279         if (is_vmalloc_addr(addr)) {
 280                 vfree(addr);
 281         } else {
 282                 struct page *page = virt_to_page(addr);
 283                 size_t table_size;
 284
 285                 table_size = page_ext_size * PAGES_PER_SECTION;
 286
 287                 BUG_ON(PageReserved(page));
 288                 kmemleak_free(addr);
 289                 free_pages_exact(addr, table_size);
 290         }
 291 }
 292
 293 static void __free_page_ext(unsigned long pfn)
 294 {
 295         struct mem_section *ms;
 296         struct page_ext *base;
 297
 298         ms = __pfn_to_section(pfn);
 299         if (!ms || !ms->page_ext)
 300                 return;
 301         base = get_entry(ms->page_ext, pfn);
 302         free_page_ext(base);
 303         ms->page_ext = NULL;
 304 }
 305
 306 static int __meminit online_page_ext(unsigned long start_pfn,
 307                                 unsigned long nr_pages,
 308                                 int nid)
 309 {
 310         unsigned long start, end, pfn;
 311         int fail = 0;
 312
 313         start = SECTION_ALIGN_DOWN(start_pfn);
 314         end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 315
 316         if (nid == NUMA_NO_NODE) {
 317                 /*
 318                  * In this case, "nid" already exists and contains valid memory.
 319                  * "start_pfn" passed to us is a pfn which is an arg for
 320                  * online__pages(), and start_pfn should exist.
 321                  */
 322                 nid = pfn_to_nid(start_pfn);
 323                 VM_BUG_ON(!node_online(nid));
 324         }
 325
 326         for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
 327                 fail = init_section_page_ext(pfn, nid);
 328         if (!fail)
 329                 return 0;
 330
 331         /* rollback */
 332         for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 333                 __free_page_ext(pfn);
 334
 335         return -ENOMEM;
 336 }
 337
 338 static int __meminit offline_page_ext(unsigned long start_pfn,
 339                                 unsigned long nr_pages, int nid)
 340 {
 341         unsigned long start, end, pfn;
 342
 343         start = SECTION_ALIGN_DOWN(start_pfn);
 344         end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 345
 346         for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 347                 __free_page_ext(pfn);
 348         return 0;
 349
 350 }
 351
 352 static int __meminit page_ext_callback(struct notifier_block *self,
 353                                unsigned long action, void *arg)
 354 {
 355         struct memory_notify *mn = arg;
 356         int ret = 0;
 357
 358         switch (action) {
 359         case MEM_GOING_ONLINE:
 360                 ret = online_page_ext(mn->start_pfn,
 361                                    mn->nr_pages, mn->status_change_nid);
 362                 break;
 363         case MEM_OFFLINE:
 364                 offline_page_ext(mn->start_pfn,
 365                                 mn->nr_pages, mn->status_change_nid);
 366                 break;
 367         case MEM_CANCEL_ONLINE:
 368                 offline_page_ext(mn->start_pfn,
 369                                 mn->nr_pages, mn->status_change_nid);
 370                 break;
 371         case MEM_GOING_OFFLINE:
 372                 break;
 373         case MEM_ONLINE:
 374         case MEM_CANCEL_OFFLINE:
 375                 break;
 376         }
 377
 378         return notifier_from_errno(ret);
 379 }
 380
 381 void __init page_ext_init(void)
 382 {
 383         unsigned long pfn;
 384         int nid;
 385
 386         if (!invoke_need_callbacks())
 387                 return;
 388
 389         for_each_node_state(nid, N_MEMORY) {
 390                 unsigned long start_pfn, end_pfn;
 391
 392                 start_pfn = node_start_pfn(nid);
 393                 end_pfn = node_end_pfn(nid);
 394                 /*
 395                  * start_pfn and end_pfn may not be aligned to SECTION and the
 396                  * page->flags of out of node pages are not initialized.  So we
 397                  * scan [start_pfn, the biggest section's pfn < end_pfn) here.
 398                  */
 399                 for (pfn = start_pfn; pfn < end_pfn;
 400                         pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
 401
 402                         if (!pfn_valid(pfn))
 403                                 continue;
 404                         /*
 405                          * Nodes's pfns can be overlapping.
 406                          * We know some arch can have a nodes layout such as
 407                          * -------------pfn-------------->
 408                          * N0 | N1 | N2 | N0 | N1 | N2|....
 409                          */
 410                         if (pfn_to_nid(pfn) != nid)
 411                                 continue;
 412                         if (init_section_page_ext(pfn, nid))
 413                                 goto oom;
 414                         cond_resched();
 415                 }
 416         }
 417         hotplug_memory_notifier(page_ext_callback, 0);
 418         pr_info("allocated %ld bytes of page_ext\n", total_usage);
 419         invoke_init_callbacks();
 420         return;
 421
 422 oom:
 423         panic("Out of memory");
 424 }
 425
 426 void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
 427 {
 428 }
 429
 430 #endif