// SPDX-License-Identifier: GPL-2.0
/*
 * Virtual Memory Map support
 *
 * (C) 2007 sgi. Christoph Lameter.
 *
 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
 * virt_to_page, page_address() to be implemented as a base offset
 * calculation without memory access.
 *
 * However, virtual mappings need a page table and TLBs. Many Linux
 * architectures already map their physical space using 1-1 mappings
 * via TLBs. For those arches the virtual memory map is essentially
 * for free if we use the same page size as the 1-1 mappings. In that
 * case the overhead consists of a few additional pages that are
 * allocated to create a view of memory for vmemmap.
 *
 * The architecture is expected to provide a vmemmap_populate() function
 * to instantiate the mapping.
 */
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/bootmem_info.h>

#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:          called for each lowest-level entry (PTE).
 * @nr_walked:          the number of walked PTEs.
 * @reuse_page:         the page which is reused for the tail vmemmap pages.
 * @reuse_addr:         the virtual address of the @reuse_page page.
 * @vmemmap_pages:      the list head of the vmemmap pages that can be freed
 *                      or that the new mapping is created from.
 */
struct vmemmap_remap_walk {
        void (*remap_pte)(pte_t *pte, unsigned long addr,
                          struct vmemmap_remap_walk *walk);
        unsigned long nr_walked;
        struct page *reuse_page;
        unsigned long reuse_addr;
        struct list_head *vmemmap_pages;
};
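
/*
 * Illustrative sketch (not an actual caller): a walk that frees the tail
 * vmemmap pages of a HugeTLB page is typically set up as
 *
 *	LIST_HEAD(vmemmap_pages);
 *	struct vmemmap_remap_walk walk = {
 *		.remap_pte	= vmemmap_remap_pte,
 *		.reuse_addr	= reuse,
 *		.vmemmap_pages	= &vmemmap_pages,
 *	};
 *
 * See vmemmap_remap_free() and vmemmap_remap_alloc() below for the real
 * users of this structure.
 */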

static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
        pmd_t __pmd;
        int i;
        unsigned long addr = start;
        struct page *page = pmd_page(*pmd);
        pte_t *pgtable = pte_alloc_one_kernel(&init_mm);

        if (!pgtable)
                return -ENOMEM;

        pmd_populate_kernel(&init_mm, &__pmd, pgtable);

        for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
                pte_t entry, *pte;
                pgprot_t pgprot = PAGE_KERNEL;

                entry = mk_pte(page + i, pgprot);
                pte = pte_offset_kernel(&__pmd, addr);
                set_pte_at(&init_mm, addr, pte, entry);
        }

        spin_lock(&init_mm.page_table_lock);
        if (likely(pmd_leaf(*pmd))) {
                /*
                 * Higher order allocations from buddy allocator must be able to
                 * be treated as independent small pages (as they can be freed
                 * individually).
                 */
                if (!PageReserved(page))
                        split_page(page, get_order(PMD_SIZE));

                /* Make pte visible before pmd. See comment in pmd_install(). */
                smp_wmb();
                pmd_populate_kernel(&init_mm, pmd, pgtable);
                flush_tlb_kernel_range(start, start + PMD_SIZE);
        } else {
                pte_free_kernel(&init_mm, pgtable);
        }
        spin_unlock(&init_mm.page_table_lock);

        return 0;
}

static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
        int leaf;

        spin_lock(&init_mm.page_table_lock);
        leaf = pmd_leaf(*pmd);
        spin_unlock(&init_mm.page_table_lock);

        if (!leaf)
                return 0;

        return __split_vmemmap_huge_pmd(pmd, start);
}
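
/*
 * The pmd_leaf() check above is done with the lock dropped again before the
 * actual split, so two concurrent walkers may both reach
 * __split_vmemmap_huge_pmd(). The re-check of pmd_leaf() under
 * init_mm.page_table_lock in that helper is what keeps the split safe: the
 * loser of the race simply frees the PTE table it pre-allocated.
 */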

static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
                              unsigned long end,
                              struct vmemmap_remap_walk *walk)
{
        pte_t *pte = pte_offset_kernel(pmd, addr);

        /*
         * The reuse_page is found 'first' in the table walk, before we start
         * remapping (i.e. before calling @walk->remap_pte).
         */
        if (!walk->reuse_page) {
                walk->reuse_page = pte_page(*pte);
                /*
                 * Because the reuse address is part of the range that we are
                 * walking, skip the reuse address range.
                 */
                addr += PAGE_SIZE;
                pte++;
                walk->nr_walked++;
        }

        for (; addr != end; addr += PAGE_SIZE, pte++) {
                walk->remap_pte(pte, addr, walk);
                walk->nr_walked++;
        }
}

static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
                             unsigned long end,
                             struct vmemmap_remap_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                int ret;

                ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
                if (ret)
                        return ret;

                next = pmd_addr_end(addr, end);
                vmemmap_pte_range(pmd, addr, next, walk);
        } while (pmd++, addr = next, addr != end);

        return 0;
}

static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
                             unsigned long end,
                             struct vmemmap_remap_walk *walk)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(p4d, addr);
        do {
                int ret;

                next = pud_addr_end(addr, end);
                ret = vmemmap_pmd_range(pud, addr, next, walk);
                if (ret)
                        return ret;
        } while (pud++, addr = next, addr != end);

        return 0;
}

static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
                             unsigned long end,
                             struct vmemmap_remap_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;

        p4d = p4d_offset(pgd, addr);
        do {
                int ret;

                next = p4d_addr_end(addr, end);
                ret = vmemmap_pud_range(p4d, addr, next, walk);
                if (ret)
                        return ret;
        } while (p4d++, addr = next, addr != end);

        return 0;
}

static int vmemmap_remap_range(unsigned long start, unsigned long end,
                               struct vmemmap_remap_walk *walk)
{
        unsigned long addr = start;
        unsigned long next;
        pgd_t *pgd;

        VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
        VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));

        pgd = pgd_offset_k(addr);
        do {
                int ret;

                next = pgd_addr_end(addr, end);
                ret = vmemmap_p4d_range(pgd, addr, next, walk);
                if (ret)
                        return ret;
        } while (pgd++, addr = next, addr != end);

        /*
         * We only change the mapping of the vmemmap virtual address range
         * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
         * belongs to the range.
         */
        flush_tlb_kernel_range(start + PAGE_SIZE, end);

        return 0;
}
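
/*
 * Callers of vmemmap_remap_range() pass the reuse address as @start, so the
 * very first PTE in the range is only recorded (as walk->reuse_page) and is
 * never modified by vmemmap_pte_range(). That is why the TLB flush above can
 * start at @start + PAGE_SIZE.
 */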

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator; just free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
        if (PageReserved(page))
                free_bootmem_page(page);
        else
                __free_page(page);
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
        struct page *page, *next;

        list_for_each_entry_safe(page, next, list, lru) {
                list_del(&page->lru);
                free_vmemmap_page(page);
        }
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
                              struct vmemmap_remap_walk *walk)
{
        /*
         * Remap the tail pages as read-only to catch illegal write operations
         * to the tail pages.
         */
        pgprot_t pgprot = PAGE_KERNEL_RO;
        pte_t entry = mk_pte(walk->reuse_page, pgprot);
        struct page *page = pte_page(*pte);

        list_add_tail(&page->lru, walk->vmemmap_pages);
        set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values will
 * be caught by free_tail_pages_check(). In order to avoid the message of
 * "corrupted mapping in tail page", we need to reset at least 3 struct page
 * structs (one head struct page struct and two tail struct page structs).
 */
#define NR_RESET_STRUCT_PAGE            3

static inline void reset_struct_pages(struct page *start)
{
        int i;
        struct page *from = start + NR_RESET_STRUCT_PAGE;

        for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
                memcpy(start + i, from, sizeof(*from));
}
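
/*
 * reset_struct_pages() copies from the struct pages that immediately follow
 * the reset range within the same vmemmap page: after the copy_page() in
 * vmemmap_restore_pte() below, those entries hold ordinary tail struct
 * pages, so they serve as a template for clearing head-page metadata from
 * the first NR_RESET_STRUCT_PAGE entries.
 */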

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
                                struct vmemmap_remap_walk *walk)
{
        pgprot_t pgprot = PAGE_KERNEL;
        struct page *page;
        void *to;

        BUG_ON(pte_page(*pte) != walk->reuse_page);

        page = list_first_entry(walk->vmemmap_pages, struct page, lru);
        list_del(&page->lru);
        to = page_to_virt(page);
        copy_page(to, (void *)walk->reuse_addr);
        reset_struct_pages(to);

        set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *                      to the page which @reuse is mapped to, then free the
 *                      vmemmap pages which the range was mapped to.
 * @start:      start address of the vmemmap virtual address range that we want
 *              to remap.
 * @end:        end address of the vmemmap virtual address range that we want to
 *              remap.
 * @reuse:      reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vmemmap_remap_free(unsigned long start, unsigned long end,
                       unsigned long reuse)
{
        int ret;
        LIST_HEAD(vmemmap_pages);
        struct vmemmap_remap_walk walk = {
                .remap_pte      = vmemmap_remap_pte,
                .reuse_addr     = reuse,
                .vmemmap_pages  = &vmemmap_pages,
        };

        /*
         * In order to make the remapping routine most efficient for huge pages,
         * the routine of vmemmap page table walking has the following rules
         * (see more details in vmemmap_pte_range()):
         *
         * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
         *   should be contiguous.
         * - The @reuse address is part of the range [@reuse, @end) that we are
         *   walking, which is passed to vmemmap_remap_range().
         * - The @reuse address is the first in the complete range.
         *
         * So we need to make sure that @start and @reuse meet the above rules.
         */
        BUG_ON(start - reuse != PAGE_SIZE);

        mmap_read_lock(&init_mm);
        ret = vmemmap_remap_range(reuse, end, &walk);
        if (ret && walk.nr_walked) {
                end = reuse + walk.nr_walked * PAGE_SIZE;
                /*
                 * vmemmap_pages contains pages from the previous
                 * vmemmap_remap_range call which failed.  These
                 * are pages which were removed from the vmemmap.
                 * They will be restored in the following call.
                 */
                walk = (struct vmemmap_remap_walk) {
                        .remap_pte      = vmemmap_restore_pte,
                        .reuse_addr     = reuse,
                        .vmemmap_pages  = &vmemmap_pages,
                };

                vmemmap_remap_range(reuse, end, &walk);
        }
        mmap_read_unlock(&init_mm);

        free_vmemmap_page_list(&vmemmap_pages);

        return ret;
}
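
/*
 * Example (illustrative, assuming x86_64 with 4K base pages and a 64-byte
 * struct page): a 2 MB HugeTLB page is described by 512 struct pages, i.e.
 * 8 vmemmap pages. The caller passes @reuse pointing at the first vmemmap
 * page, @start = @reuse + PAGE_SIZE and @end = @reuse + 8 * PAGE_SIZE, so
 * the remaining 7 vmemmap pages are remapped (read-only) to the reuse page
 * and then freed. See Documentation/vm/vmemmap_dedup.rst.
 */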

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
                                   gfp_t gfp_mask, struct list_head *list)
{
        unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
        int nid = page_to_nid((struct page *)start);
        struct page *page, *next;

        while (nr_pages--) {
                page = alloc_pages_node(nid, gfp_mask, 0);
                if (!page)
                        goto out;
                list_add_tail(&page->lru, list);
        }

        return 0;
out:
        list_for_each_entry_safe(page, next, list, lru)
                __free_pages(page, 0);
        return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *                       to pages which are freshly allocated, one for each
 *                       vmemmap page in the range.
 * @start:      start address of the vmemmap virtual address range that we want
 *              to remap.
 * @end:        end address of the vmemmap virtual address range that we want to
 *              remap.
 * @reuse:      reuse address.
 * @gfp_mask:   GFP flag for allocating vmemmap pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vmemmap_remap_alloc(unsigned long start, unsigned long end,
                        unsigned long reuse, gfp_t gfp_mask)
{
        LIST_HEAD(vmemmap_pages);
        struct vmemmap_remap_walk walk = {
                .remap_pte      = vmemmap_restore_pte,
                .reuse_addr     = reuse,
                .vmemmap_pages  = &vmemmap_pages,
        };

        /* See the comment in vmemmap_remap_free(). */
        BUG_ON(start - reuse != PAGE_SIZE);

        if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
                return -ENOMEM;

        mmap_read_lock(&init_mm);
        vmemmap_remap_range(reuse, end, &walk);
        mmap_read_unlock(&init_mm);

        return 0;
}
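
/*
 * vmemmap_remap_alloc() is the inverse of vmemmap_remap_free(): it is called
 * with the same @start/@end/@reuse to rebuild a full vmemmap, e.g. before a
 * previously optimized HugeTLB page is returned to the buddy allocator.
 */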
#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */

/*
 * Allocate a block of memory to be used to back the virtual memory map
 * or to back the page tables that are used to create the mapping.
 * Uses the main allocators if they are available, else bootmem.
 */

static void * __ref __earlyonly_bootmem_alloc(int node,
                                unsigned long size,
                                unsigned long align,
                                unsigned long goal)
{
        return memblock_alloc_try_nid_raw(size, align, goal,
                                               MEMBLOCK_ALLOC_ACCESSIBLE, node);
}

void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
        /* If the main allocator is up use that, otherwise fall back to bootmem. */
        if (slab_is_available()) {
                gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
                int order = get_order(size);
                static bool warned;
                struct page *page;

                page = alloc_pages_node(node, gfp_mask, order);
                if (page)
                        return page_address(page);

                if (!warned) {
                        warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
                                   "vmemmap alloc failure: order:%u", order);
                        warned = true;
                }
                return NULL;
        } else
                return __earlyonly_bootmem_alloc(node, size, size,
                                __pa(MAX_DMA_ADDRESS));
}
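
/*
 * Note on the GFP flags above: __GFP_RETRY_MAYFAIL makes the allocation try
 * hard without invoking the OOM killer, and __GFP_NOWARN suppresses the
 * generic allocation-failure splat; the warn_alloc() call replaces it with a
 * single "vmemmap alloc failure" message per boot (the static 'warned' flag).
 */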

static void * __meminit altmap_alloc_block_buf(unsigned long size,
                                               struct vmem_altmap *altmap);

/* need to make sure the size is always the same during the early stage */
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
                                         struct vmem_altmap *altmap)
{
        void *ptr;

        if (altmap)
                return altmap_alloc_block_buf(size, altmap);

        ptr = sparse_buffer_alloc(size);
        if (!ptr)
                ptr = vmemmap_alloc_block(size, node);
        return ptr;
}

static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
{
        return altmap->base_pfn + altmap->reserve + altmap->alloc
                + altmap->align;
}

static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
{
        unsigned long allocated = altmap->alloc + altmap->align;

        if (altmap->free > allocated)
                return altmap->free - allocated;
        return 0;
}

static void * __meminit altmap_alloc_block_buf(unsigned long size,
                                               struct vmem_altmap *altmap)
{
        unsigned long pfn, nr_pfns, nr_align;

        if (size & ~PAGE_MASK) {
                pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
                                __func__, size);
                return NULL;
        }

        pfn = vmem_altmap_next_pfn(altmap);
        nr_pfns = size >> PAGE_SHIFT;
        nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
        nr_align = ALIGN(pfn, nr_align) - pfn;
        if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
                return NULL;

        altmap->alloc += nr_pfns;
        altmap->align += nr_align;
        pfn += nr_align;

        pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
                        __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
        return __va(__pfn_to_phys(pfn));
}
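
/*
 * Illustrative example of the accounting above: if the next free pfn in the
 * altmap is 0x10001 and a 2 MB buffer is requested with 4K pages
 * (nr_pfns = 512), find_first_bit() yields an alignment of 512 pfns, so
 * nr_align = 0x1ff pfns of padding are charged to altmap->align and the
 * allocation itself starts at pfn 0x10200.
 */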

void __meminit vmemmap_verify(pte_t *pte, int node,
                                unsigned long start, unsigned long end)
{
        unsigned long pfn = pte_pfn(*pte);
        int actual_node = early_pfn_to_nid(pfn);

        if (node_distance(actual_node, node) > LOCAL_DISTANCE)
                pr_warn_once("[%lx-%lx] potential offnode page_structs\n",
                        start, end - 1);
}

pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
                                       struct vmem_altmap *altmap,
                                       struct page *reuse)
{
        pte_t *pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte)) {
                pte_t entry;
                void *p;

                if (!reuse) {
                        p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
                        if (!p)
                                return NULL;
                } else {
                        /*
                         * When a PTE/PMD entry is freed from the init_mm
                         * there's a free_pages() call to this page allocated
                         * above. Thus this get_page() is paired with the
                         * put_page_testzero() on the freeing path.
                         * This can only be called by certain ZONE_DEVICE paths,
                         * and through vmemmap_populate_compound_pages() when
                         * slab is available.
                         */
                        get_page(reuse);
                        p = page_to_virt(reuse);
                }
                entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
                set_pte_at(&init_mm, addr, pte, entry);
        }
        return pte;
}

static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
{
        void *p = vmemmap_alloc_block(size, node);

        if (!p)
                return NULL;
        memset(p, 0, size);

        return p;
}

pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
        pmd_t *pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd)) {
                void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
                if (!p)
                        return NULL;
                pmd_populate_kernel(&init_mm, pmd, p);
        }
        return pmd;
}

pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
        pud_t *pud = pud_offset(p4d, addr);
        if (pud_none(*pud)) {
                void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
                if (!p)
                        return NULL;
                pud_populate(&init_mm, pud, p);
        }
        return pud;
}

p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
        p4d_t *p4d = p4d_offset(pgd, addr);
        if (p4d_none(*p4d)) {
                void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
                if (!p)
                        return NULL;
                p4d_populate(&init_mm, p4d, p);
        }
        return p4d;
}

pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
        pgd_t *pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd)) {
                void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
                if (!p)
                        return NULL;
                pgd_populate(&init_mm, pgd, p);
        }
        return pgd;
}

static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
                                              struct vmem_altmap *altmap,
                                              struct page *reuse)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = vmemmap_pgd_populate(addr, node);
        if (!pgd)
                return NULL;
        p4d = vmemmap_p4d_populate(pgd, addr, node);
        if (!p4d)
                return NULL;
        pud = vmemmap_pud_populate(p4d, addr, node);
        if (!pud)
                return NULL;
        pmd = vmemmap_pmd_populate(pud, addr, node);
        if (!pmd)
                return NULL;
        pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
        if (!pte)
                return NULL;
        vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

        return pte;
}

static int __meminit vmemmap_populate_range(unsigned long start,
                                            unsigned long end, int node,
                                            struct vmem_altmap *altmap,
                                            struct page *reuse)
{
        unsigned long addr = start;
        pte_t *pte;

        for (; addr < end; addr += PAGE_SIZE) {
                pte = vmemmap_populate_address(addr, node, altmap, reuse);
                if (!pte)
                        return -ENOMEM;
        }

        return 0;
}

int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
                                         int node, struct vmem_altmap *altmap)
{
        return vmemmap_populate_range(start, end, node, altmap, NULL);
}
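
/*
 * vmemmap_populate_basepages() is the simple, non-deduplicated case: every
 * page of the vmemmap in [start, end) gets its own backing page, allocated
 * via vmemmap_alloc_block_buf() (which honours @altmap); no page reuse
 * takes place.
 */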

/*
 * For compound pages bigger than section size (e.g. x86 1G compound
 * pages with 2M subsection size) fill the rest of sections as tail
 * pages.
 *
 * Note that memremap_pages() resets @nr_range value and will increment
 * it after each successful range onlining. Thus the value of @nr_range
 * at section memmap populate corresponds to the in-progress range
 * being onlined here.
 */
static bool __meminit reuse_compound_section(unsigned long start_pfn,
                                             struct dev_pagemap *pgmap)
{
        unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
        unsigned long offset = start_pfn -
                PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);

        return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
}
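
/*
 * Example: with 1G device-dax compound pages (pgmap_vmemmap_nr() == 262144
 * with 4K base pages), every section that does not start exactly at a
 * compound-page boundary has a non-aligned @offset, so its tail struct pages
 * can be mapped to the vmemmap pages populated for the previous section.
 */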

static pte_t * __meminit compound_section_tail_page(unsigned long addr)
{
        pte_t *pte;

        addr -= PAGE_SIZE;

        /*
         * Assuming sections are populated sequentially, the previous section's
         * page data can be reused.
         */
        pte = pte_offset_kernel(pmd_off_k(addr), addr);
        if (!pte)
                return NULL;

        return pte;
}

static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
                                                     unsigned long start,
                                                     unsigned long end, int node,
                                                     struct dev_pagemap *pgmap)
{
        unsigned long size, addr;
        pte_t *pte;
        int rc;

        if (reuse_compound_section(start_pfn, pgmap)) {
                pte = compound_section_tail_page(start);
                if (!pte)
                        return -ENOMEM;

                /*
                 * Reuse the page that was populated in the prior iteration
                 * with just tail struct pages.
                 */
                return vmemmap_populate_range(start, end, node, NULL,
                                              pte_page(*pte));
        }

        size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
        for (addr = start; addr < end; addr += size) {
                unsigned long next = addr, last = addr + size;

                /* Populate the head page vmemmap page */
                pte = vmemmap_populate_address(addr, node, NULL, NULL);
                if (!pte)
                        return -ENOMEM;

                /* Populate the tail pages vmemmap page */
                next = addr + PAGE_SIZE;
                pte = vmemmap_populate_address(next, node, NULL, NULL);
                if (!pte)
                        return -ENOMEM;

                /*
                 * Reuse the previous page for the rest of tail pages
                 * See layout diagram in Documentation/vm/vmemmap_dedup.rst
                 */
                next += PAGE_SIZE;
                rc = vmemmap_populate_range(next, last, node, NULL,
                                            pte_page(*pte));
                if (rc)
                        return -ENOMEM;
        }

        return 0;
}
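
/*
 * Layout produced above (illustrative, assuming a 64-byte struct page and
 * 2 MB compound pages): each compound page needs 8 vmemmap pages. Only the
 * first two are backed by fresh pages (one containing the head struct page,
 * one holding the first tail struct pages); the remaining 6 are populated
 * with that same tail page via the 'reuse' argument of
 * vmemmap_populate_range(). See Documentation/vm/vmemmap_dedup.rst.
 */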

struct page * __meminit __populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
                struct dev_pagemap *pgmap)
{
        unsigned long start = (unsigned long) pfn_to_page(pfn);
        unsigned long end = start + nr_pages * sizeof(struct page);
        int r;

        if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
                !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
                return NULL;

        if (is_power_of_2(sizeof(struct page)) &&
            pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
                r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
        else
                r = vmemmap_populate(start, end, nid, altmap);

        if (r < 0)
                return NULL;

        return pfn_to_page(pfn);
}