1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright IBM Corp. 2006
6 #include <linux/memory_hotplug.h>
7 #include <linux/memblock.h>
8 #include <linux/kasan.h>
11 #include <linux/init.h>
12 #include <linux/list.h>
13 #include <linux/hugetlb.h>
14 #include <linux/slab.h>
15 #include <linux/sort.h>
16 #include <asm/cacheflush.h>
17 #include <asm/nospec-branch.h>
18 #include <asm/pgalloc.h>
19 #include <asm/setup.h>
20 #include <asm/tlbflush.h>
21 #include <asm/sections.h>
22 #include <asm/set_memory.h>
/*
 * Mutex taken by the entry points in this file that change kernel page
 * tables (vmemmap_populate(), vmemmap_free(), vmem_add_mapping(),
 * vmem_remove_mapping(), vmem_map_4k_page(), vmem_unmap_4k_page()).
 */
24 static DEFINE_MUTEX(vmem_mutex);
/*
 * Allocate PAGE_SIZE << order bytes of memory.
 * Once slab_is_available() reports true the pages come from
 * __get_free_pages(GFP_KERNEL, order); before that, from
 * memblock_alloc(size, size).
 */
26 static void __ref *vmem_alloc_pages(unsigned int order)
28 unsigned long size = PAGE_SIZE << order;
30 if (slab_is_available())
31 return (void *)__get_free_pages(GFP_KERNEL, order);
32 return memblock_alloc(size, size);
/*
 * Release the pages at @addr (2^order pages) with free_pages().
 * The guard below tests for "slab not yet available" and warns once if the
 * page is marked PageReserved.
 * NOTE(review): the statement controlled by the guard is not visible in
 * this listing - presumably an early return; confirm against full source.
 */
35 static void vmem_free_pages(unsigned long addr, int order)
37 /* We don't expect boot memory to be removed ever. */
38 if (!slab_is_available() ||
39 WARN_ON_ONCE(PageReserved(virt_to_page(addr))))
41 free_pages(addr, order);
/*
 * Allocate a table of order CRST_ALLOC_ORDER through vmem_alloc_pages()
 * and initialize it with crst_table_init(table, val).
 * NOTE(review): the declaration of 'table', any allocation-failure check
 * and the return statement are not visible in this listing.
 */
44 void *vmem_crst_alloc(unsigned long val)
48 table = vmem_alloc_pages(CRST_ALLOC_ORDER);
50 crst_table_init(table, val);
/*
 * Allocate a page table with PTRS_PER_PTE entries and set every entry to
 * _PAGE_INVALID.
 * With slab available the table comes from page_table_alloc(&init_mm);
 * the memblock_alloc() call is presumably the else branch (the 'else' line
 * is not visible in this listing - confirm against full source).
 */
54 pte_t __ref *vmem_pte_alloc(void)
/* Size in bytes of one full PTE table; also passed as second memblock_alloc() argument. */
56 unsigned long size = PTRS_PER_PTE * sizeof(pte_t);
59 if (slab_is_available())
60 pte = (pte_t *) page_table_alloc(&init_mm);
62 pte = (pte_t *) memblock_alloc(size, size);
/* Fill all PTRS_PER_PTE 64-bit entries with the invalid pattern. */
65 memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
/*
 * Hand a PTE table back via page_table_free(&init_mm, table).
 * Uses the same guard as vmem_free_pages(): slab not available, or the
 * page being PageReserved (warns once).
 * NOTE(review): the statement controlled by the guard is not visible in
 * this listing - presumably an early return.
 */
69 static void vmem_pte_free(unsigned long *table)
71 /* We don't expect boot memory to be removed ever. */
72 if (!slab_is_available() ||
73 WARN_ON_ONCE(PageReserved(virt_to_page(table))))
75 page_table_free(&init_mm, table);
/*
 * Byte pattern written with memset() over vmemmap ranges that are not in
 * use; vmemmap_unuse_sub_pmd() checks with memchr_inv() whether a whole
 * PMD_SIZE area consists only of this pattern.
 */
78 #define PAGE_UNUSED 0xFD
81 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges
82 * from unused_sub_pmd_start to next PMD_SIZE boundary.
/* A value of 0 means that no such range is pending. */
84 static unsigned long unused_sub_pmd_start;
/*
 * Write PAGE_UNUSED over the pending range that starts at
 * unused_sub_pmd_start and ends at the next PMD_SIZE boundary, then clear
 * unused_sub_pmd_start.
 * NOTE(review): the statement under the initial !unused_sub_pmd_start
 * check is not visible - presumably an early return when nothing is pending.
 */
86 static void vmemmap_flush_unused_sub_pmd(void)
88 if (!unused_sub_pmd_start)
90 memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
91 ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
92 unused_sub_pmd_start = 0;
/*
 * Mark a sub-PMD vmemmap range as used by zeroing sizeof(struct page)
 * bytes at @start. @end is not referenced by the visible statement.
 */
95 static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
98 * As we expect to add in the same granularity as we remove, it's
99 * sufficient to mark only some piece used to block the memmap page from
100 * getting removed (just in case the memmap never gets initialized,
101 * e.g., because the memory block never gets onlined).
103 memset((void *)start, 0, sizeof(struct page));
/*
 * Account for [start, end) being used inside an already populated PMD.
 * If @start continues exactly at unused_sub_pmd_start, the pending unused
 * range is just moved up to @end (and dropped when @end is PMD_SIZE
 * aligned). The flush + mark calls at the bottom handle the other case.
 * NOTE(review): the lines closing the fast path (presumably a return) are
 * not visible in this listing.
 */
106 static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
109 * We only optimize if the new used range directly follows the
110 * previously unused range (esp., when populating consecutive sections).
112 if (unused_sub_pmd_start == start) {
113 unused_sub_pmd_start = end;
/* Reaching a PMD_SIZE boundary leaves nothing unused in this PMD. */
114 if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
115 unused_sub_pmd_start = 0;
118 vmemmap_flush_unused_sub_pmd();
119 vmemmap_mark_sub_pmd_used(start, end);
/*
 * Account for [start, end) being used inside a PMD-sized area.
 * Flushes any pending unused range first, marks the new range used, fills
 * the part of the area before @start with PAGE_UNUSED, and remembers @end
 * in unused_sub_pmd_start if the range does not end on a PMD_SIZE boundary.
 */
122 static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
/* Start address of the PMD_SIZE area containing @start. */
124 unsigned long page = ALIGN_DOWN(start, PMD_SIZE);
126 vmemmap_flush_unused_sub_pmd();
128 /* Could be our memmap page is filled with PAGE_UNUSED already ... */
129 vmemmap_mark_sub_pmd_used(start, end);
131 /* Mark the unused parts of the new memmap page PAGE_UNUSED. */
132 if (!IS_ALIGNED(start, PMD_SIZE))
133 memset((void *)page, PAGE_UNUSED, start - page);
135 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
136 * consecutive sections. Remember for the last added PMD the last
137 * unused range in the populated PMD.
139 if (!IS_ALIGNED(end, PMD_SIZE))
140 unused_sub_pmd_start = end;
143 /* Returns true if the PMD is completely unused and can be freed. */
/*
 * Mark [start, end) as unused by filling it with PAGE_UNUSED, after
 * flushing any pending unused range. The return value is true when
 * memchr_inv() finds no byte other than PAGE_UNUSED in the whole PMD_SIZE
 * area containing @start.
 */
144 static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
/* Start address of the PMD_SIZE area containing @start. */
146 unsigned long page = ALIGN_DOWN(start, PMD_SIZE);
148 vmemmap_flush_unused_sub_pmd();
149 memset((void *)start, PAGE_UNUSED, end - start);
150 return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
153 /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
/*
 * Walk the PTEs below @pmd that cover [addr, end) in PAGE_SIZE steps.
 * Visible actions: one branch frees the page behind a PTE (order 0) and
 * clears the PTE; for an empty PTE either a page obtained from
 * vmemmap_alloc_block() or the walked address itself (__pa(addr)) is
 * mapped with protection derived from PAGE_KERNEL. Finally the
 * PG_DIRECT_MAP_4K counter is adjusted by +pages or -pages depending on @add.
 * NOTE(review): the tests on @add/@direct that select these actions, the
 * condition for clearing _PAGE_NOEXEC, error handling and the return value
 * are not visible in this listing.
 */
154 static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
155 unsigned long end, bool add, bool direct)
157 unsigned long prot, pages = 0;
161 prot = pgprot_val(PAGE_KERNEL);
163 prot &= ~_PAGE_NOEXEC;
165 pte = pte_offset_kernel(pmd, addr);
166 for (; addr < end; addr += PAGE_SIZE, pte++) {
171 vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
172 pte_clear(&init_mm, addr, pte);
173 } else if (pte_none(*pte)) {
175 void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
179 set_pte(pte, __pte(__pa(new_page) | prot));
181 set_pte(pte, __pte(__pa(addr) | prot));
191 update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
/*
 * Iterate over all PTRS_PER_PTE entries of the PTE table below @pmd and
 * release the table with vmem_pte_free().
 * NOTE(review): the loop body and anything between the loop and the free
 * are not visible here; presumably the table is only freed when every
 * entry is empty - confirm against full source.
 */
195 static void try_free_pte_table(pmd_t *pmd, unsigned long start)
200 /* We can safely assume this is fully in 1:1 mapping & vmemmap area */
201 pte = pte_offset_kernel(pmd, start);
202 for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
206 vmem_pte_free((unsigned long *) pmd_deref(*pmd));
210 /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
/*
 * Walk the PMD entries below @pud that cover [addr, end); 'next' is the end
 * of the current entry's share as computed by pmd_addr_end().
 * Visible handling:
 *  - large entry: vmem_free_pages() of the PMD_SIZE backing appears both
 *    for the fully covered case (addr and next PMD_SIZE aligned) and for
 *    the !direct case in which vmemmap_unuse_sub_pmd() returns true;
 *  - empty entry: a large 1:1 mapping (aligned range, MACHINE_HAS_EDAT1,
 *    direct, no debug_pagealloc), or for !direct with MACHINE_HAS_EDAT1 a
 *    PMD_SIZE block from vmemmap_alloc_block(), or a new PTE table from
 *    vmem_pte_alloc();
 *  - otherwise modify_pte_table() handles the range, and
 *    try_free_pte_table() is called for the PMD-aligned address.
 * The PG_DIRECT_MAP_1M counter is adjusted by +pages or -pages per @add.
 * NOTE(review): declarations, the tests on @add, error handling and the
 * return value are not visible in this listing, so the exact nesting of
 * the branches must be confirmed against the full source.
 */
211 static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
212 unsigned long end, bool add, bool direct)
214 unsigned long next, prot, pages = 0;
219 prot = pgprot_val(SEGMENT_KERNEL);
221 prot &= ~_SEGMENT_ENTRY_NOEXEC;
223 pmd = pmd_offset(pud, addr);
224 for (; addr < end; addr = next, pmd++) {
225 next = pmd_addr_end(addr, end);
229 if (pmd_large(*pmd)) {
230 if (IS_ALIGNED(addr, PMD_SIZE) &&
231 IS_ALIGNED(next, PMD_SIZE)) {
233 vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
236 } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
237 vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
242 } else if (pmd_none(*pmd)) {
243 if (IS_ALIGNED(addr, PMD_SIZE) &&
244 IS_ALIGNED(next, PMD_SIZE) &&
245 MACHINE_HAS_EDAT1 && direct &&
246 !debug_pagealloc_enabled()) {
247 set_pmd(pmd, __pmd(__pa(addr) | prot));
250 } else if (!direct && MACHINE_HAS_EDAT1) {
254 * Use 1MB frames for vmemmap if available. We
255 * always use large frames even if they are only
256 * partially used. Otherwise we would have also
257 * page tables since vmemmap_populate gets
258 * called for each section separately.
260 new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
262 set_pmd(pmd, __pmd(__pa(new_page) | prot));
/* Partially covered frame: track which part of it is in use. */
263 if (!IS_ALIGNED(addr, PMD_SIZE) ||
264 !IS_ALIGNED(next, PMD_SIZE)) {
265 vmemmap_use_new_sub_pmd(addr, next);
270 pte = vmem_pte_alloc();
273 pmd_populate(&init_mm, pmd, pte);
274 } else if (pmd_large(*pmd)) {
276 vmemmap_use_sub_pmd(addr, next);
279 ret = modify_pte_table(pmd, addr, next, add, direct);
283 try_free_pte_table(pmd, addr & PMD_MASK);
288 update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
/*
 * Iterate over all PTRS_PER_PMD entries of the PMD table below @pud and
 * release the table (order CRST_ALLOC_ORDER) with vmem_free_pages().
 * @start is the beginning of the PUD_SIZE area the table covers.
 * NOTE(review): the statement under the VMALLOC_START check, the loop body
 * and anything between loop and free are not visible; presumably the table
 * is left alone unless it lies below VMALLOC_START and all entries are
 * empty - confirm against full source.
 */
292 static void try_free_pmd_table(pud_t *pud, unsigned long start)
294 const unsigned long end = start + PUD_SIZE;
298 /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
299 if (end > VMALLOC_START)
302 pmd = pmd_offset(pud, start);
303 for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
306 vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
/*
 * Walk the PUD entries below @p4d that cover [addr, end); 'next' is the end
 * of the current entry's share as computed by pud_addr_end().
 * Visible handling: large entries are tested for full PUD_SIZE coverage;
 * an empty entry gets either a large 1:1 mapping (aligned range,
 * MACHINE_HAS_EDAT2, direct, no debug_pagealloc) or a new PMD table from
 * vmem_crst_alloc(); otherwise modify_pmd_table() handles the range and
 * try_free_pmd_table() is called for the PUD-aligned address.
 * The PG_DIRECT_MAP_2G counter is adjusted by +pages or -pages per @add.
 * NOTE(review): declarations, the tests on @add, the bodies of the
 * large-entry branches, error handling and the return value are not
 * visible in this listing.
 */
310 static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
311 bool add, bool direct)
313 unsigned long next, prot, pages = 0;
318 prot = pgprot_val(REGION3_KERNEL);
320 prot &= ~_REGION_ENTRY_NOEXEC;
321 pud = pud_offset(p4d, addr);
322 for (; addr < end; addr = next, pud++) {
323 next = pud_addr_end(addr, end);
327 if (pud_large(*pud)) {
328 if (IS_ALIGNED(addr, PUD_SIZE) &&
329 IS_ALIGNED(next, PUD_SIZE)) {
335 } else if (pud_none(*pud)) {
336 if (IS_ALIGNED(addr, PUD_SIZE) &&
337 IS_ALIGNED(next, PUD_SIZE) &&
338 MACHINE_HAS_EDAT2 && direct &&
339 !debug_pagealloc_enabled()) {
340 set_pud(pud, __pud(__pa(addr) | prot));
344 pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
347 pud_populate(&init_mm, pud, pmd);
348 } else if (pud_large(*pud)) {
351 ret = modify_pmd_table(pud, addr, next, add, direct);
355 try_free_pmd_table(pud, addr & PUD_MASK);
360 update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
/*
 * Iterate over all PTRS_PER_PUD entries of the PUD table below @p4d and
 * release the table (order CRST_ALLOC_ORDER) with vmem_free_pages().
 * @start is the beginning of the P4D_SIZE area the table covers.
 * NOTE(review): the statement under the VMALLOC_START check and the loop
 * body are not visible; presumably the table is freed only when it lies
 * below VMALLOC_START and all entries are empty - confirm.
 */
364 static void try_free_pud_table(p4d_t *p4d, unsigned long start)
366 const unsigned long end = start + P4D_SIZE;
370 /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
371 if (end > VMALLOC_START)
374 pud = pud_offset(p4d, start);
375 for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
379 vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
/*
 * Walk the P4D entries below @pgd that cover [addr, end); 'next' is the end
 * of the current entry's share as computed by p4d_addr_end().
 * An empty entry gets a new PUD table from vmem_crst_alloc(); the range is
 * then passed to modify_pud_table(), and try_free_pud_table() is called
 * for the P4D-aligned address.
 * NOTE(review): declarations, the first branch of the if/else chain, the
 * tests on @add, error handling and the return value are not visible in
 * this listing.
 */
383 static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
384 bool add, bool direct)
391 p4d = p4d_offset(pgd, addr);
392 for (; addr < end; addr = next, p4d++) {
393 next = p4d_addr_end(addr, end);
397 } else if (p4d_none(*p4d)) {
398 pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
401 p4d_populate(&init_mm, p4d, pud);
403 ret = modify_pud_table(p4d, addr, next, add, direct);
407 try_free_pud_table(p4d, addr & P4D_MASK);
/*
 * Iterate over all PTRS_PER_P4D entries of the P4D table below @pgd and
 * release the table (order CRST_ALLOC_ORDER) with vmem_free_pages().
 * @start is the beginning of the PGDIR_SIZE area the table covers.
 * NOTE(review): the statement under the VMALLOC_START check and the loop
 * body are not visible; presumably the table is freed only when it lies
 * below VMALLOC_START and all entries are empty - confirm.
 */
414 static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
416 const unsigned long end = start + PGDIR_SIZE;
420 /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
421 if (end > VMALLOC_START)
424 p4d = p4d_offset(pgd, start);
425 for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
429 vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
/*
 * Top level of the page table walk for the kernel address range
 * [start, end): iterates PGD entries (pgd_offset_k()), allocates a P4D
 * table for an empty entry, descends via modify_p4d_table(), calls
 * try_free_p4d_table() for the PGDIR-aligned address, and finally flushes
 * the TLB for the whole range with flush_tlb_kernel_range().
 * @start and @end must both be page aligned; a violation triggers a
 * one-time warning.
 * NOTE(review): the remaining parameter(s) on the continuation line of the
 * signature, the action taken after the alignment warning, the tests on
 * @add, error handling and the return value are not visible in this listing.
 */
433 static int modify_pagetable(unsigned long start, unsigned long end, bool add,
436 unsigned long addr, next;
441 if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
443 for (addr = start; addr < end; addr = next) {
444 next = pgd_addr_end(addr, end);
445 pgd = pgd_offset_k(addr);
450 } else if (pgd_none(*pgd)) {
451 p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
454 pgd_populate(&init_mm, pgd, p4d);
456 ret = modify_p4d_table(pgd, addr, next, add, direct);
460 try_free_p4d_table(pgd, addr & PGDIR_MASK);
465 flush_tlb_kernel_range(start, end);
/* Wrapper: modify_pagetable() for [start, end) with add == true. */
469 static int add_pagetable(unsigned long start, unsigned long end, bool direct)
471 return modify_pagetable(start, end, true, direct);
/* Wrapper: modify_pagetable() for [start, end) with add == false. */
474 static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
476 return modify_pagetable(start, end, false, direct);
480 * Add a physical memory range to the 1:1 mapping.
/*
 * @start is a physical address; it is converted with __va() and the
 * resulting virtual range of @size bytes is passed to add_pagetable()
 * with direct == true. Returns add_pagetable()'s result.
 */
482 static int vmem_add_range(unsigned long start, unsigned long size)
484 start = (unsigned long)__va(start);
485 return add_pagetable(start, start + size, true);
489 * Remove a physical memory range from the 1:1 mapping.
/*
 * @start is a physical address; it is converted with __va() and the
 * resulting virtual range of @size bytes is passed to remove_pagetable()
 * with direct == true. The result of remove_pagetable() is not used.
 */
491 static void vmem_remove_range(unsigned long start, unsigned long size)
493 start = (unsigned long)__va(start);
494 remove_pagetable(start, start + size, true);
498 * Add a backed mem_map array to the virtual mem_map array.
/*
 * Populate page tables for the vmemmap range [start, end) under
 * vmem_mutex, using add_pagetable() with direct == false.
 * @node and @altmap are not referenced by the visible statements.
 * NOTE(review): the condition in front of the remove_pagetable() call is
 * not visible - presumably it undoes a failed add (ret != 0); the return
 * statement is not visible either.
 */
500 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
501 struct vmem_altmap *altmap)
505 mutex_lock(&vmem_mutex);
506 /* We don't care about the node, just use NUMA_NO_NODE on allocations */
507 ret = add_pagetable(start, end, false);
509 remove_pagetable(start, end, false);
510 mutex_unlock(&vmem_mutex);
/*
 * Remove the page tables for the vmemmap range [start, end) under
 * vmem_mutex, using remove_pagetable() with direct == false.
 * @altmap is not referenced by the visible statements.
 */
514 void vmemmap_free(unsigned long start, unsigned long end,
515 struct vmem_altmap *altmap)
517 mutex_lock(&vmem_mutex);
518 remove_pagetable(start, end, false);
519 mutex_unlock(&vmem_mutex);
/*
 * Locked wrapper around vmem_remove_range(): removes @size bytes starting
 * at physical address @start from the 1:1 mapping while holding vmem_mutex.
 */
522 void vmem_remove_mapping(unsigned long start, unsigned long size)
524 mutex_lock(&vmem_mutex);
525 vmem_remove_range(start, size);
526 mutex_unlock(&vmem_mutex);
/*
 * Build a struct range whose end is VMEM_MAX_PHYS - 1; vmem_add_mapping()
 * compares against range.end + 1, i.e. treats the end as inclusive.
 * NOTE(review): the assignment of mhp_range.start and the return statement
 * are not visible in this listing.
 */
529 struct range arch_get_mappable_range(void)
531 struct range mhp_range;
534 mhp_range.end = VMEM_MAX_PHYS - 1;
/*
 * Add @size bytes starting at physical address @start to the 1:1 mapping.
 * The request is first validated: it must lie inside the range returned by
 * arch_get_mappable_range() and start + size must not wrap around.
 * The mapping itself is created by vmem_add_range() under vmem_mutex.
 * NOTE(review): the action on a failed validation, the condition in front
 * of vmem_remove_range() (presumably rollback when ret != 0) and the
 * return statement are not visible in this listing.
 */
538 int vmem_add_mapping(unsigned long start, unsigned long size)
540 struct range range = arch_get_mappable_range();
543 if (start < range.start ||
544 start + size > range.end + 1 ||
545 start + size < start)
548 mutex_lock(&vmem_mutex);
549 ret = vmem_add_range(start, size);
551 vmem_remove_range(start, size);
552 mutex_unlock(&vmem_mutex);
557 * Allocate new or return existing page-table entry, but do not map it
558 * to any physical address. If missing, allocate segment- and region-
559 * table entries along. Meeting a large segment- or region-table entry
560 * while traversing is an error, since the function is expected to be
561 * called against virtual regions reserved for 4KB mappings only.
/*
 * Walk the kernel page tables for @addr from the PGD level down to the PTE
 * level. At each level an empty entry is filled with a newly allocated
 * lower-level table (vmem_crst_alloc() for region/segment tables,
 * vmem_pte_alloc() for the PTE table). A large PUD or PMD entry triggers a
 * one-time warning.
 * NOTE(review): the use of @alloc, allocation-failure handling, the action
 * taken after the large-entry warnings and the return statement are not
 * visible in this listing.
 */
563 pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
572 pgd = pgd_offset_k(addr);
573 if (pgd_none(*pgd)) {
576 p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
579 pgd_populate(&init_mm, pgd, p4d);
581 p4d = p4d_offset(pgd, addr);
582 if (p4d_none(*p4d)) {
585 pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
588 p4d_populate(&init_mm, p4d, pud);
590 pud = pud_offset(p4d, addr);
591 if (pud_none(*pud)) {
594 pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
597 pud_populate(&init_mm, pud, pmd);
598 } else if (WARN_ON_ONCE(pud_large(*pud))) {
601 pmd = pmd_offset(pud, addr);
602 if (pmd_none(*pmd)) {
605 pte = vmem_pte_alloc();
608 pmd_populate(&init_mm, pmd, pte);
609 } else if (WARN_ON_ONCE(pmd_large(*pmd))) {
612 ptep = pte_offset_kernel(pmd, addr);
/*
 * Map the page at virtual address @addr to physical address @phys with
 * protection @prot. @addr is checked for PAGE_SIZE alignment, the PTE is
 * looked up (and, depending on @alloc, presumably allocated) via
 * vmem_get_alloc_pte(), the old entry is invalidated with __ptep_ipte()
 * using IPTE_GLOBAL, and the new PTE value is built with mk_pte_phys().
 * This function does not take vmem_mutex itself; vmem_map_4k_page() is the
 * locked variant.
 * NOTE(review): the action on misalignment, the NULL check on ptep, the
 * store of the new PTE and the return values are not visible in this listing.
 */
617 int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc)
621 if (!IS_ALIGNED(addr, PAGE_SIZE))
623 ptep = vmem_get_alloc_pte(addr, alloc);
626 __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
627 pte = mk_pte_phys(phys, prot);
/*
 * Locked wrapper: calls __vmem_map_4k_page() with alloc == true while
 * holding vmem_mutex.
 * NOTE(review): the return statement is not visible - presumably rc.
 */
632 int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
636 mutex_lock(&vmem_mutex);
637 rc = __vmem_map_4k_page(addr, phys, prot, true);
638 mutex_unlock(&vmem_mutex);
/*
 * Remove the 4K kernel mapping at @addr while holding vmem_mutex: look up
 * the PTE with virt_to_kpte(), invalidate it with __ptep_ipte() using
 * IPTE_GLOBAL, then clear the entry with pte_clear().
 */
642 void vmem_unmap_4k_page(unsigned long addr)
646 mutex_lock(&vmem_mutex);
647 ptep = virt_to_kpte(addr);
648 __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
649 pte_clear(&init_mm, addr, ptep);
650 mutex_unlock(&vmem_mutex);
/*
 * Comparison callback passed to sort() in vmem_map_init(): compares two
 * struct memblock_region objects by their 'base' field.
 * NOTE(review): the return statements are not visible in this listing;
 * presumably negative / positive / zero for less / greater / equal.
 */
653 static int __init memblock_region_cmp(const void *a, const void *b)
655 const struct memblock_region *r1 = a;
656 const struct memblock_region *r2 = b;
658 if (r1->base < r2->base)
660 if (r1->base > r2->base)
/*
 * Swap callback passed to sort() in vmem_map_init(): exchanges two
 * struct memblock_region objects by value. @size is not used.
 */
665 static void __init memblock_region_swap(void *a, void *b, int size)
667 swap(*(struct memblock_region *)a, *(struct memblock_region *)b);
/*
 * Result of kasan_mem_to_shadow() for address x, as an unsigned long.
 * The argument is parenthesized so that the pointer cast applies to the
 * whole expression passed in, not only to its first operand.
 */
671 #define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)(x)))
/*
 * Apply set_memory_rwnx() to the KASAN shadow of [start, end): both bounds
 * are translated with __sha(), the start is rounded down and the end
 * rounded up to a page boundary, and the page count between them is passed on.
 * Returns the result of set_memory_rwnx().
 */
673 static inline int set_memory_kasan(unsigned long start, unsigned long end)
675 start = PAGE_ALIGN_DOWN(__sha(start));
676 end = PAGE_ALIGN(__sha(end));
677 return set_memory_rwnx(start, (end - start) >> PAGE_SHIFT);
682 * map whole physical memory to virtual memory (identity mapping)
683 * we reserve enough space in the vmalloc area for vmemmap to hotplug
684 * additional memory segments.
/*
 * Boot-time setup of the protection attributes of the kernel mapping.
 * Steps visible below:
 *  1. Describe the regions that must not simply become RW+NX (an entry
 *     sized sizeof(struct lowcore), kernel text, init text, amode31 text)
 *     in memory_rwx_regions and sort them by base address.
 *  2. Apply set_memory_rwnx() to all memblock memory outside those regions.
 *  3. Apply set_memory_kasan() to every memory range.
 *  4. Apply set_memory_rox() to kernel text, init text and amode31 text and
 *     set_memory_ro() to [_etext, __end_rodata).
 *  5. Adjust lowcore executability, optionally split the identity mapping
 *     into 4K pages when debug_pagealloc is enabled, and log the size of
 *     the write-protected area.
 * NOTE(review): some lines (e.g. the .base of the first region, the
 * declaration of 'i', any #ifdef around the KASAN loop, and the statement
 * directly under the cpu_has_bear test) are not visible in this listing.
 */
686 void __init vmem_map_init(void)
688 struct memblock_region memory_rwx_regions[] = {
691 .size = sizeof(struct lowcore),
692 .flags = MEMBLOCK_NONE,
698 .base = __pa(_stext),
699 .size = _etext - _stext,
700 .flags = MEMBLOCK_NONE,
706 .base = __pa(_sinittext),
707 .size = _einittext - _sinittext,
708 .flags = MEMBLOCK_NONE,
714 .base = __stext_amode31,
715 .size = __etext_amode31 - __stext_amode31,
716 .flags = MEMBLOCK_NONE,
/* Wrap the array above so it can serve as exclude type for __for_each_mem_range(). */
722 struct memblock_type memory_rwx = {
723 .regions = memory_rwx_regions,
724 .cnt = ARRAY_SIZE(memory_rwx_regions),
725 .max = ARRAY_SIZE(memory_rwx_regions),
727 phys_addr_t base, end;
731 * Set RW+NX attribute on all memory, except regions enumerated with
732 * memory_rwx exclude type. These regions need different attributes,
733 * which are enforced afterwards.
735 * __for_each_mem_range() iterate and exclude types should be sorted.
736 * The relative location of _stext and _sinittext is hardcoded in the
737 * linker script. However a location of __stext_amode31 and the kernel
738 * image itself are chosen dynamically. Thus, sort the exclude type.
740 sort(&memory_rwx_regions,
741 ARRAY_SIZE(memory_rwx_regions), sizeof(memory_rwx_regions[0]),
742 memblock_region_cmp, memblock_region_swap);
743 __for_each_mem_range(i, &memblock.memory, &memory_rwx,
744 NUMA_NO_NODE, MEMBLOCK_NONE, &base, &end, NULL) {
745 set_memory_rwnx((unsigned long)__va(base),
746 (end - base) >> PAGE_SHIFT);
750 for_each_mem_range(i, &base, &end)
751 set_memory_kasan(base, end);
753 set_memory_rox((unsigned long)_stext,
754 (unsigned long)(_etext - _stext) >> PAGE_SHIFT);
755 set_memory_ro((unsigned long)_etext,
756 (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT);
757 set_memory_rox((unsigned long)_sinittext,
758 (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT);
759 set_memory_rox(__stext_amode31,
760 (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT);
762 /* lowcore must be executable for LPSWE */
763 if (static_key_enabled(&cpu_has_bear))
765 set_memory_nx(PAGE_SIZE, 1);
766 if (debug_pagealloc_enabled())
767 set_memory_4k(0, ident_map_size >> PAGE_SHIFT);
/* Size is reported in KiB: bytes between _stext and __end_rodata, shifted right by 10. */
769 pr_info("Write protected kernel read-only data: %luk\n",
770 (unsigned long)(__end_rodata - _stext) >> 10);