riscv: Move kernel mapping outside of linear mapping
author Alexandre Ghiti <alex@ghiti.fr>
Sun, 11 Apr 2021 16:41:44 +0000 (12:41 -0400)
committer Palmer Dabbelt <palmerdabbelt@google.com>
Mon, 26 Apr 2021 15:25:04 +0000 (08:25 -0700)
This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at the PAGE_OFFSET address, therefore we could
use the linear mapping for the kernel mapping. But the relocated kernel base
address will be different from PAGE_OFFSET, and since, in the linear mapping,
two different virtual addresses cannot point to the same physical address,
the kernel mapping needs to lie outside the linear mapping so that we don't
have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space. BPF is
now always after the kernel, and modules use the 2GB memory range right
before the kernel, so the BPF and modules regions do not overlap. A KASLR
implementation will simply have to move the kernel within the last 2GB range
and take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
12 files changed:
arch/riscv/boot/loader.lds.S
arch/riscv/include/asm/page.h
arch/riscv/include/asm/pgtable.h
arch/riscv/include/asm/set_memory.h
arch/riscv/kernel/head.S
arch/riscv/kernel/module.c
arch/riscv/kernel/setup.c
arch/riscv/kernel/vmlinux.lds.S
arch/riscv/mm/fault.c
arch/riscv/mm/init.c
arch/riscv/mm/kasan_init.c
arch/riscv/mm/physaddr.c

index 47a5003c2e286650be7f633bf7b9c5a2250434af..62d94696a19c71366d287b7ac286606b43e8bee7 100644 (file)
@@ -1,13 +1,14 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 #include <asm/page.h>
+#include <asm/pgtable.h>
 
 OUTPUT_ARCH(riscv)
 ENTRY(_start)
 
 SECTIONS
 {
-       . = PAGE_OFFSET;
+       . = KERNEL_LINK_ADDR;
 
        .payload : {
                *(.payload)
index adc9d26f3d752dd02e434f0be04958a871459a0c..f64b61296c0ce27f2ac8743c55ce4bff66b5ef98 100644 (file)
@@ -90,15 +90,37 @@ typedef struct page *pgtable_t;
 
 #ifdef CONFIG_MMU
 extern unsigned long va_pa_offset;
+#ifdef CONFIG_64BIT
+extern unsigned long va_kernel_pa_offset;
+#endif
 extern unsigned long pfn_base;
 #define ARCH_PFN_OFFSET                (pfn_base)
 #else
 #define va_pa_offset           0
+#ifdef CONFIG_64BIT
+#define va_kernel_pa_offset    0
+#endif
 #define ARCH_PFN_OFFSET                (PAGE_OFFSET >> PAGE_SHIFT)
 #endif /* CONFIG_MMU */
 
-#define __pa_to_va_nodebug(x)  ((void *)((unsigned long) (x) + va_pa_offset))
-#define __va_to_pa_nodebug(x)  ((unsigned long)(x) - va_pa_offset)
+#ifdef CONFIG_64BIT
+extern unsigned long kernel_virt_addr;
+
+#define linear_mapping_pa_to_va(x)     ((void *)((unsigned long)(x) + va_pa_offset))
+#define kernel_mapping_pa_to_va(x)     ((void *)((unsigned long)(x) + va_kernel_pa_offset))
+#define __pa_to_va_nodebug(x)          linear_mapping_pa_to_va(x)
+
+#define linear_mapping_va_to_pa(x)     ((unsigned long)(x) - va_pa_offset)
+#define kernel_mapping_va_to_pa(x)     ((unsigned long)(x) - va_kernel_pa_offset)
+#define __va_to_pa_nodebug(x)  ({                                              \
+       unsigned long _x = x;                                                   \
+       (_x < kernel_virt_addr) ?                                               \
+               linear_mapping_va_to_pa(_x) : kernel_mapping_va_to_pa(_x);      \
+       })
+#else
+#define __pa_to_va_nodebug(x)  ((void *)((unsigned long) (x) + va_pa_offset))
+#define __va_to_pa_nodebug(x)  ((unsigned long)(x) - va_pa_offset)
+#endif
 
 #ifdef CONFIG_DEBUG_VIRTUAL
 extern phys_addr_t __virt_to_phys(unsigned long x);
index ebf817c1bdf4b3d955875670494d1828bd042d26..5afda75cc2c3379362b972e3aaea2ef9890cf5ac 100644 (file)
 
 #include <asm/pgtable-bits.h>
 
-#ifndef __ASSEMBLY__
+#ifndef CONFIG_MMU
+#define KERNEL_LINK_ADDR       PAGE_OFFSET
+#else
 
-/* Page Upper Directory not used in RISC-V */
-#include <asm-generic/pgtable-nopud.h>
-#include <asm/page.h>
-#include <asm/tlbflush.h>
-#include <linux/mm_types.h>
+#define ADDRESS_SPACE_END      (UL(-1))
 
-#ifdef CONFIG_MMU
+#ifdef CONFIG_64BIT
+/* Leave 2GB for kernel and BPF at the end of the address space */
+#define KERNEL_LINK_ADDR       (ADDRESS_SPACE_END - SZ_2G + 1)
+#else
+#define KERNEL_LINK_ADDR       PAGE_OFFSET
+#endif
 
 #define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
 #define VMALLOC_END      (PAGE_OFFSET - 1)
 #define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
 
 #define BPF_JIT_REGION_SIZE    (SZ_128M)
+#ifdef CONFIG_64BIT
+/* KASLR should leave at least 128MB for BPF after the kernel */
+#define BPF_JIT_REGION_START   PFN_ALIGN((unsigned long)&_end)
+#define BPF_JIT_REGION_END     (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
+#else
 #define BPF_JIT_REGION_START   (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
 #define BPF_JIT_REGION_END     (VMALLOC_END)
+#endif
+
+/* Modules always live before the kernel */
+#ifdef CONFIG_64BIT
+#define MODULES_VADDR  (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
+#define MODULES_END    (PFN_ALIGN((unsigned long)&_start))
+#endif
 
 /*
  * Roughly size the vmemmap space to be large enough to fit enough
 #define FIXADDR_SIZE     PGDIR_SIZE
 #endif
 #define FIXADDR_START    (FIXADDR_TOP - FIXADDR_SIZE)
-
 #endif
 
+#ifndef __ASSEMBLY__
+
+/* Page Upper Directory not used in RISC-V */
+#include <asm-generic/pgtable-nopud.h>
+#include <asm/page.h>
+#include <asm/tlbflush.h>
+#include <linux/mm_types.h>
+
 #ifdef CONFIG_64BIT
 #include <asm/pgtable-64.h>
 #else
@@ -484,6 +506,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
 
 #define kern_addr_valid(addr)   (1) /* FIXME */
 
+extern char _start[];
 extern void *dtb_early_va;
 extern uintptr_t dtb_early_pa;
 void setup_bootmem(void);
index 6887b3d9f3712bb5f4558b5c40b91b9559ad7335..a9c56776fa0e74d614e0ddcc40291c7c6ad80850 100644 (file)
@@ -17,6 +17,7 @@ int set_memory_x(unsigned long addr, int numpages);
 int set_memory_nx(unsigned long addr, int numpages);
 int set_memory_rw_nx(unsigned long addr, int numpages);
 void protect_kernel_text_data(void);
+void protect_kernel_linear_mapping_text_rodata(void);
 #else
 static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; }
 static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; }
index f5a9bad86e5831890441e9c7c216d406664b2540..6cb05f22e52aaf97b844b4f409e1f8d98abe5614 100644 (file)
@@ -69,7 +69,8 @@ pe_head_start:
 #ifdef CONFIG_MMU
 relocate:
        /* Relocate return address */
-       li a1, PAGE_OFFSET
+       la a1, kernel_virt_addr
+       REG_L a1, 0(a1)
        la a2, _start
        sub a1, a1, a2
        add ra, ra, a1
index 104fba889cf767cb159840954de99f985f7f0929..ce153771e5e9b6591abdd52b91c2902e035ca572 100644 (file)
@@ -408,12 +408,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 }
 
 #if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
-#define VMALLOC_MODULE_START \
-        max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
 void *module_alloc(unsigned long size)
 {
-       return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
-                                   VMALLOC_END, GFP_KERNEL,
+       return __vmalloc_node_range(size, 1, MODULES_VADDR,
+                                   MODULES_END, GFP_KERNEL,
                                    PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
                                    __builtin_return_address(0));
 }
index e85bacff1b5075ee3b704f7a65e40d102d0a9de3..d208abc0b47322ed9260d49517b3a8b903c07c90 100644 (file)
@@ -263,8 +263,13 @@ void __init setup_arch(char **cmdline_p)
 
        sbi_init();
 
-       if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+       if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) {
                protect_kernel_text_data();
+#if defined(CONFIG_64BIT) && defined(CONFIG_MMU)
+               protect_kernel_linear_mapping_text_rodata();
+#endif
+       }
+
 #ifdef CONFIG_SWIOTLB
        swiotlb_init(1);
 #endif
index 7e61bc1dc36e2ea10b833566d8ebccb5586fbb91..56677137c85b118bd0641e2b13f0914fec5cd63e 100644 (file)
@@ -4,7 +4,8 @@
  * Copyright (C) 2017 SiFive
  */
 
-#define LOAD_OFFSET PAGE_OFFSET
+#include <asm/pgtable.h>
+#define LOAD_OFFSET KERNEL_LINK_ADDR
 #include <asm/vmlinux.lds.h>
 #include <asm/page.h>
 #include <asm/cache.h>
index 8f17519208c756b7a08bf089ec53004c8a5b0c89..1b14d523a95c3358af325ade921e12107836b0a1 100644 (file)
@@ -231,6 +231,19 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
                return;
        }
 
+#ifdef CONFIG_64BIT
+       /*
+        * Modules in 64bit kernels lie in their own virtual region which is not
+        * in the vmalloc region, but dealing with page faults in this region
+        * or the vmalloc region amounts to doing the same thing: checking that
+        * the mapping exists in init_mm.pgd and updating user page table, so
+        * just use vmalloc_fault.
+        */
+       if (unlikely(addr >= MODULES_VADDR && addr < MODULES_END)) {
+               vmalloc_fault(regs, code, addr);
+               return;
+       }
+#endif
        /* Enable interrupts if they were enabled in the parent context. */
        if (likely(regs->status & SR_PIE))
                local_irq_enable();
index 7f5036fbee8c522fbfbd8e63a02fac2662174140..dc9b988e0778a0594031a517487facbf804ee114 100644 (file)
@@ -25,6 +25,9 @@
 
 #include "../kernel/head.h"
 
+unsigned long kernel_virt_addr = KERNEL_LINK_ADDR;
+EXPORT_SYMBOL(kernel_virt_addr);
+
 unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
                                                        __page_aligned_bss;
 EXPORT_SYMBOL(empty_zero_page);
@@ -88,6 +91,10 @@ static void print_vm_layout(void)
                  (unsigned long)VMALLOC_END);
        print_mlm("lowmem", (unsigned long)PAGE_OFFSET,
                  (unsigned long)high_memory);
+#ifdef CONFIG_64BIT
+       print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR,
+                 (unsigned long)ADDRESS_SPACE_END);
+#endif
 }
 #else
 static void print_vm_layout(void) { }
@@ -116,8 +123,13 @@ void __init setup_bootmem(void)
        /* The maximal physical memory size is -PAGE_OFFSET. */
        memblock_enforce_memory_limit(-PAGE_OFFSET);
 
-       /* Reserve from the start of the kernel to the end of the kernel */
-       memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
+       /*
+        * Reserve from the start of the kernel to the end of the kernel
+        * and make sure we align the reservation on PMD_SIZE since we will
+        * map the kernel in the linear mapping as read-only: we do not want
+        * any allocation to happen between _end and the next pmd aligned page.
+        */
+       memblock_reserve(vmlinux_start, (vmlinux_end - vmlinux_start + PMD_SIZE - 1) & PMD_MASK);
 
        /*
         * memblock allocator is not aware of the fact that last 4K bytes of
@@ -152,8 +164,14 @@ void __init setup_bootmem(void)
 #ifdef CONFIG_MMU
 static struct pt_alloc_ops pt_ops;
 
+/* Offset between linear mapping virtual address and kernel load address */
 unsigned long va_pa_offset;
 EXPORT_SYMBOL(va_pa_offset);
+#ifdef CONFIG_64BIT
+/* Offset between kernel mapping virtual address and kernel load address */
+unsigned long va_kernel_pa_offset;
+EXPORT_SYMBOL(va_kernel_pa_offset);
+#endif
 unsigned long pfn_base;
 EXPORT_SYMBOL(pfn_base);
 
@@ -257,7 +275,7 @@ static pmd_t *get_pmd_virt_late(phys_addr_t pa)
 
 static phys_addr_t __init alloc_pmd_early(uintptr_t va)
 {
-       BUG_ON((va - PAGE_OFFSET) >> PGDIR_SHIFT);
+       BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
 
        return (uintptr_t)early_pmd;
 }
@@ -372,17 +390,34 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
 #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing."
 #endif
 
+uintptr_t load_pa, load_sz;
+
+static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size)
+{
+       uintptr_t va, end_va;
+
+       end_va = kernel_virt_addr + load_sz;
+       for (va = kernel_virt_addr; va < end_va; va += map_size)
+               create_pgd_mapping(pgdir, va,
+                                  load_pa + (va - kernel_virt_addr),
+                                  map_size, PAGE_KERNEL_EXEC);
+}
+
 asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 {
-       uintptr_t va, pa, end_va;
-       uintptr_t load_pa = (uintptr_t)(&_start);
-       uintptr_t load_sz = (uintptr_t)(&_end) - load_pa;
+       uintptr_t pa;
        uintptr_t map_size;
 #ifndef __PAGETABLE_PMD_FOLDED
        pmd_t fix_bmap_spmd, fix_bmap_epmd;
 #endif
+       load_pa = (uintptr_t)(&_start);
+       load_sz = (uintptr_t)(&_end) - load_pa;
 
        va_pa_offset = PAGE_OFFSET - load_pa;
+#ifdef CONFIG_64BIT
+       va_kernel_pa_offset = kernel_virt_addr - load_pa;
+#endif
+
        pfn_base = PFN_DOWN(load_pa);
 
        /*
@@ -410,26 +445,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
        create_pmd_mapping(fixmap_pmd, FIXADDR_START,
                           (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
        /* Setup trampoline PGD and PMD */
-       create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
+       create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
                           (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
-       create_pmd_mapping(trampoline_pmd, PAGE_OFFSET,
+       create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
                           load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
 #else
        /* Setup trampoline PGD */
-       create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
+       create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
                           load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC);
 #endif
 
        /*
-        * Setup early PGD covering entire kernel which will allows
+        * Setup early PGD covering entire kernel which will allow
         * us to reach paging_init(). We map all memory banks later
         * in setup_vm_final() below.
         */
-       end_va = PAGE_OFFSET + load_sz;
-       for (va = PAGE_OFFSET; va < end_va; va += map_size)
-               create_pgd_mapping(early_pg_dir, va,
-                                  load_pa + (va - PAGE_OFFSET),
-                                  map_size, PAGE_KERNEL_EXEC);
+       create_kernel_page_table(early_pg_dir, map_size);
 
 #ifndef __PAGETABLE_PMD_FOLDED
        /* Setup early PMD for DTB */
@@ -444,7 +475,16 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
                           pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL);
        dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1));
 #else /* CONFIG_BUILTIN_DTB */
+#ifdef CONFIG_64BIT
+       /*
+        * __va can't be used since it would return a linear mapping address
+        * whereas dtb_early_va will be used before setup_vm_final installs
+        * the linear mapping.
+        */
+       dtb_early_va = kernel_mapping_pa_to_va(dtb_pa);
+#else
        dtb_early_va = __va(dtb_pa);
+#endif /* CONFIG_64BIT */
 #endif /* CONFIG_BUILTIN_DTB */
 #else
 #ifndef CONFIG_BUILTIN_DTB
@@ -456,7 +496,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
                           pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL);
        dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1));
 #else /* CONFIG_BUILTIN_DTB */
+#ifdef CONFIG_64BIT
+       dtb_early_va = kernel_mapping_pa_to_va(dtb_pa);
+#else
        dtb_early_va = __va(dtb_pa);
+#endif /* CONFIG_64BIT */
 #endif /* CONFIG_BUILTIN_DTB */
 #endif
        dtb_early_pa = dtb_pa;
@@ -492,6 +536,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 #endif
 }
 
+#ifdef CONFIG_64BIT
+void protect_kernel_linear_mapping_text_rodata(void)
+{
+       unsigned long text_start = (unsigned long)lm_alias(_start);
+       unsigned long init_text_start = (unsigned long)lm_alias(__init_text_begin);
+       unsigned long rodata_start = (unsigned long)lm_alias(__start_rodata);
+       unsigned long data_start = (unsigned long)lm_alias(_data);
+
+       set_memory_ro(text_start, (init_text_start - text_start) >> PAGE_SHIFT);
+       set_memory_nx(text_start, (init_text_start - text_start) >> PAGE_SHIFT);
+
+       set_memory_ro(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
+       set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
+}
+#endif
+
 static void __init setup_vm_final(void)
 {
        uintptr_t va, map_size;
@@ -513,7 +573,7 @@ static void __init setup_vm_final(void)
                           __pa_symbol(fixmap_pgd_next),
                           PGDIR_SIZE, PAGE_TABLE);
 
-       /* Map all memory banks */
+       /* Map all memory banks in the linear mapping */
        for_each_mem_range(i, &start, &end) {
                if (start >= end)
                        break;
@@ -525,10 +585,22 @@ static void __init setup_vm_final(void)
                for (pa = start; pa < end; pa += map_size) {
                        va = (uintptr_t)__va(pa);
                        create_pgd_mapping(swapper_pg_dir, va, pa,
-                                          map_size, PAGE_KERNEL_EXEC);
+                                          map_size,
+#ifdef CONFIG_64BIT
+                                          PAGE_KERNEL
+#else
+                                          PAGE_KERNEL_EXEC
+#endif
+                                       );
+
                }
        }
 
+#ifdef CONFIG_64BIT
+       /* Map the kernel */
+       create_kernel_page_table(swapper_pg_dir, PMD_SIZE);
+#endif
+
        /* Clear fixmap PTE and PMD mappings */
        clear_fixmap(FIX_PTE);
        clear_fixmap(FIX_PMD);
index 2c39f0386673e371a738428a73dea9ddfccb9e6b..28f4d52cf17e48df32993538cd0fc70677316b01 100644 (file)
@@ -171,6 +171,10 @@ void __init kasan_init(void)
        phys_addr_t _start, _end;
        u64 i;
 
+       /*
+        * Populate all kernel virtual address space with kasan_early_shadow_page
+        * except for the linear mapping and the modules/kernel/BPF mapping.
+        */
        kasan_populate_early_shadow((void *)KASAN_SHADOW_START,
                                    (void *)kasan_mem_to_shadow((void *)
                                                                VMEMMAP_END));
@@ -183,6 +187,7 @@ void __init kasan_init(void)
                        (void *)kasan_mem_to_shadow((void *)VMALLOC_START),
                        (void *)kasan_mem_to_shadow((void *)VMALLOC_END));
 
+       /* Populate the linear mapping */
        for_each_mem_range(i, &_start, &_end) {
                void *start = (void *)__va(_start);
                void *end = (void *)__va(_end);
@@ -193,6 +198,10 @@ void __init kasan_init(void)
                kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end));
        };
 
+       /* Populate kernel, BPF, modules mapping */
+       kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
+                      kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));
+
        for (i = 0; i < PTRS_PER_PTE; i++)
                set_pte(&kasan_early_shadow_pte[i],
                        mk_pte(virt_to_page(kasan_early_shadow_page),
index e8e4dcd39fede8d9726ae01892d3f71cf18d5173..35703d5ef5fd66605a5349c4fa39c7b566e19de8 100644 (file)
@@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys);
 
 phys_addr_t __phys_addr_symbol(unsigned long x)
 {
-       unsigned long kernel_start = (unsigned long)PAGE_OFFSET;
+       unsigned long kernel_start = (unsigned long)kernel_virt_addr;
        unsigned long kernel_end = (unsigned long)_end;
 
        /*