ARM: percpu: add SMP_ON_UP support
author		Ard Biesheuvel <ardb@kernel.org>
		Thu, 25 Nov 2021 09:26:44 +0000 (10:26 +0100)
committer	Ard Biesheuvel <ardb@kernel.org>
		Mon, 6 Dec 2021 11:49:17 +0000 (12:49 +0100)
Permit the use of the TPIDRPRW system register for carrying the per-CPU
offset in generic SMP configurations that also target non-SMP capable
ARMv6 cores. This uses the SMP_ON_UP code patching framework to turn all
TPIDRPRW accesses into reads/writes of entry #0 in the __per_cpu_offset
array.
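
As an illustration only, a rough C-level sketch of what the patched accessor
amounts to (the helper name below is hypothetical; the real code rewrites the
MRC instruction at boot via the .alt.smp.init fixups rather than branching at
run time):

    /* sketch, not the actual implementation */
    static inline unsigned long my_cpu_offset_sketch(void)
    {
            extern unsigned long __per_cpu_offset[];
            extern unsigned int smp_on_up;  /* patched to 0 on CPUs without SMP extensions */
            unsigned long off;

            if (!IS_ENABLED(CONFIG_CPU_V6) || smp_on_up) {
                    /* SMP-capable CPU: the offset lives in TPIDRPRW */
                    asm("mrc p15, 0, %0, c13, c0, 4" : "=r" (off));
            } else {
                    /* v6 without SMP extensions: use entry #0 of the array */
                    off = __per_cpu_offset[0];
            }
            return off;
    }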

While at it, switch over some existing direct TPIDRPRW accesses in asm
code to invocations of a new helper that is patched in the same way when
necessary.
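
The irq_handler change in entry-armv.S (further down in this diff) is
representative of these conversions; the open-coded base-plus-offset sequence

    mov_l   r2, irq_stack_ptr       @ Take base address
    mrc     p15, 0, r3, c13, c0, 4  @ Get CPU offset
    ldr     sp, [r2, r3]            @ Load SP from per-CPU var

becomes a single invocation of the new helper:

    ldr_this_cpu sp, irq_stack_ptr, r2, r3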

Note that CPU_V6+SMP without SMP_ON_UP results in a kernel that does not
boot on v6 CPUs without SMP extensions, so add this dependency to
Kconfig as well.

Acked-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Nicolas Pitre <nico@fluxnic.net>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Tested-by: Marc Zyngier <maz@kernel.org>
Tested-by: Vladimir Murzin <vladimir.murzin@arm.com> # ARMv7M
arch/arm/include/asm/assembler.h
arch/arm/include/asm/insn.h
arch/arm/include/asm/percpu.h
arch/arm/kernel/entry-armv.S
arch/arm/kernel/sleep.S
arch/arm/mm/Kconfig

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 2095638..f9b3dd0 100644
@@ -220,9 +220,7 @@ THUMB(      fpreg   .req    r7      )
 
        .macro  reload_current, t1:req, t2:req
 #ifdef CONFIG_CURRENT_POINTER_IN_TPIDRURO
-       adr_l   \t1, __entry_task               @ get __entry_task base address
-       mrc     p15, 0, \t2, c13, c0, 4         @ get per-CPU offset
-       ldr     \t1, [\t1, \t2]                 @ load variable
+       ldr_this_cpu \t1, __entry_task, \t1, \t2
        mcr     p15, 0, \t1, c13, c0, 3         @ store in TPIDRURO
 #endif
        .endm
@@ -312,6 +310,26 @@ THUMB(     fpreg   .req    r7      )
 #define ALT_UP_B(label) b label
 #endif
 
+       /*
+        * this_cpu_offset - load the per-CPU offset of this CPU into
+        *                   register 'rd'
+        */
+       .macro          this_cpu_offset, rd:req
+#ifdef CONFIG_SMP
+ALT_SMP(mrc            p15, 0, \rd, c13, c0, 4)
+#ifdef CONFIG_CPU_V6
+ALT_UP_B(.L1_\@)
+.L0_\@:
+       .subsection     1
+.L1_\@: ldr_va         \rd, __per_cpu_offset
+       b               .L0_\@
+       .previous
+#endif
+#else
+       mov             \rd, #0
+#endif
+       .endm
+
 /*
  * Instruction barrier
  */
@@ -649,6 +667,41 @@ THUMB(     orr     \reg , \reg , #PSR_T_BIT        )
        .endm
 
        /*
+        * ldr_this_cpu_armv6 - Load a 32-bit word from the per-CPU variable 'sym',
+        *                      without using a temp register. Supported in ARM mode
+        *                      only.
+        */
+       .macro          ldr_this_cpu_armv6, rd:req, sym:req
+       this_cpu_offset \rd
+       .globl          \sym
+       .reloc          .L0_\@, R_ARM_ALU_PC_G0_NC, \sym
+       .reloc          .L1_\@, R_ARM_ALU_PC_G1_NC, \sym
+       .reloc          .L2_\@, R_ARM_LDR_PC_G2, \sym
+       add             \rd, \rd, pc
+.L0_\@: sub            \rd, \rd, #4
+.L1_\@: sub            \rd, \rd, #0
+.L2_\@: ldr            \rd, [\rd, #4]
+       .endm
+
+       /*
+        * ldr_this_cpu - Load a 32-bit word from the per-CPU variable 'sym'
+        *                into register 'rd', which may be the stack pointer,
+        *                using 't1' and 't2' as general temp registers. These
+        *                are permitted to overlap with 'rd' if != sp
+        */
+       .macro          ldr_this_cpu, rd:req, sym:req, t1:req, t2:req
+#if __LINUX_ARM_ARCH__ >= 7 || \
+    (defined(MODULE) && defined(CONFIG_ARM_MODULE_PLTS)) || \
+    (defined(CONFIG_LD_IS_LLD) && CONFIG_LLD_VERSION < 140000)
+       this_cpu_offset \t1
+       mov_l           \t2, \sym
+       ldr             \rd, [\t1, \t2]
+#else
+       ldr_this_cpu_armv6 \rd, \sym
+#endif
+       .endm
+
+       /*
         * rev_l - byte-swap a 32-bit value
         *
         * @val: source/destination register
diff --git a/arch/arm/include/asm/insn.h b/arch/arm/include/asm/insn.h
index 5475cbf..a160ed3 100644
@@ -2,6 +2,30 @@
 #ifndef __ASM_ARM_INSN_H
 #define __ASM_ARM_INSN_H
 
+#include <linux/types.h>
+
+/*
+ * Avoid a literal load by emitting a sequence of ADD/LDR instructions with the
+ * appropriate relocations. The combined sequence has a range of -/+ 256 MiB,
+ * which should be sufficient for the core kernel as well as modules loaded
+ * into the module region. (Not supported by LLD before release 14)
+ */
+#if !(defined(MODULE) && defined(CONFIG_ARM_MODULE_PLTS)) && \
+    !(defined(CONFIG_LD_IS_LLD) && CONFIG_LLD_VERSION < 140000)
+#define LOAD_SYM_ARMV6(reg, sym)                                       \
+       "       .globl  " #sym "                                \n\t"   \
+       "       .reloc  10f, R_ARM_ALU_PC_G0_NC, " #sym "       \n\t"   \
+       "       .reloc  11f, R_ARM_ALU_PC_G1_NC, " #sym "       \n\t"   \
+       "       .reloc  12f, R_ARM_LDR_PC_G2, " #sym "          \n\t"   \
+       "10:    sub     " #reg ", pc, #8                        \n\t"   \
+       "11:    sub     " #reg ", " #reg ", #4                  \n\t"   \
+       "12:    ldr     " #reg ", [" #reg ", #0]                \n\t"
+#else
+#define LOAD_SYM_ARMV6(reg, sym)                                       \
+       "       ldr     " #reg ", =" #sym "                     \n\t"   \
+       "       ldr     " #reg ", [" #reg "]                    \n\t"
+#endif
+
 static inline unsigned long
 arm_gen_nop(void)
 {
diff --git a/arch/arm/include/asm/percpu.h b/arch/arm/include/asm/percpu.h
index e2fcb3c..a4a0d38 100644
@@ -5,15 +5,22 @@
 #ifndef _ASM_ARM_PERCPU_H_
 #define _ASM_ARM_PERCPU_H_
 
+#include <asm/insn.h>
+
 register unsigned long current_stack_pointer asm ("sp");
 
 /*
  * Same as asm-generic/percpu.h, except that we store the per cpu offset
  * in the TPIDRPRW. TPIDRPRW only exists on V6K and V7
  */
-#if defined(CONFIG_SMP) && !defined(CONFIG_CPU_V6)
+#ifdef CONFIG_SMP
 static inline void set_my_cpu_offset(unsigned long off)
 {
+       extern unsigned int smp_on_up;
+
+       if (IS_ENABLED(CONFIG_CPU_V6) && !smp_on_up)
+               return;
+
        /* Set TPIDRPRW */
        asm volatile("mcr p15, 0, %0, c13, c0, 4" : : "r" (off) : "memory");
 }
@@ -27,8 +34,20 @@ static inline unsigned long __my_cpu_offset(void)
         * We want to allow caching the value, so avoid using volatile and
         * instead use a fake stack read to hazard against barrier().
         */
-       asm("mrc p15, 0, %0, c13, c0, 4" : "=r" (off)
-               : "Q" (*(const unsigned long *)current_stack_pointer));
+       asm("0: mrc p15, 0, %0, c13, c0, 4                      \n\t"
+#ifdef CONFIG_CPU_V6
+           "1:                                                 \n\t"
+           "   .subsection 1                                   \n\t"
+           "2: " LOAD_SYM_ARMV6(%0, __per_cpu_offset) "        \n\t"
+           "   b       1b                                      \n\t"
+           "   .previous                                       \n\t"
+           "   .pushsection \".alt.smp.init\", \"a\"           \n\t"
+           "   .long   0b - .                                  \n\t"
+           "   b       . + (2b - 0b)                           \n\t"
+           "   .popsection                                     \n\t"
+#endif
+            : "=r" (off)
+            : "Q" (*(const unsigned long *)current_stack_pointer));
 
        return off;
 }
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 7f7ac96..43d917f 100644
        .macro  irq_handler, from_user:req
        mov     r0, sp
 #ifdef CONFIG_IRQSTACKS
-       mov_l   r2, irq_stack_ptr       @ Take base address
-       mrc     p15, 0, r3, c13, c0, 4  @ Get CPU offset
 #ifdef CONFIG_UNWINDER_ARM
        mov     fpreg, sp               @ Preserve original SP
 #else
        mov     r8, fp                  @ Preserve original FP
        mov     r9, sp                  @ Preserve original SP
 #endif
-       ldr     sp, [r2, r3]            @ Load SP from per-CPU var
+       ldr_this_cpu sp, irq_stack_ptr, r2, r3
+
        .if     \from_user == 0
 UNWIND(        .setfp  fpreg, sp               )
        @
@@ -876,16 +875,7 @@ __bad_stack:
 THUMB( bx      pc              )
 THUMB( nop                     )
 THUMB( .arm                    )
-       mrc     p15, 0, ip, c13, c0, 4          @ Get per-CPU offset
-
-       .globl  overflow_stack_ptr
-       .reloc  0f, R_ARM_ALU_PC_G0_NC, overflow_stack_ptr
-       .reloc  1f, R_ARM_ALU_PC_G1_NC, overflow_stack_ptr
-       .reloc  2f, R_ARM_LDR_PC_G2, overflow_stack_ptr
-       add     ip, ip, pc
-0:     add     ip, ip, #-4
-1:     add     ip, ip, #0
-2:     ldr     ip, [ip, #4]
+       ldr_this_cpu_armv6 ip, overflow_stack_ptr
 
        str     sp, [ip, #-4]!                  @ Preserve original SP value
        mov     sp, ip                          @ Switch to overflow stack
diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S
index 803b51e..f909baf 100644
@@ -71,9 +71,7 @@ ENTRY(__cpu_suspend)
        @ Run the suspend code from the overflow stack so we don't have to rely
        @ on vmalloc-to-phys conversions anywhere in the arch suspend code.
        @ The original SP value captured in R5 will be restored on the way out.
-       mov_l   r6, overflow_stack_ptr  @ Base pointer
-       mrc     p15, 0, r7, c13, c0, 4  @ Get per-CPU offset
-       ldr     sp, [r6, r7]            @ Address of this CPU's overflow stack
+       ldr_this_cpu sp, overflow_stack_ptr, r6, r7
 #endif
        add     r4, r4, #12             @ Space for pgd, virt sp, phys resume fn
        sub     sp, sp, r4              @ allocate CPU state on stack
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 58afba3..a91ff22 100644
@@ -386,6 +386,7 @@ config CPU_V6
        select CPU_PABRT_V6
        select CPU_THUMB_CAPABLE
        select CPU_TLB_V6 if MMU
+       select SMP_ON_UP if SMP
 
 # ARMv6k
 config CPU_V6K