x86/asm/entry/64: Move opportunistic sysret code to syscall code path
author    Denys Vlasenko <dvlasenk@redhat.com>
Thu, 2 Apr 2015 16:46:59 +0000 (18:46 +0200)
committer Ingo Molnar <mingo@kernel.org>
Wed, 8 Apr 2015 07:02:12 +0000 (09:02 +0200)
This change does two things:

Copy-pastes the "retint_swapgs:" code into the syscall handling code;
the copy sits under the "syscall_return:" label. The code is unchanged
apart from some label renames.

Removes the "opportunistic sysret" code from the "retint_swapgs:" code
block, since it is no longer reached on syscall return. This in fact
removes most of the code in question.

   text    data     bss     dec     hex filename
  12530       0       0   12530    30f2 entry_64.o.before
  12562       0       0   12562    3112 entry_64.o

Run-tested.

Acked-and-Tested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427993219-7291-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/kernel/entry_64.S

index 65485b3..e4c8103 100644
@@ -354,8 +354,8 @@ GLOBAL(int_with_check)
        movl TI_flags(%rcx),%edx
        andl %edi,%edx
        jnz   int_careful
-       andl    $~TS_COMPAT,TI_status(%rcx)
-       jmp   retint_swapgs
+       andl    $~TS_COMPAT,TI_status(%rcx)
+       jmp     syscall_return
 
        /* Either reschedule or signal or syscall exit tracking needed. */
        /* First do a reschedule test. */
@@ -399,9 +399,86 @@ int_restore_rest:
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        jmp int_with_check
+
+syscall_return:
+       /* The IRETQ could re-enable interrupts: */
+       DISABLE_INTERRUPTS(CLBR_ANY)
+       TRACE_IRQS_IRETQ
+
+       /*
+        * Try to use SYSRET instead of IRET if we're returning to
+        * a completely clean 64-bit userspace context.
+        */
+       movq RCX(%rsp),%rcx
+       cmpq %rcx,RIP(%rsp)             /* RCX == RIP */
+       jne opportunistic_sysret_failed
+
+       /*
+        * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+        * in kernel space.  This essentially lets the user take over
+        * the kernel, since userspace controls RSP.  It's not worth
+        * testing for canonicalness exactly -- this check detects any
+        * of the 17 high bits set, which is true for non-canonical
+        * or kernel addresses.  (This will pessimize vsyscall=native.
+        * Big deal.)
+        *
+        * If virtual addresses ever become wider, this will need
+        * to be updated to remain correct on both old and new CPUs.
+        */
+       .ifne __VIRTUAL_MASK_SHIFT - 47
+       .error "virtual address width changed -- SYSRET checks need update"
+       .endif
+       shr $__VIRTUAL_MASK_SHIFT, %rcx
+       jnz opportunistic_sysret_failed
+
+       cmpq $__USER_CS,CS(%rsp)        /* CS must match SYSRET */
+       jne opportunistic_sysret_failed
+
+       movq R11(%rsp),%r11
+       cmpq %r11,EFLAGS(%rsp)          /* R11 == RFLAGS */
+       jne opportunistic_sysret_failed
+
+       /*
+        * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
+        * restoring TF results in a trap from userspace immediately after
+        * SYSRET.  This would cause an infinite loop whenever #DB happens
+        * with register state that satisfies the opportunistic SYSRET
+        * conditions.  For example, single-stepping this user code:
+        *
+        *           movq $stuck_here,%rcx
+        *           pushfq
+        *           popq %r11
+        *   stuck_here:
+        *
+        * would never get past 'stuck_here'.
+        */
+       testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+       jnz opportunistic_sysret_failed
+
+       /* nothing to check for RSP */
+
+       cmpq $__USER_DS,SS(%rsp)        /* SS must match SYSRET */
+       jne opportunistic_sysret_failed
+
+       /*
+        * We win!  This label is here just for ease of understanding
+        * perf profiles.  Nothing jumps here.
+        */
+syscall_return_via_sysret:
+       CFI_REMEMBER_STATE
+       /* r11 is already restored (see code above) */
+       RESTORE_C_REGS_EXCEPT_R11
+       movq RSP(%rsp),%rsp
+       USERGS_SYSRET64
+       CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
+       SWAPGS
+       jmp     restore_c_regs_and_iret
        CFI_ENDPROC
 END(system_call)
 
+
        .macro FORK_LIKE func
 ENTRY(stub_\func)
        CFI_STARTPROC
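
[Editor's note] The non-canonical test added in the hunk above boils down to a
single shift: with 48-bit virtual addresses (__VIRTUAL_MASK_SHIFT == 47), any
of the 17 high bits being set means the address is either non-canonical or a
kernel address. A minimal C sketch of that arithmetic, with hypothetical sample
addresses (an illustration of the check, not kernel code):

        #include <stdint.h>
        #include <stdio.h>

        #define VIRTUAL_MASK_SHIFT 47  /* stands in for __VIRTUAL_MASK_SHIFT (48-bit VA) */

        /*
         * Same test as "shr $__VIRTUAL_MASK_SHIFT, %rcx; jnz ...":
         * nonzero iff any of bits 63..47 is set, i.e. the address is
         * non-canonical or points into kernel space.
         */
        static int rcx_fails_sysret_check(uint64_t rcx)
        {
                return (rcx >> VIRTUAL_MASK_SHIFT) != 0;
        }

        int main(void)
        {
                printf("%d\n", rcx_fails_sysret_check(0x00007fffffffe000ULL)); /* user RIP      -> 0 */
                printf("%d\n", rcx_fails_sysret_check(0x0000800000000000ULL)); /* non-canonical -> 1 */
                printf("%d\n", rcx_fails_sysret_check(0xffffffff81000000ULL)); /* kernel RIP    -> 1 */
                return 0;
        }
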
@@ -673,76 +750,8 @@ retint_swapgs:             /* return to user-space */
        DISABLE_INTERRUPTS(CLBR_ANY)
        TRACE_IRQS_IRETQ
 
-       /*
-        * Try to use SYSRET instead of IRET if we're returning to
-        * a completely clean 64-bit userspace context.
-        */
-       movq RCX(%rsp),%rcx
-       cmpq %rcx,RIP(%rsp)             /* RCX == RIP */
-       jne opportunistic_sysret_failed
-
-       /*
-        * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
-        * in kernel space.  This essentially lets the user take over
-        * the kernel, since userspace controls RSP.  It's not worth
-        * testing for canonicalness exactly -- this check detects any
-        * of the 17 high bits set, which is true for non-canonical
-        * or kernel addresses.  (This will pessimize vsyscall=native.
-        * Big deal.)
-        *
-        * If virtual addresses ever become wider, this will need
-        * to be updated to remain correct on both old and new CPUs.
-        */
-       .ifne __VIRTUAL_MASK_SHIFT - 47
-       .error "virtual address width changed -- sysret checks need update"
-       .endif
-       shr $__VIRTUAL_MASK_SHIFT, %rcx
-       jnz opportunistic_sysret_failed
-
-       cmpq $__USER_CS,CS(%rsp)        /* CS must match SYSRET */
-       jne opportunistic_sysret_failed
-
-       movq R11(%rsp),%r11
-       cmpq %r11,EFLAGS(%rsp)          /* R11 == RFLAGS */
-       jne opportunistic_sysret_failed
-
-       /*
-        * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
-        * restoring TF results in a trap from userspace immediately after
-        * SYSRET.  This would cause an infinite loop whenever #DB happens
-        * with register state that satisfies the opportunistic SYSRET
-        * conditions.  For example, single-stepping this user code:
-        *
-        *           movq $stuck_here,%rcx
-        *           pushfq
-        *           popq %r11
-        *   stuck_here:
-        *
-        * would never get past 'stuck_here'.
-        */
-       testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
-       jnz opportunistic_sysret_failed
-
-       /* nothing to check for RSP */
-
-       cmpq $__USER_DS,SS(%rsp)        /* SS must match SYSRET */
-       jne opportunistic_sysret_failed
-
-       /*
-        * We win!  This label is here just for ease of understanding
-        * perf profiles.  Nothing jumps here.
-        */
-irq_return_via_sysret:
-       CFI_REMEMBER_STATE
-       /* r11 is already restored (see code above) */
-       RESTORE_C_REGS_EXCEPT_R11
-       movq RSP(%rsp),%rsp
-       USERGS_SYSRET64
-       CFI_RESTORE_STATE
-
-opportunistic_sysret_failed:
        SWAPGS
-       jmp restore_args
+       jmp     restore_c_regs_and_iret
 
 /* Returning to kernel space */
 retint_kernel:
@@ -761,7 +770,12 @@ retint_kernel:
         * The iretq could re-enable interrupts:
         */
        TRACE_IRQS_IRETQ
-restore_args:
+
+/*
+ * At this label, code paths which return to kernel and to user,
+ * which come from interrupts/exception and from syscalls, merge.
+ */
+restore_c_regs_and_iret:
        RESTORE_C_REGS
        REMOVE_PT_GPREGS_FROM_STACK 8
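
[Editor's note] Taken together, the checks the new "syscall_return:" path makes
before choosing SYSRET over IRET amount to a predicate over the saved user
frame. A hedged C sketch follows; the struct layout, the selector values
(USER_CS/USER_DS) and the flag constants are assumptions picked to match
typical x86-64 values, not the kernel's own definitions:

        #include <stdbool.h>
        #include <stdint.h>

        /* Simplified stand-in for the pt_regs slots the assembly reads. */
        struct saved_frame {
                uint64_t rip, rcx, r11, rflags, cs, ss;
        };

        #define USER_CS            0x33ULL      /* assumed __USER_CS selector */
        #define USER_DS            0x2bULL      /* assumed __USER_DS selector */
        #define X86_EFLAGS_TF      (1ULL << 8)
        #define X86_EFLAGS_RF      (1ULL << 16)
        #define VIRTUAL_MASK_SHIFT 47

        /* True when every opportunistic-SYSRET condition from syscall_return holds. */
        static bool can_return_via_sysret(const struct saved_frame *f)
        {
                if (f->rcx != f->rip)                         /* SYSRET takes RIP from RCX      */
                        return false;
                if (f->rcx >> VIRTUAL_MASK_SHIFT)             /* non-canonical or kernel RIP    */
                        return false;
                if (f->cs != USER_CS || f->ss != USER_DS)     /* segments must match SYSRET's   */
                        return false;
                if (f->r11 != f->rflags)                      /* SYSRET takes RFLAGS from R11   */
                        return false;
                if (f->r11 & (X86_EFLAGS_RF | X86_EFLAGS_TF)) /* RF unrestorable, TF would trap */
                        return false;
                return true;                                  /* nothing to check for RSP       */
        }
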