xorl %ebx,%ebx
1: ret
-and the whole paranoid non-paranoid macro complexity is about whether
-to suffer that RDMSR cost.
-
If we are at an interrupt or user-trap/gate-alike boundary then we can
use the faster check: the stack will be a reliable indicator of
whether SWAPGS was already done: if we see that we are a secondary
stack but before we executed SWAPGS, then the only safe way to check
for GS is the slower method: the RDMSR.
-So we try only to mark those entry methods 'paranoid' that absolutely
-need the more expensive check for the GS base - and we generate all
-'normal' entry points with the regular (faster) entry macros.
+Therefore, super-atomic entries (except NMI, which is handled separately)
+must use idtentry with paranoid=1 to handle gsbase correctly. This
+triggers three main behavior changes:
+
+ - Interrupt entry will use the slower gsbase check.
+ - Interrupt entry from user mode will switch off the IST stack.
+ - Interrupt exit to kernel mode will not attempt to reschedule.
+
+We try to only use IST entries and the paranoid entry code for vectors
+that absolutely need the more expensive check for the GS base - and we
+generate all 'normal' entry points with the regular (faster) paranoid=0
+variant.
interrupt-gate descriptor. When an interrupt occurs and the hardware
loads such a descriptor, the hardware automatically sets the new stack
pointer based on the IST value, then invokes the interrupt handler. If
-software wants to allow nested IST interrupts then the handler must
-adjust the IST values on entry to and exit from the interrupt handler.
-(This is occasionally done, e.g. for debug exceptions.)
+the interrupt came from user mode, then the interrupt handler prologue
+will switch back to the per-thread stack. If software wants to allow
+nested IST interrupts then the handler must adjust the IST values on
+entry to and exit from the interrupt handler. (This is occasionally
+done, e.g. for debug exceptions.)
Events with different IST codes (i.e. with different stacks) can be
nested. For example, a debug interrupt can safely be interrupted by an
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
.if \paranoid
+ .if \paranoid == 1
+ CFI_REMEMBER_STATE
+ testl $3, CS(%rsp) /* If coming from userspace, switch */
+ jnz 1f /* stacks. */
+ .endif
call save_paranoid
.else
call error_entry
jmp error_exit /* %ebx: no swapgs flag */
.endif
+ .if \paranoid == 1
+ CFI_RESTORE_STATE
+ /*
+ * Paranoid entry from userspace. Switch stacks and treat it
+ * as a normal entry. This means that paranoid handlers
+ * run in real process context if user_mode(regs).
+ */
+1:
+ call error_entry
+
+ DEFAULT_FRAME 0
+
+ movq %rsp,%rdi /* pt_regs pointer */
+ call sync_regs
+ movq %rax,%rsp /* switch stack */
+
+ movq %rsp,%rdi /* pt_regs pointer */
+
+ .if \has_error_code
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi,%esi /* no error code */
+ .endif
+
+ call \do_sym
+
+ jmp error_exit /* %ebx: no swapgs flag */
+ .endif
+
CFI_ENDPROC
END(\sym)
.endm
idtentry bounds do_bounds has_error_code=0
idtentry invalid_op do_invalid_op has_error_code=0
idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=1
+idtentry double_fault do_double_fault has_error_code=1 paranoid=2
idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
idtentry invalid_TSS do_invalid_TSS has_error_code=1
idtentry segment_not_present do_segment_not_present has_error_code=1
#endif
/*
- * "Paranoid" exit path from exception stack.
- * Paranoid because this is used by NMIs and cannot take
- * any kernel state for granted.
- * We don't do kernel preemption checks here, because only
- * NMI should be common and it does not enable IRQs and
- * cannot get reschedule ticks.
+ * "Paranoid" exit path from exception stack. This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
*
- * "trace" is 0 for the NMI handler only, because irq-tracing
- * is fundamentally NMI-unsafe. (we cannot change the soft and
- * hard flags at once, atomically)
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, we there's no good reason
+ * to try to handle preemption here.
*/
/* ebx: no swapgs flag */
TRACE_IRQS_OFF_DEBUG
testl %ebx,%ebx /* swapgs needed? */
jnz paranoid_restore
- testl $3,CS(%rsp)
- jnz paranoid_userspace
-paranoid_swapgs:
TRACE_IRQS_IRETQ 0
SWAPGS_UNSAFE_STACK
RESTORE_ALL 8
- jmp irq_return
+ INTERRUPT_RETURN
paranoid_restore:
TRACE_IRQS_IRETQ_DEBUG 0
RESTORE_ALL 8
- jmp irq_return
-paranoid_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz paranoid_schedule
- movl %ebx,%edx /* arg3: thread flags */
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp paranoid_userspace
-paranoid_schedule:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
- SCHEDULE_USER
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- jmp paranoid_userspace
+ INTERRUPT_RETURN
CFI_ENDPROC
END(paranoid_exit)
#ifdef CONFIG_X86_64
/*
- * Help handler running on IST stack to switch back to user stack
- * for scheduling or signal handling. The actual stack switch is done in
- * entry.S
+ * Help handler running on IST stack to switch off the IST stack if the
+ * interrupted code was in user mode. The actual stack switch is done in
+ * entry_64.S
*/
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
- struct pt_regs *regs = eregs;
- /* Did already sync */
- if (eregs == (struct pt_regs *)eregs->sp)
- ;
- /* Exception from user space */
- else if (user_mode(eregs))
- regs = task_pt_regs(current);
- /*
- * Exception from kernel and interrupts are enabled. Move to
- * kernel process stack.
- */
- else if (eregs->flags & X86_EFLAGS_IF)
- regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
- if (eregs != regs)
- *regs = *eregs;
+ struct pt_regs *regs = task_pt_regs(current);
+ *regs = *eregs;
return regs;
}
NOKPROBE_SYMBOL(sync_regs);