From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 24 Oct 2018 10:22:39 +0000 (+0100)
Subject: Merge branch 'siginfo-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebieder... 
X-Git-Tag: v5.4-rc1~2401
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ba9f6f8954afa5224e3ed60332f7b92242b7ed0f;p=platform%2Fkernel%2Flinux-rpi.git

Merge branch 'siginfo-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull siginfo updates from Eric Biederman:
 "I have been slowly sorting out siginfo and this is the culmination of
  that work.

  The primary result is that, in several ways, the signal infrastructure
  has been made less error prone. The code has been updated so that
  manually specifying SEND_SIG_FORCED is never necessary. The conversion
  to the new siginfo sending functions is now complete, which makes it
  difficult to send a signal without filling in the proper siginfo
  fields.

  At the tail end of the patchset comes the optimization of decreasing
  the size of struct siginfo in the kernel from 128 bytes to about 48
  bytes on 64bit. The fundamental observation that enables this is that,
  by definition, none of the known ways to use struct siginfo uses the
  extra bytes.

  This comes at the cost of a small user space observable difference.
  For the rare case of siginfo being injected into the kernel, only what
  can be copied into kernel_siginfo is delivered to the destination; the
  rest of the bytes are set to 0. For cases where the signal and the
  si_code are known this is safe, because we know those bytes are not
  used. For cases where the signal and si_code combination is unknown,
  the bits that won't fit into struct kernel_siginfo are tested to
  verify they are zero, and the send fails if they are not.

  I made an extensive search through userspace code and I could not find
  anything that would break because of the above change. If it turns out
  I did break something it will take just the revert of a single change
  to restore kernel_siginfo to the same size as userspace siginfo.

  Testing did reveal dependencies on preferring the signo passed to
  sigqueueinfo over si->signo, so I bit the bullet and added the
  complexity necessary to handle that case.

  Testing also revealed bad things can happen if a negative signal
  number is passed into the system calls. This is something no sane
  application will do, but something a malicious program or a fuzzer
  might do. So I have fixed the code that performs the bounds checks to
  ensure negative signal numbers are handled."

* 'siginfo-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (80 commits)
  signal: Guard against negative signal numbers in copy_siginfo_from_user32
  signal: Guard against negative signal numbers in copy_siginfo_from_user
  signal: In sigqueueinfo prefer sig not si_signo
  signal: Use a smaller struct siginfo in the kernel
  signal: Distinguish between kernel_siginfo and siginfo
  signal: Introduce copy_siginfo_from_user and use it's return value
  signal: Remove the need for __ARCH_SI_PREABLE_SIZE and SI_PAD_SIZE
  signal: Fail sigqueueinfo if si_signo != sig
  signal/sparc: Move EMT_TAGOVF into the generic siginfo.h
  signal/unicore32: Use force_sig_fault where appropriate
  signal/unicore32: Generate siginfo in ucs32_notify_die
  signal/unicore32: Use send_sig_fault where appropriate
  signal/arc: Use force_sig_fault where appropriate
  signal/arc: Push siginfo generation into unhandled_exception
  signal/ia64: Use force_sig_fault where appropriate
  signal/ia64: Use the force_sig(SIGSEGV,...) in ia64_rt_sigreturn
  signal/ia64: Use the generic force_sigsegv in setup_frame
  signal/arm/kvm: Use send_sig_mceerr
  signal/arm: Use send_sig_fault where appropriate
  signal/arm: Use force_sig_fault where appropriate
  ...
---

ba9f6f8954afa5224e3ed60332f7b92242b7ed0f
diff --cc arch/arm64/kernel/traps.c
index 4066da7f,856b32a..5f4d9ac
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@@ -353,12 -366,6 +368,9 @@@ void force_signal_inject(int signal, in
  	const char *desc;
  	struct pt_regs *regs = current_pt_regs();
  
 +	if (WARN_ON(!user_mode(regs)))
 +		return;
 +
- 	clear_siginfo(&info);
- 
  	switch (signal) {
  	case SIGILL:
  		desc = "undefined instruction";
diff --cc arch/arm64/mm/fault.c
index d0e638e,f0ccb20..7d9571f
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@@ -789,17 -767,11 +773,11 @@@ asmlinkage void __exception do_sp_pc_ab
  	if (user_mode(regs)) {
  		if (instruction_pointer(regs) > TASK_SIZE)
  			arm64_apply_bp_hardening();
 -		local_irq_enable();
 +		local_daif_restore(DAIF_PROCCTX);
  	}
  
- 	clear_siginfo(&info);
- 	info.si_signo = SIGBUS;
- 	info.si_errno = 0;
- 	info.si_code  = BUS_ADRALN;
- 	info.si_addr  = (void __user *)addr;
- 	arm64_notify_die("SP/PC alignment exception", regs, &info, esr);
+ 	arm64_notify_die("SP/PC alignment exception", regs,
+ 			 SIGBUS, BUS_ADRALN, (void __user *)addr, esr);
  }
  
  int __init early_brk64(unsigned long addr, unsigned int esr,
diff --cc arch/x86/kernel/traps.c
index 5bd0a99,34a327f..8f6dcd8
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@@ -202,11 -202,8 +202,8 @@@ do_trap_no_signal(struct task_struct *t
  						error_code, trapnr))
  				return 0;
  		}
- 		return -1;
- 	}
- 
- 	if (!user_mode(regs)) {
+ 	} else if (!user_mode(regs)) {
 -		if (fixup_exception(regs, trapnr))
 +		if (fixup_exception(regs, trapnr, error_code, 0))
  			return 0;
  
  		tsk->thread.error_code = error_code;
@@@ -560,19 -540,9 +544,19 @@@ do_general_protection(struct pt_regs *r
  
  		tsk->thread.error_code = error_code;
  		tsk->thread.trap_nr = X86_TRAP_GP;
 +
 +		/*
 +		 * To be potentially processing a kprobe fault and to
 +		 * trust the result from kprobe_running(), we have to
 +		 * be non-preemptible.
 +		 */
 +		if (!preemptible() && kprobe_running() &&
 +		    kprobe_fault_handler(regs, X86_TRAP_GP))
 +			return;
 +
- 		if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
+ 		if (notify_die(DIE_GPF, desc, regs, error_code,
  			       X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
- 			die("general protection fault", regs, error_code);
+ 			die(desc, regs, error_code);
  		return;
  	}
  
diff --cc arch/x86/mm/fault.c
index 2b1519b,8d77700..b24eb4e
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@@ -1237,74 -1187,41 +1190,73 @@@ do_kern_addr_fault(struct pt_regs *regs
  	 * only copy the information from the master page table,
  	 * nothing more.
  	 *
 -	 * This verifies that the fault happens in kernel space
 -	 * (error_code & 4) == 0, and that the fault was not a
 -	 * protection error (error_code & 9) == 0.
 +	 * Before doing this on-demand faulting, ensure that the
 +	 * fault is not any of the following:
 +	 * 1. A fault on a PTE with a reserved bit set.
 +	 * 2. A fault caused by a user-mode access.  (Do not demand-
 +	 *    fault kernel memory due to user-mode accesses).
 +	 * 3. A fault caused by a page-level protection violation.
 +	 *    (A demand fault would be on a non-present page which
 +	 *     would have X86_PF_PROT==0).
  	 */
 -	if (unlikely(fault_in_kernel_space(address))) {
 -		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
 -			if (vmalloc_fault(address) >= 0)
 -				return;
 -		}
 -
 -		/* Can handle a stale RO->RW TLB: */
 -		if (spurious_fault(error_code, address))
 +	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
 +		if (vmalloc_fault(address) >= 0)
  			return;
 +	}
  
 -		/* kprobes don't want to hook the spurious faults: */
 -		if (kprobes_fault(regs))
 -			return;
 -		/*
 -		 * Don't take the mm semaphore here. If we fixup a prefetch
 -		 * fault we could otherwise deadlock:
 -		 */
 -		bad_area_nosemaphore(regs, error_code, address);
 +	/* Was the fault spurious, caused by lazy TLB invalidation? */
 +	if (spurious_kernel_fault(hw_error_code, address))
 +		return;
  
 +	/* kprobes don't want to hook the spurious faults: */
 +	if (kprobes_fault(regs))
  		return;
 -	}
 +
 +	/*
 +	 * Note, despite being a "bad area", there are quite a few
 +	 * acceptable reasons to get here, such as erratum fixups
 +	 * and handling kernel code that can fault, like get_user().
 +	 *
 +	 * Don't take the mm semaphore here. If we fixup a prefetch
 +	 * fault we could otherwise deadlock:
 +	 */
- 	bad_area_nosemaphore(regs, hw_error_code, address, NULL);
++	bad_area_nosemaphore(regs, hw_error_code, address);
 +}
 +NOKPROBE_SYMBOL(do_kern_addr_fault);
 +
 +/* Handle faults in the user portion of the address space */
 +static inline
 +void do_user_addr_fault(struct pt_regs *regs,
 +			unsigned long hw_error_code,
 +			unsigned long address)
 +{
 +	unsigned long sw_error_code;
 +	struct vm_area_struct *vma;
 +	struct task_struct *tsk;
 +	struct mm_struct *mm;
 +	vm_fault_t fault, major = 0;
 +	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
- 	u32 pkey;
 +
 +	tsk = current;
 +	mm = tsk->mm;
  
  	/* kprobes don't want to hook the spurious faults: */
  	if (unlikely(kprobes_fault(regs)))
  		return;
  
 -	if (unlikely(error_code & X86_PF_RSVD))
 -		pgtable_bad(regs, error_code, address);
 +	/*
 +	 * Reserved bits are never expected to be set on
 +	 * entries in the user portion of the page tables.
 +	 */
 +	if (unlikely(hw_error_code & X86_PF_RSVD))
 +		pgtable_bad(regs, hw_error_code, address);
  
 -	if (unlikely(smap_violation(error_code, regs))) {
 -		bad_area_nosemaphore(regs, error_code, address);
 +	/*
 +	 * Check for invalid kernel (supervisor) access to user
 +	 * pages in the user address space.
 +	 */
 +	if (unlikely(smap_violation(hw_error_code, regs))) {
- 		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
++		bad_area_nosemaphore(regs, hw_error_code, address);
  		return;
  	}
  
@@@ -1313,7 -1230,7 +1265,7 @@@
  	 * in a region with pagefaults disabled then we must not take the fault
  	 */
  	if (unlikely(faulthandler_disabled() || !mm)) {
- 		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
 -		bad_area_nosemaphore(regs, error_code, address);
++		bad_area_nosemaphore(regs, hw_error_code, address);
  		return;
  	}
  
@@@ -1361,49 -1252,31 +1313,49 @@@
  
  	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
  
 -	if (error_code & X86_PF_WRITE)
 +	if (sw_error_code & X86_PF_WRITE)
  		flags |= FAULT_FLAG_WRITE;
 -	if (error_code & X86_PF_INSTR)
 +	if (sw_error_code & X86_PF_INSTR)
  		flags |= FAULT_FLAG_INSTRUCTION;
  
 +#ifdef CONFIG_X86_64
 +	/*
 +	 * Instruction fetch faults in the vsyscall page might need
 +	 * emulation.  The vsyscall page is at a high address
 +	 * (>PAGE_OFFSET), but is considered to be part of the user
 +	 * address space.
 +	 *
 +	 * The vsyscall page does not have a "real" VMA, so do this
 +	 * emulation before we go searching for VMAs.
 +	 */
 +	if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
 +		if (emulate_vsyscall(regs, address))
 +			return;
 +	}
 +#endif
 +
  	/*
 -	 * When running in the kernel we expect faults to occur only to
 -	 * addresses in user space.  All other faults represent errors in
 -	 * the kernel and should generate an OOPS.  Unfortunately, in the
 -	 * case of an erroneous fault occurring in a code path which already
 -	 * holds mmap_sem we will deadlock attempting to validate the fault
 -	 * against the address space.  Luckily the kernel only validly
 -	 * references user space from well defined areas of code, which are
 -	 * listed in the exceptions table.
 +	 * Kernel-mode access to the user address space should only occur
 +	 * on well-defined single instructions listed in the exception
 +	 * tables.  But, an erroneous kernel fault occurring outside one of
 +	 * those areas which also holds mmap_sem might deadlock attempting
 +	 * to validate the fault against the address space.
  	 *
 -	 * As the vast majority of faults will be valid we will only perform
 -	 * the source reference check when there is a possibility of a
 -	 * deadlock. Attempt to lock the address space, if we cannot we then
 -	 * validate the source. If this is invalid we can skip the address
 -	 * space check, thus avoiding the deadlock:
 +	 * Only do the expensive exception table search when we might be at
 +	 * risk of a deadlock.  This happens if we
 +	 * 1. Failed to acquire mmap_sem, and
 +	 * 2. The access did not originate in userspace.  Note: either the
 +	 *    hardware or earlier page fault code may set X86_PF_USER
 +	 *    in sw_error_code.
  	 */
  	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
 -		if (!(error_code & X86_PF_USER) &&
 +		if (!(sw_error_code & X86_PF_USER) &&
  		    !search_exception_tables(regs->ip)) {
 -			bad_area_nosemaphore(regs, error_code, address);
 +			/*
 +			 * Fault from code in kernel from
 +			 * which we do not expect faults.
 +			 */
- 			bad_area_nosemaphore(regs, sw_error_code, address, NULL);
++			bad_area_nosemaphore(regs, sw_error_code, address);
  			return;
  		}
  retry:
@@@ -1499,7 -1369,7 +1448,7 @@@ good_area
  
  	up_read(&mm->mmap_sem);
  	if (unlikely(fault & VM_FAULT_ERROR)) {
- 		mm_fault_error(regs, sw_error_code, address, &pkey, fault);
 -		mm_fault_error(regs, error_code, address, fault);
++		mm_fault_error(regs, sw_error_code, address, fault);
  		return;
  	}