targets for exploits that can control RIP.
emulate [default] Vsyscalls turn into traps and are
- emulated reasonably safely.
+ emulated reasonably safely. The vsyscall
+ page is readable.
- native Vsyscalls are native syscall instructions.
- This is a little bit faster than trapping
- and makes a few dynamic recompilers work
- better than they would in emulation mode.
- It also makes exploits much easier to write.
+ xonly Vsyscalls turn into traps and are
+ emulated reasonably safely. The vsyscall
+ page is not readable.
none Vsyscalls don't work at all. This makes
them quite hard to use for exploits but
choice
prompt "vsyscall table for legacy applications"
depends on X86_64
- default LEGACY_VSYSCALL_EMULATE
+ default LEGACY_VSYSCALL_XONLY
help
Legacy user code that does not know how to find the vDSO expects
to be able to issue three syscalls by calling fixed addresses in
it can be used to assist security vulnerability exploitation.
This setting can be changed at boot time via the kernel command
- line parameter vsyscall=[emulate|none].
+ line parameter vsyscall=[emulate|xonly|none].
On a system with recent enough glibc (2.14 or newer) and no
static binaries, you can say None without a performance penalty
to improve security.
- If unsure, select "Emulate".
+ If unsure, select "Emulate execution only".
config LEGACY_VSYSCALL_EMULATE
- bool "Emulate"
+ bool "Full emulation"
help
- The kernel traps and emulates calls into the fixed
- vsyscall address mapping. This makes the mapping
- non-executable, but it still contains known contents,
- which could be used in certain rare security vulnerability
- exploits. This configuration is recommended when userspace
- still uses the vsyscall area.
+ The kernel traps and emulates calls into the fixed vsyscall
+ address mapping. This makes the mapping non-executable, but
+ it still contains readable known contents, which could be
+ used in certain rare security vulnerability exploits. This
+ configuration is recommended when using legacy userspace
+ that still uses vsyscalls along with legacy binary
+ instrumentation tools that require code to be readable.
+
+ An example of this type of legacy userspace is running
+ Pin on an old binary that still uses vsyscalls.
+
+ config LEGACY_VSYSCALL_XONLY
+ bool "Emulate execution only"
+ help
+ The kernel traps and emulates calls into the fixed vsyscall
+ address mapping and does not allow reads. This
+ configuration is recommended when userspace might use the
+ legacy vsyscall area but support for legacy binary
+ instrumentation of legacy code is not needed. It mitigates
+ certain uses of the vsyscall area as an ASLR-bypassing
+ buffer.
config LEGACY_VSYSCALL_NONE
bool "None"
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
-static enum { EMULATE, NONE } vsyscall_mode =
+static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
NONE;
+#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
+ XONLY;
#else
EMULATE;
#endif
if (str) {
if (!strcmp("emulate", str))
vsyscall_mode = EMULATE;
+ else if (!strcmp("xonly", str))
+ vsyscall_mode = XONLY;
else if (!strcmp("none", str))
vsyscall_mode = NONE;
else
}
}
-bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
+bool emulate_vsyscall(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address)
{
struct task_struct *tsk;
unsigned long caller;
long ret;
unsigned long orig_dx;
+ /* Write faults or kernel-privilege faults never get fixed up. */
+ if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
+ return false;
+
+ if (!(error_code & X86_PF_INSTR)) {
+ /* Failed vsyscall read */
+ if (vsyscall_mode == EMULATE)
+ return false;
+
+ /*
+ * User code tried and failed to read the vsyscall page.
+ */
+ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
+ return false;
+ }
+
/*
* No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code.
static const struct vm_operations_struct gate_vma_ops = {
.name = gate_vma_name,
};
-static struct vm_area_struct gate_vma = {
+static struct vm_area_struct gate_vma __ro_after_init = {
.vm_start = VSYSCALL_ADDR,
.vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
- if (vsyscall_mode != NONE) {
+ /*
+ * For full emulation, the page needs to exist for real. In
+ * execute-only mode, there is no PTE at all backing the vsyscall
+ * page.
+ */
+ if (vsyscall_mode == EMULATE) {
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
PAGE_KERNEL_VVAR);
set_vsyscall_pgtable_user_bits(swapper_pg_dir);
}
+ if (vsyscall_mode == XONLY)
+ gate_vma.vm_flags = VM_EXEC;
+
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
}
* Called on instruction fetch fault in vsyscall page.
* Returns true if handled.
*/
-extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
+extern bool emulate_vsyscall(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address);
#else
static inline void map_vsyscall(void) {}
-static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
+static inline bool emulate_vsyscall(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address)
{
return false;
}
* To avoid leaking information about the kernel page
* table layout, pretend that user-mode accesses to
* kernel addresses are always protection faults.
+ *
+ * NB: This means that failed vsyscalls with vsyscall=none
+ * will have the PROT bit. This doesn't leak any
+ * information and does not appear to cause any problems.
*/
if (address >= TASK_SIZE_MAX)
error_code |= X86_PF_PROT;
#ifdef CONFIG_X86_64
/*
- * Instruction fetch faults in the vsyscall page might need
- * emulation. The vsyscall page is at a high address
- * (>PAGE_OFFSET), but is considered to be part of the user
- * address space.
+ * Faults in the vsyscall page might need emulation. The
+ * vsyscall page is at a high address (>PAGE_OFFSET), but is
+ * considered to be part of the user address space.
*
* The vsyscall page does not have a "real" VMA, so do this
* emulation before we go searching for VMAs.
+ *
+ * PKRU never rejects instruction fetches, so we don't need
+ * to consider the PF_PK bit.
*/
- if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
- if (emulate_vsyscall(regs, address))
+ if (is_vsyscall_vaddr(address)) {
+ if (emulate_vsyscall(hw_error_code, regs, address))
return;
}
#endif
#include <sched.h>
#include <stdbool.h>
#include <setjmp.h>
+#include <sys/uio.h>
#ifdef __x86_64__
# define VSYS(x) (x)
}
/* vsyscalls and vDSO */
-bool should_read_vsyscall = false;
+bool vsyscall_map_r = false, vsyscall_map_x = false;
typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
-gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
+const gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
gtod_t vdso_gtod;
typedef int (*vgettime_t)(clockid_t, struct timespec *);
vgettime_t vdso_gettime;
typedef long (*time_func_t)(time_t *t);
-time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
+const time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
time_func_t vdso_time;
typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
-getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
+const getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
getcpu_t vdso_getcpu;
static void init_vdso(void)
maps = fopen("/proc/self/maps", "r");
if (!maps) {
printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n");
- should_read_vsyscall = true;
+ vsyscall_map_r = true;
return 0;
}
}
printf("\tvsyscall permissions are %c-%c\n", r, x);
- should_read_vsyscall = (r == 'r');
- if (x != 'x') {
- vgtod = NULL;
- vtime = NULL;
- vgetcpu = NULL;
- }
+ vsyscall_map_r = (r == 'r');
+ vsyscall_map_x = (x == 'x');
found = true;
break;
if (!found) {
printf("\tno vsyscall map in /proc/self/maps\n");
- should_read_vsyscall = false;
- vgtod = NULL;
- vtime = NULL;
- vgetcpu = NULL;
+ vsyscall_map_r = false;
+ vsyscall_map_x = false;
}
return nerrs;
}
static jmp_buf jmpbuf;
+static volatile unsigned long segv_err;
static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
{
+ ucontext_t *ctx = (ucontext_t *)ctx_void;
+
+ segv_err = ctx->uc_mcontext.gregs[REG_ERR];
siglongjmp(jmpbuf, 1);
}
err(1, "syscall gettimeofday");
if (vdso_gtod)
ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso);
- if (vgtod)
+ if (vsyscall_map_x)
ret_vsys = vgtod(&tv_vsys, &tz_vsys);
if (sys_gtod(&tv_sys2, &tz_sys) != 0)
err(1, "syscall gettimeofday");
}
}
- if (vgtod) {
+ if (vsyscall_map_x) {
if (ret_vsys == 0) {
nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
} else {
t_sys1 = sys_time(&t2_sys1);
if (vdso_time)
t_vdso = vdso_time(&t2_vdso);
- if (vtime)
+ if (vsyscall_map_x)
t_vsys = vtime(&t2_vsys);
t_sys2 = sys_time(&t2_sys2);
if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) {
}
}
- if (vtime) {
+ if (vsyscall_map_x) {
if (t_vsys < 0 || t_vsys != t2_vsys) {
printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys);
nerrs++;
ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
if (vdso_getcpu)
ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
- if (vgetcpu)
+ if (vsyscall_map_x)
ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
if (ret_sys == 0) {
}
}
- if (vgetcpu) {
+ if (vsyscall_map_x) {
if (ret_vsys) {
printf("[FAIL]\tvsyscall getcpu() failed\n");
nerrs++;
can_read = false;
}
- if (can_read && !should_read_vsyscall) {
+ if (can_read && !vsyscall_map_r) {
printf("[FAIL]\tWe have read access, but we shouldn't\n");
return 1;
- } else if (!can_read && should_read_vsyscall) {
+ } else if (!can_read && vsyscall_map_r) {
printf("[FAIL]\tWe don't have read access, but we should\n");
return 1;
+ } else if (can_read) {
+ printf("[OK]\tWe have read access\n");
} else {
- printf("[OK]\tgot expected result\n");
+ printf("[OK]\tWe do not have read access: #PF(0x%lx)\n",
+ segv_err);
}
#endif
return 0;
}
+static int test_vsys_x(void)
+{
+#ifdef __x86_64__
+ if (vsyscall_map_x) {
+ /* We already tested this adequately. */
+ return 0;
+ }
+
+ printf("[RUN]\tMake sure that vsyscalls really page fault\n");
+
+ bool can_exec;
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ vgtod(NULL, NULL);
+ can_exec = true;
+ } else {
+ can_exec = false;
+ }
+
+ if (can_exec) {
+ printf("[FAIL]\tExecuting the vsyscall did not page fault\n");
+ return 1;
+ } else if (segv_err & (1 << 4)) { /* INSTR */
+ printf("[OK]\tExecuting the vsyscall page failed: #PF(0x%lx)\n",
+ segv_err);
+ } else {
+ printf("[FAILT]\tExecution failed with the wrong error: #PF(0x%lx)\n",
+ segv_err);
+ return 1;
+ }
+#endif
+
+ return 0;
+}
+
+static int test_process_vm_readv(void)
+{
+#ifdef __x86_64__
+ char buf[4096];
+ struct iovec local, remote;
+ int ret;
+
+ printf("[RUN]\tprocess_vm_readv() from vsyscall page\n");
+
+ local.iov_base = buf;
+ local.iov_len = 4096;
+ remote.iov_base = (void *)0xffffffffff600000;
+ remote.iov_len = 4096;
+ ret = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);
+ if (ret != 4096) {
+ printf("[OK]\tprocess_vm_readv() failed (ret = %d, errno = %d)\n", ret, errno);
+ return 0;
+ }
+
+ if (vsyscall_map_r) {
+ if (!memcmp(buf, (const void *)0xffffffffff600000, 4096)) {
+ printf("[OK]\tIt worked and read correct data\n");
+ } else {
+ printf("[FAIL]\tIt worked but returned incorrect data\n");
+ return 1;
+ }
+ }
+#endif
+
+ return 0;
+}
#ifdef __x86_64__
#define X86_EFLAGS_TF (1UL << 8)
time_t tmp;
bool is_native;
- if (!vtime)
+ if (!vsyscall_map_x)
return 0;
printf("[RUN]\tchecking that vsyscalls are emulated\n");
sethandler(SIGSEGV, sigsegv, 0);
nerrs += test_vsys_r();
+ nerrs += test_vsys_x();
+
+ nerrs += test_process_vm_readv();
#ifdef __x86_64__
nerrs += test_emulation();