-Secure Memory Encryption (SME) is a feature found on AMD processors.
+Secure Memory Encryption (SME) and Secure Encrypted Virtualization (SEV) are
+features found on AMD processors.
SME provides the ability to mark individual pages of memory as encrypted using
the standard x86 page tables. A page that is marked encrypted will be
DRAM. SME can therefore be used to protect the contents of DRAM from physical
attacks on the system.
+SEV enables running encrypted virtual machines (VMs) in which the code and data
+of the guest VM are secured so that a decrypted version is available only
+within the VM itself. SEV guest VMs have the concept of private and shared
+memory. Private memory is encrypted with the guest-specific key, while shared
+memory may be encrypted with hypervisor key. When SME is enabled, the hypervisor
+key is the same key which is used in SME.
+
A page is encrypted when a page table entry has the encryption bit set (see
below on how to determine its position). The encryption bit can also be
specified in the cr3 register, allowing the PGD table to be encrypted. Each
successive level of page tables can also be encrypted by setting the encryption
bit in the page table entry that points to the next table. This allows the full
page table hierarchy to be encrypted. Note, this means that just because the
-encryption bit is set in cr3, doesn't imply the full hierarchy is encyrpted.
+encryption bit is set in cr3, doesn't imply the full hierarchy is encrypted.
Each page table entry in the hierarchy needs to have the encryption bit set to
achieve that. So, theoretically, you could have the encryption bit set in cr3
so that the PGD is encrypted, but not set the encryption bit in the PGD entry
for a PUD which results in the PUD pointed to by that entry to not be
encrypted.
-Support for SME can be determined through the CPUID instruction. The CPUID
-function 0x8000001f reports information related to SME:
+When SEV is enabled, instruction pages and guest page tables are always treated
+as private. All the DMA operations inside the guest must be performed on shared
+memory. Since the memory encryption bit is controlled by the guest OS when it
+is operating in 64-bit or 32-bit PAE mode, in all other modes the SEV hardware
+forces the memory encryption bit to 1.
+
+Support for SME and SEV can be determined through the CPUID instruction. The
+CPUID function 0x8000001f reports information related to SME:
0x8000001f[eax]:
Bit[0] indicates support for SME
+ Bit[1] indicates support for SEV
0x8000001f[ebx]:
Bits[5:0] pagetable bit number used to activate memory
encryption
Bit[23] 0 = memory encryption features are disabled
1 = memory encryption features are enabled
+If SEV is supported, MSR 0xc0010131 (MSR_AMD64_SEV) can be used to determine if
+SEV is active:
+
+ 0xc0010131:
+ Bit[0] 0 = memory encryption is not active
+ 1 = memory encryption is active
+
Linux relies on BIOS to set this bit if BIOS has determined that the reduction
in the physical address space as a result of enabling memory encryption (see
CPUID information above) will not conflict with the address space resource
Overview
--------
-The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is
+The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is
similar in concept to a DWARF unwinder. The difference is that the
format of the ORC data is much simpler than DWARF, which in turn allows
the ORC unwinder to be much simpler and faster.
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
... unused hole ...
-ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB)
+ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
... unused hole ...
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ifeq ($(has_libelf),1)
objtool_target := tools/objtool FORCE
else
- ifdef CONFIG_ORC_UNWINDER
- $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
+ ifdef CONFIG_UNWINDER_ORC
+ $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
else
$(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
endif
* and that value will be returned. If all free regions are visited without
* func returning non-zero, then zero will be returned.
*/
-int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *))
+int arch_kexec_walk_mem(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
{
int ret = 0;
u64 i;
phys_addr_t mstart, mend;
+ struct resource res = { };
if (kbuf->top_down) {
for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0,
* range while in kexec, end points to the last byte
* in the range.
*/
- ret = func(mstart, mend - 1, kbuf);
+ res.start = mstart;
+ res.end = mend - 1;
+ ret = func(&res, kbuf);
if (ret)
break;
}
* range while in kexec, end points to the last byte
* in the range.
*/
- ret = func(mstart, mend - 1, kbuf);
+ res.start = mstart;
+ res.end = mend - 1;
+ ret = func(&res, kbuf);
if (ret)
break;
}
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
- select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
+ select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION
select HAVE_STACK_VALIDATION if X86_64
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
config KASAN_SHADOW_OFFSET
hex
depends on KASAN
- default 0xdff8000000000000 if X86_5LEVEL
default 0xdffffc0000000000
config HAVE_INTEL_TXT
If unsure, say Y.
+config X86_INTEL_UMIP
+ def_bool n
+ depends on CPU_SUP_INTEL
+ prompt "Intel User Mode Instruction Prevention" if EXPERT
+ ---help---
+ The User Mode Instruction Prevention (UMIP) is a security
+ feature in newer Intel processors. If enabled, a general
+ protection fault is issued if the instructions SGDT, SLDT,
+ SIDT, SMSW and STR are executed in user mode.
+
config X86_INTEL_MPX
prompt "Intel MPX (Memory Protection Extensions)"
def_bool n
choice
prompt "Choose kernel unwinder"
- default FRAME_POINTER_UNWINDER
+ default UNWINDER_ORC if X86_64
+ default UNWINDER_FRAME_POINTER if X86_32
---help---
This determines which method will be used for unwinding kernel stack
traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
livepatch, lockdep, and more.
-config FRAME_POINTER_UNWINDER
- bool "Frame pointer unwinder"
- select FRAME_POINTER
- ---help---
- This option enables the frame pointer unwinder for unwinding kernel
- stack traces.
-
- The unwinder itself is fast and it uses less RAM than the ORC
- unwinder, but the kernel text size will grow by ~3% and the kernel's
- overall performance will degrade by roughly 5-10%.
-
- This option is recommended if you want to use the livepatch
- consistency model, as this is currently the only way to get a
- reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
-
-config ORC_UNWINDER
+config UNWINDER_ORC
bool "ORC unwinder"
depends on X86_64
select STACK_VALIDATION
Enabling this option will increase the kernel's runtime memory usage
by roughly 2-4MB, depending on your kernel config.
-config GUESS_UNWINDER
+config UNWINDER_FRAME_POINTER
+ bool "Frame pointer unwinder"
+ select FRAME_POINTER
+ ---help---
+ This option enables the frame pointer unwinder for unwinding kernel
+ stack traces.
+
+ The unwinder itself is fast and it uses less RAM than the ORC
+ unwinder, but the kernel text size will grow by ~3% and the kernel's
+ overall performance will degrade by roughly 5-10%.
+
+ This option is recommended if you want to use the livepatch
+ consistency model, as this is currently the only way to get a
+ reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
+
+config UNWINDER_GUESS
bool "Guess unwinder"
depends on EXPERT
---help---
endchoice
config FRAME_POINTER
- depends on !ORC_UNWINDER && !GUESS_UNWINDER
+ depends on !UNWINDER_ORC && !UNWINDER_GUESS
bool
endmenu
setup
setup.bin
setup.elf
+fdimage
+mtools.conf
+image.iso
$(obj)/mtools.conf: $(src)/mtools.conf.in
sed -e 's|@OBJ@|$(obj)|g' < $< > $@
+quiet_cmd_genimage = GENIMAGE $3
+cmd_genimage = sh $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \
+ $(obj)/mtools.conf '$(image_cmdline)' $(FDINITRD)
+
# This requires write access to /dev/fd0
bzdisk: $(obj)/bzImage $(obj)/mtools.conf
- MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
- syslinux /dev/fd0 ; sync
- echo '$(image_cmdline)' | \
- MTOOLSRC=$(src)/mtools.conf mcopy - a:syslinux.cfg
- if [ -f '$(FDINITRD)' ] ; then \
- MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
- fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync
+ $(call cmd,genimage,bzdisk,/dev/fd0)
# These require being root or having syslinux 2.02 or higher installed
fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
- dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
- MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
- syslinux $(obj)/fdimage ; sync
- echo '$(image_cmdline)' | \
- MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg
- if [ -f '$(FDINITRD)' ] ; then \
- MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
- fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync
+ $(call cmd,genimage,fdimage144,$(obj)/fdimage)
+ @$(kecho) 'Kernel: $(obj)/fdimage is ready'
fdimage288: $(obj)/bzImage $(obj)/mtools.conf
- dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
- MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
- syslinux $(obj)/fdimage ; sync
- echo '$(image_cmdline)' | \
- MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg
- if [ -f '$(FDINITRD)' ] ; then \
- MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
- fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux ; sync
+ $(call cmd,genimage,fdimage288,$(obj)/fdimage)
+ @$(kecho) 'Kernel: $(obj)/fdimage is ready'
isoimage: $(obj)/bzImage
- -rm -rf $(obj)/isoimage
- mkdir $(obj)/isoimage
- for i in lib lib64 share end ; do \
- if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \
- cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \
- if [ -f /usr/$$i/syslinux/ldlinux.c32 ]; then \
- cp /usr/$$i/syslinux/ldlinux.c32 $(obj)/isoimage ; \
- fi ; \
- break ; \
- fi ; \
- if [ $$i = end ] ; then exit 1 ; fi ; \
- done
- cp $(obj)/bzImage $(obj)/isoimage/linux
- echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
- if [ -f '$(FDINITRD)' ] ; then \
- cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
- fi
- mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
- -no-emul-boot -boot-load-size 4 -boot-info-table \
- $(obj)/isoimage
- isohybrid $(obj)/image.iso 2>/dev/null || true
- rm -rf $(obj)/isoimage
+ $(call cmd,genimage,isoimage,$(obj)/image.iso)
+ @$(kecho) 'Kernel: $(obj)/image.iso is ready'
bzlilo: $(obj)/bzImage
if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
ifdef CONFIG_X86_64
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o
+ vmlinux-objs-y += $(obj)/mem_encrypt.o
endif
$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
/*
* Build early 4G boot pagetable
*/
+ /*
+ * If SEV is active then set the encryption mask in the page tables.
+ * This will insure that when the kernel is copied and decompressed
+ * it will be done so encrypted.
+ */
+ call get_sev_encryption_bit
+ xorl %edx, %edx
+ testl %eax, %eax
+ jz 1f
+ subl $32, %eax /* Encryption bit is always above bit 31 */
+ bts %eax, %edx /* Set encryption mask for page tables */
+1:
+
/* Initialize Page tables to 0 */
leal pgtable(%ebx), %edi
xorl %eax, %eax
leal pgtable + 0(%ebx), %edi
leal 0x1007 (%edi), %eax
movl %eax, 0(%edi)
+ addl %edx, 4(%edi)
/* Build Level 3 */
leal pgtable + 0x1000(%ebx), %edi
leal 0x1007(%edi), %eax
movl $4, %ecx
1: movl %eax, 0x00(%edi)
+ addl %edx, 0x04(%edi)
addl $0x00001000, %eax
addl $8, %edi
decl %ecx
movl $0x00000183, %eax
movl $2048, %ecx
1: movl %eax, 0(%edi)
+ addl %edx, 4(%edi)
addl $0x00200000, %eax
addl $8, %edi
decl %ecx
--- /dev/null
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2017 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/asm-offsets.h>
+
+ .text
+ .code32
+ENTRY(get_sev_encryption_bit)
+ xor %eax, %eax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ push %ebx
+ push %ecx
+ push %edx
+ push %edi
+
+ /*
+ * RIP-relative addressing is needed to access the encryption bit
+ * variable. Since we are running in 32-bit mode we need this call/pop
+ * sequence to get the proper relative addressing.
+ */
+ call 1f
+1: popl %edi
+ subl $1b, %edi
+
+ movl enc_bit(%edi), %eax
+ cmpl $0, %eax
+ jge .Lsev_exit
+
+ /* Check if running under a hypervisor */
+ movl $1, %eax
+ cpuid
+ bt $31, %ecx /* Check the hypervisor bit */
+ jnc .Lno_sev
+
+ movl $0x80000000, %eax /* CPUID to check the highest leaf */
+ cpuid
+ cmpl $0x8000001f, %eax /* See if 0x8000001f is available */
+ jb .Lno_sev
+
+ /*
+ * Check for the SEV feature:
+ * CPUID Fn8000_001F[EAX] - Bit 1
+ * CPUID Fn8000_001F[EBX] - Bits 5:0
+ * Pagetable bit position used to indicate encryption
+ */
+ movl $0x8000001f, %eax
+ cpuid
+ bt $1, %eax /* Check if SEV is available */
+ jnc .Lno_sev
+
+ movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */
+ rdmsr
+ bt $MSR_AMD64_SEV_ENABLED_BIT, %eax /* Check if SEV is active */
+ jnc .Lno_sev
+
+ movl %ebx, %eax
+ andl $0x3f, %eax /* Return the encryption bit location */
+ movl %eax, enc_bit(%edi)
+ jmp .Lsev_exit
+
+.Lno_sev:
+ xor %eax, %eax
+ movl %eax, enc_bit(%edi)
+
+.Lsev_exit:
+ pop %edi
+ pop %edx
+ pop %ecx
+ pop %ebx
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+ ret
+ENDPROC(get_sev_encryption_bit)
+
+ .code64
+ENTRY(get_sev_encryption_mask)
+ xor %rax, %rax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ push %rbp
+ push %rdx
+
+ movq %rsp, %rbp /* Save current stack pointer */
+
+ call get_sev_encryption_bit /* Get the encryption bit position */
+ testl %eax, %eax
+ jz .Lno_sev_mask
+
+ xor %rdx, %rdx
+ bts %rax, %rdx /* Create the encryption mask */
+ mov %rdx, %rax /* ... and return it */
+
+.Lno_sev_mask:
+ movq %rbp, %rsp /* Restore original stack pointer */
+
+ pop %rdx
+ pop %rbp
+#endif
+
+ ret
+ENDPROC(get_sev_encryption_mask)
+
+ .data
+enc_bit:
+ .int 0xffffffff
{ }
#endif
+unsigned long get_sev_encryption_mask(void);
+
#endif
* Mapping information structure passed to kernel_ident_mapping_init().
* Due to relocation, pointers must be assigned at run time not build time.
*/
-static struct x86_mapping_info mapping_info = {
- .page_flag = __PAGE_KERNEL_LARGE_EXEC,
-};
+static struct x86_mapping_info mapping_info;
/* Locates and clears a region for a new top level page table. */
void initialize_identity_maps(void)
{
+ unsigned long sev_me_mask = get_sev_encryption_mask();
+
/* Init mapping_info with run-time function/buffer pointers. */
mapping_info.alloc_pgt_page = alloc_pgt_page;
mapping_info.context = &pgt_data;
+ mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask;
+ mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask;
/*
* It should be impossible for this not to already be true,
--- /dev/null
+#!/bin/sh
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 2017 by Changbin Du <changbin.du@intel.com>
+#
+# Adapted from code in arch/x86/boot/Makefile by H. Peter Anvin and others
+#
+# "make fdimage/fdimage144/fdimage288/isoimage" script for x86 architecture
+#
+# Arguments:
+# $1 - fdimage format
+# $2 - target image file
+# $3 - kernel bzImage file
+# $4 - mtool configuration file
+# $5 - kernel cmdline
+# $6 - inird image file
+#
+
+# Use "make V=1" to debug this script
+case "${KBUILD_VERBOSE}" in
+*1*)
+ set -x
+ ;;
+esac
+
+verify () {
+ if [ ! -f "$1" ]; then
+ echo "" 1>&2
+ echo " *** Missing file: $1" 1>&2
+ echo "" 1>&2
+ exit 1
+ fi
+}
+
+
+export MTOOLSRC=$4
+FIMAGE=$2
+FBZIMAGE=$3
+KCMDLINE=$5
+FDINITRD=$6
+
+# Make sure the files actually exist
+verify "$FBZIMAGE"
+verify "$MTOOLSRC"
+
+genbzdisk() {
+ mformat a:
+ syslinux $FIMAGE
+ echo "$KCMDLINE" | mcopy - a:syslinux.cfg
+ if [ -f "$FDINITRD" ] ; then
+ mcopy "$FDINITRD" a:initrd.img
+ fi
+ mcopy $FBZIMAGE a:linux
+}
+
+genfdimage144() {
+ dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null
+ mformat v:
+ syslinux $FIMAGE
+ echo "$KCMDLINE" | mcopy - v:syslinux.cfg
+ if [ -f "$FDINITRD" ] ; then
+ mcopy "$FDINITRD" v:initrd.img
+ fi
+ mcopy $FBZIMAGE v:linux
+}
+
+genfdimage288() {
+ dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null
+ mformat w:
+ syslinux $FIMAGE
+ echo "$KCMDLINE" | mcopy - W:syslinux.cfg
+ if [ -f "$FDINITRD" ] ; then
+ mcopy "$FDINITRD" w:initrd.img
+ fi
+ mcopy $FBZIMAGE w:linux
+}
+
+genisoimage() {
+ tmp_dir=`dirname $FIMAGE`/isoimage
+ rm -rf $tmp_dir
+ mkdir $tmp_dir
+ for i in lib lib64 share end ; do
+ for j in syslinux ISOLINUX ; do
+ if [ -f /usr/$i/$j/isolinux.bin ] ; then
+ isolinux=/usr/$i/$j/isolinux.bin
+ cp $isolinux $tmp_dir
+ fi
+ done
+ for j in syslinux syslinux/modules/bios ; do
+ if [ -f /usr/$i/$j/ldlinux.c32 ]; then
+ ldlinux=/usr/$i/$j/ldlinux.c32
+ cp $ldlinux $tmp_dir
+ fi
+ done
+ if [ -n "$isolinux" -a -n "$ldlinux" ] ; then
+ break
+ fi
+ if [ $i = end -a -z "$isolinux" ] ; then
+ echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
+ exit 1
+ fi
+ done
+ cp $FBZIMAGE $tmp_dir/linux
+ echo "$KCMDLINE" > $tmp_dir/isolinux.cfg
+ if [ -f "$FDINITRD" ] ; then
+ cp "$FDINITRD" $tmp_dir/initrd.img
+ fi
+ mkisofs -J -r -input-charset=utf-8 -quiet -o $FIMAGE -b isolinux.bin \
+ -c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \
+ $tmp_dir
+ isohybrid $FIMAGE 2>/dev/null || true
+ rm -rf $tmp_dir
+}
+
+case $1 in
+ bzdisk) genbzdisk;;
+ fdimage144) genfdimage144;;
+ fdimage288) genfdimage288;;
+ isoimage) genisoimage;;
+ *) echo 'Unknown image format'; exit 1;
+esac
CONFIG_NOHIGHMEM=y
# CONFIG_HIGHMEM4G is not set
# CONFIG_HIGHMEM64G is not set
-CONFIG_GUESS_UNWINDER=y
-# CONFIG_FRAME_POINTER_UNWINDER is not set
+CONFIG_UNWINDER_GUESS=y
+# CONFIG_UNWINDER_FRAME_POINTER is not set
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_BOOT_PARAMS=y
CONFIG_OPTIMIZE_INLINING=y
+CONFIG_UNWINDER_ORC=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
UNWIND_HINT_REGS offset=\offset
.endm
- .macro RESTORE_EXTRA_REGS offset=0
- movq 0*8+\offset(%rsp), %r15
- movq 1*8+\offset(%rsp), %r14
- movq 2*8+\offset(%rsp), %r13
- movq 3*8+\offset(%rsp), %r12
- movq 4*8+\offset(%rsp), %rbp
- movq 5*8+\offset(%rsp), %rbx
- UNWIND_HINT_REGS offset=\offset extra=0
- .endm
-
- .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
- .if \rstor_r11
- movq 6*8(%rsp), %r11
- .endif
- .if \rstor_r8910
- movq 7*8(%rsp), %r10
- movq 8*8(%rsp), %r9
- movq 9*8(%rsp), %r8
- .endif
- .if \rstor_rax
- movq 10*8(%rsp), %rax
- .endif
- .if \rstor_rcx
- movq 11*8(%rsp), %rcx
- .endif
- .if \rstor_rdx
- movq 12*8(%rsp), %rdx
- .endif
- movq 13*8(%rsp), %rsi
- movq 14*8(%rsp), %rdi
- UNWIND_HINT_IRET_REGS offset=16*8
- .endm
- .macro RESTORE_C_REGS
- RESTORE_C_REGS_HELPER 1,1,1,1,1
- .endm
- .macro RESTORE_C_REGS_EXCEPT_RAX
- RESTORE_C_REGS_HELPER 0,1,1,1,1
- .endm
- .macro RESTORE_C_REGS_EXCEPT_RCX
- RESTORE_C_REGS_HELPER 1,0,1,1,1
- .endm
- .macro RESTORE_C_REGS_EXCEPT_R11
- RESTORE_C_REGS_HELPER 1,1,0,1,1
- .endm
- .macro RESTORE_C_REGS_EXCEPT_RCX_R11
- RESTORE_C_REGS_HELPER 1,0,0,1,1
- .endm
-
- .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
- subq $-(15*8+\addskip), %rsp
+ .macro POP_EXTRA_REGS
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ .endm
+
+ .macro POP_C_REGS
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
.endm
.macro icebp
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
- RESTORE_C_REGS_EXCEPT_RCX_R11
- movq RSP(%rsp), %rsp
+ addq $6*8, %rsp /* skip extra regs -- they were preserved */
UNWIND_HINT_EMPTY
- USERGS_SYSRET64
+ jmp .Lpop_c_regs_except_rcx_r11_and_sysret
1:
/*
call do_syscall_64 /* returns with IRQs disabled */
return_from_SYSCALL_64:
- RESTORE_EXTRA_REGS
TRACE_IRQS_IRETQ /* we're about to change IF */
/*
* Try to use SYSRET instead of IRET if we're returning to
- * a completely clean 64-bit userspace context.
+ * a completely clean 64-bit userspace context. If we're not,
+ * go to the slow exit path.
*/
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
- cmpq %rcx, %r11 /* RCX == RIP */
- jne opportunistic_sysret_failed
+
+ cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
+ jne swapgs_restore_regs_and_return_to_usermode
/*
* On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
/* If this changed %rcx, it was not canonical */
cmpq %rcx, %r11
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode
cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode
movq R11(%rsp), %r11
cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode
/*
* SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
* would never get past 'stuck_here'.
*/
testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz opportunistic_sysret_failed
+ jnz swapgs_restore_regs_and_return_to_usermode
/* nothing to check for RSP */
cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode
/*
* We win! This label is here just for ease of understanding
*/
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
- RESTORE_C_REGS_EXCEPT_RCX_R11
- movq RSP(%rsp), %rsp
UNWIND_HINT_EMPTY
+ POP_EXTRA_REGS
+.Lpop_c_regs_except_rcx_r11_and_sysret:
+ popq %rsi /* skip r11 */
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rsi /* skip rcx */
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ movq RSP-ORIG_RAX(%rsp), %rsp
USERGS_SYSRET64
-
-opportunistic_sysret_failed:
- SWAPGS
- jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)
ENTRY(stub_ptregs_64)
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
- SWAPGS
- jmp restore_regs_and_iret
+ jmp swapgs_restore_regs_and_return_to_usermode
1:
/* kernel thread */
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
+
+GLOBAL(swapgs_restore_regs_and_return_to_usermode)
+#ifdef CONFIG_DEBUG_ENTRY
+ /* Assert that pt_regs indicates user mode. */
+ testb $3, CS(%rsp)
+ jnz 1f
+ ud2
+1:
+#endif
SWAPGS
- jmp restore_regs_and_iret
+ POP_EXTRA_REGS
+ POP_C_REGS
+ addq $8, %rsp /* skip regs->orig_ax */
+ INTERRUPT_RETURN
+
/* Returning to kernel space */
retint_kernel:
*/
TRACE_IRQS_IRETQ
-/*
- * At this label, code paths which return to kernel and to user,
- * which come from interrupts/exception and from syscalls, merge.
- */
-GLOBAL(restore_regs_and_iret)
- RESTORE_EXTRA_REGS
-restore_c_regs_and_iret:
- RESTORE_C_REGS
- REMOVE_PT_GPREGS_FROM_STACK 8
+GLOBAL(restore_regs_and_return_to_kernel)
+#ifdef CONFIG_DEBUG_ENTRY
+ /* Assert that pt_regs indicates kernel mode. */
+ testb $3, CS(%rsp)
+ jz 1f
+ ud2
+1:
+#endif
+ POP_EXTRA_REGS
+ POP_C_REGS
+ addq $8, %rsp /* skip regs->orig_ax */
INTERRUPT_RETURN
ENTRY(native_iret)
ASM_CLAC
- .ifeq \has_error_code
+ .if \has_error_code == 0
pushq $-1 /* ORIG_RAX: no syscall to restart */
.endif
idtentry stack_segment do_stack_segment has_error_code=1
#ifdef CONFIG_XEN
+idtentry xennmi do_nmi has_error_code=0
idtentry xendebug do_debug has_error_code=0
idtentry xenint3 do_int3 has_error_code=0
#endif
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF_DEBUG
testl %ebx, %ebx /* swapgs needed? */
- jnz paranoid_exit_no_swapgs
+ jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
- jmp paranoid_exit_restore
-paranoid_exit_no_swapgs:
+ jmp .Lparanoid_exit_restore
+.Lparanoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
- REMOVE_PT_GPREGS_FROM_STACK 8
- INTERRUPT_RETURN
+.Lparanoid_exit_restore:
+ jmp restore_regs_and_return_to_kernel
END(paranoid_exit)
/*
jmp retint_user
END(error_exit)
-/* Runs on exception stack */
-/* XXX: broken on Xen PV */
+/*
+ * Runs on exception stack. Xen PV does not go through this path at all,
+ * so we can use real assembly here.
+ */
ENTRY(nmi)
UNWIND_HINT_IRET_REGS
+
/*
* We allow breakpoints in NMIs. If a breakpoint occurs, then
* the iretq it performs will take us out of NMI context.
* stacks lest we corrupt the "NMI executing" variable.
*/
- SWAPGS_UNSAFE_STACK
+ swapgs
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts.
*/
- SWAPGS
- jmp restore_regs_and_iret
+ jmp swapgs_restore_regs_and_return_to_usermode
.Lnmi_from_kernel:
/*
popq %rdx
/* We are returning to kernel mode, so this cannot result in a fault. */
- INTERRUPT_RETURN
+ iretq
first_nmi:
/* Restore rdx. */
pushfq /* RFLAGS */
pushq $__KERNEL_CS /* CS */
pushq $1f /* RIP */
- INTERRUPT_RETURN /* continues at repeat_nmi below */
+ iretq /* continues at repeat_nmi below */
UNWIND_HINT_IRET_REGS
1:
#endif
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
+ POP_EXTRA_REGS
+ POP_C_REGS
- /* Point RSP at the "iret" frame. */
- REMOVE_PT_GPREGS_FROM_STACK 6*8
+ /*
+ * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
+ * at the "iret" frame.
+ */
+ addq $6*8, %rsp
/*
* Clear "NMI executing". Set DF first so that we can easily
* distinguish the remaining code between here and IRET from
- * the SYSCALL entry and exit paths. On a native kernel, we
- * could just inspect RIP, but, on paravirt kernels,
- * INTERRUPT_RETURN can translate into a jump into a
- * hypercall page.
+ * the SYSCALL entry and exit paths.
+ *
+ * We arguably should just inspect RIP instead, but I (Andy) wrote
+ * this code when I had the misapprehension that Xen PV supported
+ * NMIs, and Xen PV would break that approach.
*/
std
movq $0, 5*8(%rsp) /* clear "NMI executing" */
/*
- * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
- * stack in a single instruction. We are returning to kernel
- * mode, so this cannot result in a fault.
+ * iretq reads the "iret" frame and exits the NMI stack in a
+ * single instruction. We are returning to kernel mode, so this
+ * cannot result in a fault. Similarly, we don't need to worry
+ * about espfix64 on the way back to kernel mode.
*/
- INTERRUPT_RETURN
+ iretq
END(nmi)
ENTRY(ignore_sysret)
/* Go back to user mode. */
TRACE_IRQS_ON
- SWAPGS
- jmp restore_regs_and_iret
+ jmp swapgs_restore_regs_and_return_to_usermode
END(entry_INT80_compat)
ENTRY(stub32_clone)
# SPDX-License-Identifier: GPL-2.0
-out := $(obj)/../../include/generated/asm
-uapi := $(obj)/../../include/generated/uapi/asm
+out := arch/$(SRCARCH)/include/generated/asm
+uapi := arch/$(SRCARCH)/include/generated/uapi/asm
# Create output directory if not already present
_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \
struct pvclock_vsyscall_time_info *pvti =
pvclock_pvti_cpu0_va();
if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
- ret = vm_insert_pfn(
+ ret = vm_insert_pfn_prot(
vma,
vmf->address,
- __pa(pvti) >> PAGE_SHIFT);
+ __pa(pvti) >> PAGE_SHIFT,
+ pgprot_decrypted(vma->vm_page_prot));
}
} else if (sym_offset == image->sym_hvclock_page) {
struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
bool ok;
unsigned int retry = RDRAND_RETRY_LOOPS;
do {
- asm volatile(RDRAND_LONG "\n\t"
+ asm volatile(RDRAND_LONG
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
if (ok)
bool ok;
unsigned int retry = RDRAND_RETRY_LOOPS;
do {
- asm volatile(RDRAND_INT "\n\t"
+ asm volatile(RDRAND_INT
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
if (ok)
static inline bool rdseed_long(unsigned long *v)
{
bool ok;
- asm volatile(RDSEED_LONG "\n\t"
+ asm volatile(RDSEED_LONG
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
return ok;
static inline bool rdseed_int(unsigned int *v)
{
bool ok;
- asm volatile(RDSEED_INT "\n\t"
+ asm volatile(RDSEED_INT
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
return ok;
static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
{
bool negative;
- asm volatile(LOCK_PREFIX "andb %2,%1\n\t"
+ asm volatile(LOCK_PREFIX "andb %2,%1"
CC_SET(s)
: CC_OUT(s) (negative), ADDR
: "ir" ((char) ~(1 << nr)) : "memory");
{
bool oldbit;
- asm("bts %2,%1\n\t"
+ asm("bts %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr));
{
bool oldbit;
- asm volatile("btr %2,%1\n\t"
+ asm volatile("btr %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr));
{
bool oldbit;
- asm volatile("btc %2,%1\n\t"
+ asm volatile("btc %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr) : "memory");
{
bool oldbit;
- asm volatile("bt %2,%1\n\t"
+ asm volatile("bt %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
: "m" (*(unsigned long *)addr), "Ir" (nr));
*/
#include <linux/types.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <asm/processor.h>
#include <asm/user32.h>
#include <asm/unistd.h>
#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
-#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
-#define setup_clear_cpu_cap(bit) do { \
- clear_cpu_cap(&boot_cpu_data, bit); \
- set_bit(bit, (unsigned long *)cpu_caps_cleared); \
-} while (0)
+
+extern void setup_clear_cpu_cap(unsigned int bit);
+extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
+
#define setup_force_cpu_cap(bit) do { \
set_cpu_cap(&boot_cpu_data, bit); \
set_bit(bit, (unsigned long *)cpu_caps_set); \
/*
* Defines x86 CPU feature bits
*/
-#define NCAPINTS 18 /* N 32-bit words worth of info */
-#define NBUGINTS 1 /* N 32-bit bug flags */
+#define NCAPINTS 18 /* N 32-bit words worth of info */
+#define NBUGINTS 1 /* N 32-bit bug flags */
/*
* Note: If the comment begins with a quoted string, that string is used
* in /proc/cpuinfo instead of the macro name. If the string is "",
* this feature bit is not displayed in /proc/cpuinfo at all.
+ *
+ * When adding new features here that depend on other features,
+ * please update the table in kernel/cpu/cpuid-deps.c as well.
*/
-/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
-#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
-#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
-#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
-#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
-#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
-#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
-#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
-#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
-#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
-#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
-#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */
- /* (plus FCMOVcc, FCOMI with FPU) */
-#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
-#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
-#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
-#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
-#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
-#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
-#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
-#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
-#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
-#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
-#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
-#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
-#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
+/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */
+#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
+#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
+#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
+#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
+#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
+#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
+#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
+#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */
+#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
+#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
+#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
+#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
+#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
+#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
+#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
+#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
+#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
/* Don't duplicate feature flags which are redundant with Intel! */
-#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
-#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */
-#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
-#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
-#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
-#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
-#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */
-#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */
-#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */
+#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */
+#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
+#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
+#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
+#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */
+#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */
+#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */
/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
-#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
-#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
-#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
+#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
+#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
+#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
/* Other features, Linux-defined mapping, word 3 */
/* This range is used for feature bits which conflict or are synthesized */
-#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
-#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
-#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
-#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
-/* cpu types for specific tunings: */
-#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
-#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
-#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
-#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
-#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */
-#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */
-#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
-#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
-#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
-#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */
-#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
-#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
-#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
-#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
-#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
-#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
-#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
-#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
-#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
-#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
-#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */
-#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
-#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
-#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
-#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
-#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
+#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
+
+/* CPU types for specific tunings: */
+#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
+#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
+#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
+#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
+#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */
+#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */
+#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
+#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
+#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
+#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
+#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" MFENCE synchronizes RDTSC */
+#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
+#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
+#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
+#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
+#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
+#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */
+#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */
+#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */
+#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
+#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
-/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
-#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
-#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
-#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
-#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
-#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
-#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
-#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */
-#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
-#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
-#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
-#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
-#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
-#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
-#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */
-#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
-#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */
-#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
-#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
-#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
-#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
-#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */
-#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
-#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
-#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
-#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
-#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
-#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */
-#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
-#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */
-#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */
-#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
+/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */
+#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */
+#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */
+#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
+#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */
+#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
+#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
+#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
+#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */
+#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */
+#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
+#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */
+#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */
+#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
+#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */
+#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */
+#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */
+#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */
+#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
-#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
-#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
-#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
-#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
-#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
-#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
-#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
-#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
-#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
-#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
+#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
+#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
+#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
+#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
+#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
-/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
-#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
-#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
-#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */
-#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
-#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
-#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
-#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
-#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
-#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
-#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
-#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
-#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
-#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
-#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
-#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
-#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */
-#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
-#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */
-#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */
-#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
-#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
-#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
-#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */
-#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
-#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
+/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */
+#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
+#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */
+#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
+#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
+#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
+#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
+#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
+#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */
+#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */
+#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */
+#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
+#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */
+#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */
+#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
+#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */
/*
* Auxiliary flags: Linux defined - For features scattered in various
*
* Reuse free bits when adding new feature flags!
*/
-#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */
-#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
-#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
-#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
-#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
-#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
-#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */
+#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
+#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
+#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
+#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
+#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
-#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
-#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
+#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
+#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
-#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
-#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
-#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
-#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
+#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
+#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
+#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */
-#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
+#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
/* Virtualization flags: Linux defined, word 8 */
-#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
+#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
+#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
-#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
-#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
+#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
+#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
-#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
-#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
-#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
-#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
-#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
-#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
-#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
-#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
-#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
-#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
-#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
-#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
-#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
-#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
-#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
-#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
-#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
-#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
-#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
-#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
-#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
-#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
-#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
-#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
-#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
-#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
-#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
+#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
+#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */
+#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
+#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */
+#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
+#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
+#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
+#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
+#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */
+#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */
+#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
+#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
+#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
+#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
+#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
+#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
+#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
+#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
-/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
-#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */
-#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */
-#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
-#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
+/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */
+#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */
+#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */
+#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */
+#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */
-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
-#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */
+#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
-#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
-#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
-#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */
+#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */
+#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */
-/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
-#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
-#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */
+/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
+#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
+#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
-/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
-#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
-#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
-#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
-#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
-#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
-#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
-#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
-#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
-#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
+/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
+#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
+#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
+#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
+#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
+#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
+#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
+#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
+#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
-/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
-#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
-#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
-#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
-#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
-#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
-#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
-#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
-#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
-#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
-#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
-#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
-#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
-#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
+/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
+#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
+#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
+#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
+#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
+#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
+#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
+#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
-#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
-#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
-#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
-#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
-#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
-#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
+#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
+#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */
+#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
+#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
+#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
+#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */
+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
+#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
+#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
+#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
-/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
-#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
-#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */
-#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */
+/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
+#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */
+#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */
+#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */
/*
* BUG word(s)
*/
-#define X86_BUG(x) (NCAPINTS*32 + (x))
+#define X86_BUG(x) (NCAPINTS*32 + (x))
-#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
-#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
-#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
-#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
-#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
-#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
-#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
-#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
-#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
+#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
+#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
+#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
+#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
+#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
+#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
+#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
#ifdef CONFIG_X86_32
/*
* 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
* to avoid confusion.
*/
-#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
+#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
#endif
-#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
-#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
-#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
-#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
+#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
+#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
+#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+
#endif /* _ASM_X86_CPUFEATURES_H */
# define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31))
#endif
+#ifdef CONFIG_X86_INTEL_UMIP
+# define DISABLE_UMIP 0
+#else
+# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31))
+#endif
+
#ifdef CONFIG_X86_64
# define DISABLE_VME (1<<(X86_FEATURE_VME & 31))
# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31))
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
-#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57)
+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP)
#define DISABLED_MASK17 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM)
#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS)
+/* Identifiers for segment registers */
+#define INAT_SEG_REG_IGNORE 0
+#define INAT_SEG_REG_DEFAULT 1
+#define INAT_SEG_REG_CS 2
+#define INAT_SEG_REG_SS 3
+#define INAT_SEG_REG_DS 4
+#define INAT_SEG_REG_ES 5
+#define INAT_SEG_REG_FS 6
+#define INAT_SEG_REG_GS 7
+
/* Attribute search APIs */
extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
extern int inat_get_last_prefix_id(insn_byte_t last_pfx);
--- /dev/null
+#ifndef _ASM_X86_INSN_EVAL_H
+#define _ASM_X86_INSN_EVAL_H
+/*
+ * A collection of utility functions for x86 instruction analysis to be
+ * used in a kernel context. Useful when, for instance, making sense
+ * of the registers indicated by operands.
+ */
+
+#include <linux/compiler.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <asm/ptrace.h>
+
+#define INSN_CODE_SEG_ADDR_SZ(params) ((params >> 4) & 0xf)
+#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf)
+#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4))
+
+void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs);
+int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
+unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
+char insn_get_code_seg_params(struct pt_regs *regs);
+
+#endif /* _ASM_X86_INSN_EVAL_H */
#endif
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#include <linux/jump_label.h>
+
+extern struct static_key_false sev_enable_key;
+static inline bool sev_key_active(void)
+{
+ return static_branch_unlikely(&sev_enable_key);
+}
+
+#else /* !CONFIG_AMD_MEM_ENCRYPT */
+
+static inline bool sev_key_active(void) { return false; }
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
#define BUILDIO(bwl, bw, type) \
static inline void out##bwl(unsigned type value, int port) \
{ \
\
static inline void outs##bwl(int port, const void *addr, unsigned long count) \
{ \
- asm volatile("rep; outs" #bwl \
- : "+S"(addr), "+c"(count) : "d"(port) : "memory"); \
+ if (sev_key_active()) { \
+ unsigned type *value = (unsigned type *)addr; \
+ while (count) { \
+ out##bwl(*value, port); \
+ value++; \
+ count--; \
+ } \
+ } else { \
+ asm volatile("rep; outs" #bwl \
+ : "+S"(addr), "+c"(count) \
+ : "d"(port) : "memory"); \
+ } \
} \
\
static inline void ins##bwl(int port, void *addr, unsigned long count) \
{ \
- asm volatile("rep; ins" #bwl \
- : "+D"(addr), "+c"(count) : "d"(port) : "memory"); \
+ if (sev_key_active()) { \
+ unsigned type *value = (unsigned type *)addr; \
+ while (count) { \
+ *value = in##bwl(port); \
+ value++; \
+ count--; \
+ } \
+ } else { \
+ asm volatile("rep; ins" #bwl \
+ : "+D"(addr), "+c"(count) \
+ : "d"(port) : "memory"); \
+ } \
}
BUILDIO(b, b, char)
void __init sme_encrypt_kernel(void);
void __init sme_enable(struct boot_params *bp);
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
+
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void);
void swiotlb_set_mem_attributes(void *vaddr, unsigned long size);
+bool sme_active(void);
+bool sev_active(void);
+
#else /* !CONFIG_AMD_MEM_ENCRYPT */
#define sme_me_mask 0ULL
static inline void __init sme_encrypt_kernel(void) { }
static inline void __init sme_enable(struct boot_params *bp) { }
+static inline bool sme_active(void) { return false; }
+static inline bool sev_active(void) { return false; }
+
+static inline int __init
+early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; }
+static inline int __init
+early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
+
#endif /* CONFIG_AMD_MEM_ENCRYPT */
/*
#include <asm/orc_types.h>
struct mod_arch_specific {
-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
unsigned int num_orcs;
int *orc_unwind_ip;
struct orc_entry *orc_unwind;
#define MSR_AMD64_IBSBRTARGET 0xc001103b
#define MSR_AMD64_IBSOPDATA4 0xc001103d
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
+#define MSR_AMD64_SEV 0xc0010131
+#define MSR_AMD64_SEV_ENABLED_BIT 0
+#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
/* Fam 17h MSRs */
#define MSR_F17H_IRPERF 0xc00000e9
#include <linux/cpumask.h>
#include <asm/frame.h>
-static inline void load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+static inline void load_sp0(unsigned long sp0)
{
- PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
+ PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0);
}
/* The paravirtualized CPUID instruction. */
void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
- void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
+ void (*load_sp0)(unsigned long sp0);
void (*set_iopl_mask)(unsigned mask);
{
bool oldbit;
- asm volatile("bt "__percpu_arg(2)",%1\n\t"
+ asm volatile("bt "__percpu_arg(2)",%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
: "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC)
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
_PAGE_DIRTY | _PAGE_ENC)
+#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER)
#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC)
#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC)
struct thread_struct {
/* Cached TLS descriptors: */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+#ifdef CONFIG_X86_32
unsigned long sp0;
+#endif
unsigned long sp;
#ifdef CONFIG_X86_32
unsigned long sysenter_cs;
}
static inline void
-native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+native_load_sp0(unsigned long sp0)
{
- tss->x86_tss.sp0 = thread->sp0;
-#ifdef CONFIG_X86_32
- /* Only happens when SEP is enabled, no need to test "SEP"arately: */
- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
- tss->x86_tss.ss1 = thread->sysenter_cs;
- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
- }
-#endif
+ this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
}
static inline void native_swapgs(void)
#endif
}
+static inline bool on_thread_stack(void)
+{
+ return (unsigned long)(current_top_of_stack() -
+ current_stack_pointer) < THREAD_SIZE;
+}
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
#define __cpuid native_cpuid
-static inline void load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+static inline void load_sp0(unsigned long sp0)
{
- native_load_sp0(tss, thread);
+ native_load_sp0(sp0);
}
#define set_iopl_mask native_set_iopl_mask
#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
TOP_OF_KERNEL_STACK_PADDING)
+#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
+
+#define task_pt_regs(task) \
+({ \
+ unsigned long __ptr = (unsigned long)task_stack_page(task); \
+ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
+ ((struct pt_regs *)__ptr) - 1; \
+})
+
#ifdef CONFIG_X86_32
/*
* User space process size: 3GB (default).
.addr_limit = KERNEL_DS, \
}
-/*
- * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
- * This is necessary to guarantee that the entire "struct pt_regs"
- * is accessible even if the CPU haven't stored the SS/ESP registers
- * on the stack (interrupt gate does not save these registers
- * when switching to the same priv ring).
- * Therefore beware: accessing the ss/esp fields of the
- * "struct pt_regs" is possible, but they may contain the
- * completely wrong values.
- */
-#define task_pt_regs(task) \
-({ \
- unsigned long __ptr = (unsigned long)task_stack_page(task); \
- __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
- ((struct pt_regs *)__ptr) - 1; \
-})
-
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
#define STACK_TOP_MAX TASK_SIZE_MAX
#define INIT_THREAD { \
- .sp0 = TOP_OF_INIT_STACK, \
.addr_limit = KERNEL_DS, \
}
-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
extern unsigned long KSTK_ESP(struct task_struct *task);
#endif /* CONFIG_X86_64 */
#endif
}
-#ifdef CONFIG_X86_64
static inline bool user_64bit_mode(struct pt_regs *regs)
{
+#ifdef CONFIG_X86_64
#ifndef CONFIG_PARAVIRT
/*
* On non-paravirt systems, this is the only long mode CPL 3
/* Headers are too twisted for this to go in paravirt.h. */
return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
#endif
+#else /* !CONFIG_X86_64 */
+ return false;
+#endif
}
+#ifdef CONFIG_X86_64
#define current_user_stack_pointer() current_pt_regs()->sp
#define compat_user_stack_pointer() current_pt_regs()->sp
#endif
#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
do { \
bool c; \
- asm volatile (fullop ";" CC_SET(cc) \
+ asm volatile (fullop CC_SET(cc) \
: [counter] "+m" (var), CC_OUT(cc) (c) \
: __VA_ARGS__ : clobbers); \
return c; \
#ifndef _ASM_X86_SWITCH_TO_H
#define _ASM_X86_SWITCH_TO_H
+#include <linux/sched/task_stack.h>
+
struct task_struct; /* one of the stranger aspects of C forward declarations */
struct task_struct *__switch_to_asm(struct task_struct *prev,
((last) = __switch_to_asm((prev), (next))); \
} while (0)
+#ifdef CONFIG_X86_32
+static inline void refresh_sysenter_cs(struct thread_struct *thread)
+{
+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+ if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+ return;
+
+ this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+}
+#endif
+
+/* This is used when switching tasks or entering/exiting vm86 mode. */
+static inline void update_sp0(struct task_struct *task)
+{
+#ifdef CONFIG_X86_32
+ load_sp0(task->thread.sp0);
+#else
+ load_sp0(task_top_of_stack(task));
+#endif
+}
+
#endif /* _ASM_X86_SWITCH_TO_H */
asmlinkage long sys_iopl(unsigned int);
/* kernel/ldt.c */
-asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
+asmlinkage long sys_modify_ldt(int, void __user *, unsigned long);
/* kernel/signal.c */
asmlinkage long sys_rt_sigreturn(void);
)
);
-DEFINE_EVENT(x86_fpu, x86_fpu_state,
- TP_PROTO(struct fpu *fpu),
- TP_ARGS(fpu)
-);
-
DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
TP_PROTO(struct fpu *fpu),
TP_ARGS(fpu)
TP_ARGS(fpu)
);
-DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state,
- TP_PROTO(struct fpu *fpu),
- TP_ARGS(fpu)
-);
-
DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
TP_PROTO(struct fpu *fpu),
TP_ARGS(fpu)
#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
asmlinkage void xen_divide_error(void);
+asmlinkage void xen_xennmi(void);
asmlinkage void xen_xendebug(void);
asmlinkage void xen_xenint3(void);
-asmlinkage void xen_nmi(void);
asmlinkage void xen_overflow(void);
asmlinkage void xen_bounds(void);
asmlinkage void xen_invalid_op(void);
X86_TRAP_IRET = 32, /* 32, IRET Exception */
};
+/*
+ * Page fault error code bits:
+ *
+ * bit 0 == 0: no page found 1: protection fault
+ * bit 1 == 0: read access 1: write access
+ * bit 2 == 0: kernel-mode access 1: user-mode access
+ * bit 3 == 1: use of reserved bit detected
+ * bit 4 == 1: fault was an instruction fetch
+ * bit 5 == 1: protection keys block access
+ */
+enum x86_pf_error_code {
+ X86_PF_PROT = 1 << 0,
+ X86_PF_WRITE = 1 << 1,
+ X86_PF_USER = 1 << 2,
+ X86_PF_RSVD = 1 << 3,
+ X86_PF_INSTR = 1 << 4,
+ X86_PF_PK = 1 << 5,
+};
#endif /* _ASM_X86_TRAPS_H */
--- /dev/null
+#ifndef _ASM_X86_UMIP_H
+#define _ASM_X86_UMIP_H
+
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_X86_INTEL_UMIP
+bool fixup_umip_exception(struct pt_regs *regs);
+#else
+static inline bool fixup_umip_exception(struct pt_regs *regs) { return false; }
+#endif /* CONFIG_X86_INTEL_UMIP */
+#endif /* _ASM_X86_UMIP_H */
struct task_struct *task;
int graph_idx;
bool error;
-#if defined(CONFIG_ORC_UNWINDER)
+#if defined(CONFIG_UNWINDER_ORC)
bool signal, full_regs;
unsigned long sp, bp, ip;
struct pt_regs *regs;
-#elif defined(CONFIG_FRAME_POINTER_UNWINDER)
+#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
bool got_irq;
unsigned long *bp, *orig_sp, ip;
struct pt_regs *regs;
__unwind_start(state, task, regs, first_frame);
}
-#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER)
+#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
{
if (unwind_done(state))
}
#endif
-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
void unwind_init(void);
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
void *orc, size_t orc_size);
#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
#define KVM_PV_EOI_DISABLED 0x0
-
#endif /* _UAPI_ASM_X86_KVM_PARA_H */
#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT)
#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */
#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT)
+#define X86_CR4_UMIP_BIT 11 /* enable UMIP support */
+#define X86_CR4_UMIP _BITUL(X86_CR4_UMIP_BIT)
#define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */
#define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT)
#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */
#define CX86_ARR_BASE 0xc4
#define CX86_RCR_BASE 0xdc
+#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
+ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
+ X86_CR0_PG)
#endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */
KASAN_SANITIZE_head$(BITS).o := n
KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
-KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_paravirt.o := n
-OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
+obj-$(CONFIG_X86_INTEL_UMIP) += umip.o
-obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o
-obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o
-obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o
+obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
+obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
+obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
###
# 64 bit specific files
{
const s32 *poff;
- mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
if (*ptr == 0x3e)
text_poke(ptr, ((unsigned char []){0xf0}), 1);
}
- mutex_unlock(&text_mutex);
}
static void alternatives_smp_unlock(const s32 *start, const s32 *end,
{
const s32 *poff;
- mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
if (*ptr == 0xf0)
text_poke(ptr, ((unsigned char []){0x3E}), 1);
}
- mutex_unlock(&text_mutex);
}
struct smp_alt_module {
struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
-static DEFINE_MUTEX(smp_alt);
-static bool uniproc_patched = false; /* protected by smp_alt */
+static bool uniproc_patched = false; /* protected by text_mutex */
void __init_or_module alternatives_smp_module_add(struct module *mod,
char *name,
{
struct smp_alt_module *smp;
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
if (!uniproc_patched)
goto unlock;
smp_unlock:
alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
void __init_or_module alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
kfree(item);
break;
}
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
void alternatives_enable_smp(void)
/* Why bother if there are no other CPUs? */
BUG_ON(num_possible_cpus() == 1);
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
if (uniproc_patched) {
pr_info("switching to SMP code\n");
mod->text, mod->text_end);
uniproc_patched = false;
}
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
-/* Return 1 if the address range is reserved for smp-alternatives */
+/*
+ * Return 1 if the address range is reserved for SMP-alternatives.
+ * Must hold text_mutex.
+ */
int alternatives_text_reserved(void *start, void *end)
{
struct smp_alt_module *mod;
u8 *text_start = start;
u8 *text_end = end;
+ lockdep_assert_held(&text_mutex);
+
list_for_each_entry(mod, &smp_alt_modules, next) {
if (mod->text > text_end || mod->text_end < text_start)
continue;
obj-y += match.o
obj-y += bugs.o
obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
+obj-y += cpuid-deps.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
}
}
+static __always_inline void setup_umip(struct cpuinfo_x86 *c)
+{
+ /* Check the boot processor, plus build option for UMIP. */
+ if (!cpu_feature_enabled(X86_FEATURE_UMIP))
+ goto out;
+
+ /* Check the current processor's cpuid bits. */
+ if (!cpu_has(c, X86_FEATURE_UMIP))
+ goto out;
+
+ cr4_set_bits(X86_CR4_UMIP);
+
+ return;
+
+out:
+ /*
+ * Make sure UMIP is disabled in case it was enabled in a
+ * previous boot (e.g., via kexec).
+ */
+ cr4_clear_bits(X86_CR4_UMIP);
+}
+
/*
* Protection Keys are not available in 32-bit mode.
*/
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
- /* Set up SMEP/SMAP */
+ /* Set up SMEP/SMAP/UMIP */
setup_smep(c);
setup_smap(c);
+ setup_umip(c);
/*
* The vendor-specific functions might have changed features.
pr_cont(")\n");
}
-static __init int setup_disablecpuid(char *arg)
+/*
+ * clearcpuid= was already parsed in fpu__init_parse_early_param.
+ * But we need to keep a dummy __setup around otherwise it would
+ * show up as an environment variable for init.
+ */
+static __init int setup_clearcpuid(char *arg)
{
- int bit;
-
- if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32)
- setup_clear_cpu_cap(bit);
- else
- return 0;
-
return 1;
}
-__setup("clearcpuid=", setup_disablecpuid);
+__setup("clearcpuid=", setup_clearcpuid);
#ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(union irq_stack_union,
initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, me);
- load_sp0(t, ¤t->thread);
+ /*
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+ */
set_tss_desc(cpu, t);
load_TR_desc();
+
load_mm_ldt(&init_mm);
clear_all_debug_regs();
int cpu = smp_processor_id();
struct task_struct *curr = current;
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
- struct thread_struct *thread = &curr->thread;
wait_for_master_cpu(cpu);
initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, curr);
- load_sp0(t, thread);
+ /*
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+ */
set_tss_desc(cpu, t);
load_TR_desc();
+
load_mm_ldt(&init_mm);
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
--- /dev/null
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+struct cpuid_dep {
+ unsigned int feature;
+ unsigned int depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+const static struct cpuid_dep cpuid_deps[] = {
+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM2, X86_FEATURE_XMM },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
+ { X86_FEATURE_AES, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ {}
+};
+
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ /*
+ * Note: This could use the non atomic __*_bit() variants, but the
+ * rest of the cpufeature code uses atomics as well, so keep it for
+ * consistency. Cleanup all of it separately.
+ */
+ if (!c) {
+ clear_cpu_cap(&boot_cpu_data, feature);
+ set_bit(feature, (unsigned long *)cpu_caps_cleared);
+ } else {
+ clear_bit(feature, (unsigned long *)c->x86_capability);
+ }
+}
+
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
+ const struct cpuid_dep *d;
+ bool changed;
+
+ if (WARN_ON(feature >= MAX_FEATURE_BITS))
+ return;
+
+ clear_feature(c, feature);
+
+ /* Collect all features to disable, handling dependencies */
+ memset(disable, 0, sizeof(disable));
+ __set_bit(feature, disable);
+
+ /* Loop until we get a stable state. */
+ do {
+ changed = false;
+ for (d = cpuid_deps; d->feature; d++) {
+ if (!test_bit(d->depends, disable))
+ continue;
+ if (__test_and_set_bit(d->feature, disable))
+ continue;
+
+ changed = true;
+ clear_feature(c, d->feature);
+ }
+ } while (changed);
+}
+
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ do_clear_cpu_cap(c, feature);
+}
+
+void setup_clear_cpu_cap(unsigned int feature)
+{
+ do_clear_cpu_cap(NULL, feature);
+}
}
#ifdef CONFIG_KEXEC_FILE
-static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
+static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
{
unsigned int *nr_ranges = arg;
return ret;
}
-static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
+static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
{
struct crash_elf_data *ced = arg;
Elf64_Ehdr *ehdr;
ehdr = ced->ehdr;
/* Exclude unwanted mem ranges */
- ret = elf_header_exclude_ranges(ced, start, end);
+ ret = elf_header_exclude_ranges(ced, res->start, res->end);
if (ret)
return ret;
return 0;
}
-static int memmap_entry_callback(u64 start, u64 end, void *arg)
+static int memmap_entry_callback(struct resource *res, void *arg)
{
struct crash_memmap_data *cmd = arg;
struct boot_params *params = cmd->params;
struct e820_entry ei;
- ei.addr = start;
- ei.size = end - start + 1;
+ ei.addr = res->start;
+ ei.size = resource_size(res);
ei.type = cmd->type;
add_e820_entry(params, &ei);
return ret;
}
-static int determine_backup_region(u64 start, u64 end, void *arg)
+static int determine_backup_region(struct resource *res, void *arg)
{
struct kimage *image = arg;
- image->arch.backup_src_start = start;
- image->arch.backup_src_sz = end - start + 1;
+ image->arch.backup_src_start = res->start;
+ image->arch.backup_src_sz = resource_size(res);
/* Expecting only one range for backup region */
return 1;
*/
static void __init fpu__init_parse_early_param(void)
{
+ char arg[32];
+ char *argptr = arg;
+ int bit;
+
if (cmdline_find_option_bool(boot_command_line, "no387"))
setup_clear_cpu_cap(X86_FEATURE_FPU);
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
+ sizeof(arg)) &&
+ get_option(&argptr, &bit) &&
+ bit >= 0 &&
+ bit < NCAPINTS * 32)
+ setup_clear_cpu_cap(bit);
}
/*
#include <asm/fpu/xstate.h>
#include <asm/tlbflush.h>
+#include <asm/cpufeature.h>
/*
* Although we spell it out in here, the Processor Trace
"unknown xstate feature" ,
};
+static short xsave_cpuid_features[] __initdata = {
+ X86_FEATURE_FPU,
+ X86_FEATURE_XMM,
+ X86_FEATURE_AVX,
+ X86_FEATURE_MPX,
+ X86_FEATURE_MPX,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_INTEL_PT,
+ X86_FEATURE_PKU,
+};
+
/*
* Mask of xstate features supported by the CPU and the kernel:
*/
void fpu__xstate_clear_all_cpu_caps(void)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
- setup_clear_cpu_cap(X86_FEATURE_AVX);
- setup_clear_cpu_cap(X86_FEATURE_AVX2);
- setup_clear_cpu_cap(X86_FEATURE_AVX512F);
- setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
- setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
- setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
- setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
- setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
- setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
- setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
- setup_clear_cpu_cap(X86_FEATURE_MPX);
- setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
- setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
- setup_clear_cpu_cap(X86_FEATURE_PKU);
- setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
- setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
- setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ);
}
/*
unsigned int eax, ebx, ecx, edx;
static int on_boot_cpu __initdata = 1;
int err;
+ int i;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
goto out_disable;
}
+ /*
+ * Clear XSAVE features that are disabled in the normal CPUID.
+ */
+ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
+ if (!boot_cpu_has(xsave_cpuid_features[i]))
+ xfeatures_mask &= ~BIT(i);
+ }
+
xfeatures_mask &= fpu__get_supported_xfeatures_mask();
/* Enable xstate instructions to be able to continue with initialization: */
#endif
.Ldefault_entry:
-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
- X86_CR0_PG)
movl $(CR0_STATE & ~X86_CR0_PG),%eax
movl %eax,%cr0
# 24(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
pushl $0 # Dummy error code, to make stack frame uniform
.endif
pushl $i # 20(%esp) Vector number
*
*/
-#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
+#endif
L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
.code64
.globl startup_64
startup_64:
+ UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded an identity mapped page table
addq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
+ UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded a mapped page table.
movq $1f, %rax
jmp *%rax
1:
+ UNWIND_HINT_EMPTY
/* Check if nx is implemented */
movl $0x80000001, %eax
1: wrmsr /* Make changes effective */
/* Setup cr0 */
-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
- X86_CR0_PG)
movl $CR0_STATE, %eax
/* Make changes effective */
movq %rax, %cr0
pushq %rax # target address in negative space
lretq
.Lafter_lret:
-ENDPROC(secondary_startup_64)
+END(secondary_startup_64)
#include "verify_cpu.S"
*/
ENTRY(start_cpu0)
movq initial_stack(%rip), %rsp
+ UNWIND_HINT_EMPTY
jmp .Ljump_to_C_code
ENDPROC(start_cpu0)
#endif
.quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
__FINITDATA
-bad_address:
- jmp bad_address
-
__INIT
ENTRY(early_idt_handler_array)
- # 104(%rsp) %rflags
- # 96(%rsp) %cs
- # 88(%rsp) %rip
- # 80(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
- pushq $0 # Dummy error code, to make stack frame uniform
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+ UNWIND_HINT_IRET_REGS
+ pushq $0 # Dummy error code, to make stack frame uniform
+ .else
+ UNWIND_HINT_IRET_REGS offset=8
.endif
pushq $i # 72(%rsp) Vector number
jmp early_idt_handler_common
+ UNWIND_HINT_IRET_REGS
i = i + 1
.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
-ENDPROC(early_idt_handler_array)
+ UNWIND_HINT_IRET_REGS offset=16
+END(early_idt_handler_array)
early_idt_handler_common:
/*
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
cmpq $14,%rsi /* Page fault? */
jnz 10f
20:
decl early_recursion_flag(%rip)
- jmp restore_regs_and_iret
-ENDPROC(early_idt_handler_common)
+ jmp restore_regs_and_return_to_kernel
+END(early_idt_handler_common)
__INITDATA
.data
-#ifndef CONFIG_XEN
-NEXT_PAGE(init_top_pgt)
- .fill 512,8,0
-#else
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
* Don't set NX because code runs from these pages.
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#else
+NEXT_PAGE(init_top_pgt)
+ .fill 512,8,0
#endif
#ifdef CONFIG_X86_5LEVEL
EXPORT_SYMBOL(phys_base)
#include "../../x86/xen/xen-head.S"
-
+
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
-static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
-static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;
/*
cpu, (unsigned long long) slow_virt_to_phys(st));
}
-static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
+static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}
+static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
+{
+ early_set_memory_decrypted((unsigned long) ptr, size);
+}
+
+/*
+ * Iterate through all possible CPUs and map the memory region pointed
+ * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
+ *
+ * Note: we iterate through all possible CPUs to ensure that CPUs
+ * hotplugged will have their per-cpu variable already mapped as
+ * decrypted.
+ */
+static void __init sev_map_percpu_data(void)
+{
+ int cpu;
+
+ if (!sev_active())
+ return;
+
+ for_each_possible_cpu(cpu) {
+ __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
+ __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
+ __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
+ }
+}
+
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
+ /*
+ * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
+ * shares the guest physical address with the hypervisor.
+ */
+ sev_map_percpu_data();
+
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
kvm_spinlock_init();
kvm_cpu_online, kvm_cpu_down_prepare) < 0)
pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
#else
+ sev_map_percpu_data();
kvm_guest_cpu_init();
#endif
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <asm/mem_encrypt.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
#include <asm/kvmclock.h>
/* The hypervisor will put information about time periodically here */
static struct pvclock_vsyscall_time_info *hv_clock;
-static struct pvclock_wall_clock wall_clock;
+static struct pvclock_wall_clock *wall_clock;
struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
{
int low, high;
int cpu;
- low = (int)__pa_symbol(&wall_clock);
- high = ((u64)__pa_symbol(&wall_clock) >> 32);
+ low = (int)slow_virt_to_phys(wall_clock);
+ high = ((u64)slow_virt_to_phys(wall_clock) >> 32);
native_write_msr(msr_kvm_wall_clock, low, high);
cpu = get_cpu();
vcpu_time = &hv_clock[cpu].pvti;
- pvclock_read_wallclock(&wall_clock, vcpu_time, now);
+ pvclock_read_wallclock(wall_clock, vcpu_time, now);
put_cpu();
}
native_machine_shutdown();
}
+static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size,
+ phys_addr_t align)
+{
+ phys_addr_t mem;
+
+ mem = memblock_alloc(size, align);
+ if (!mem)
+ return 0;
+
+ if (sev_active()) {
+ if (early_set_memory_decrypted((unsigned long)__va(mem), size))
+ goto e_free;
+ }
+
+ return mem;
+e_free:
+ memblock_free(mem, size);
+ return 0;
+}
+
+static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size)
+{
+ if (sev_active())
+ early_set_memory_encrypted((unsigned long)__va(addr), size);
+
+ memblock_free(addr, size);
+}
+
void __init kvmclock_init(void)
{
struct pvclock_vcpu_time_info *vcpu_time;
- unsigned long mem;
- int size, cpu;
+ unsigned long mem, mem_wall_clock;
+ int size, cpu, wall_clock_size;
u8 flags;
size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
} else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
return;
- printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
- msr_kvm_system_time, msr_kvm_wall_clock);
+ wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock));
+ mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE);
+ if (!mem_wall_clock)
+ return;
- mem = memblock_alloc(size, PAGE_SIZE);
- if (!mem)
+ wall_clock = __va(mem_wall_clock);
+ memset(wall_clock, 0, wall_clock_size);
+
+ mem = kvm_memblock_alloc(size, PAGE_SIZE);
+ if (!mem) {
+ kvm_memblock_free(mem_wall_clock, wall_clock_size);
+ wall_clock = NULL;
return;
+ }
+
hv_clock = __va(mem);
memset(hv_clock, 0, size);
if (kvm_register_clock("primary cpu clock")) {
hv_clock = NULL;
- memblock_free(mem, size);
+ kvm_memblock_free(mem, size);
+ kvm_memblock_free(mem_wall_clock, wall_clock_size);
+ wall_clock = NULL;
return;
}
+ printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+ msr_kvm_system_time, msr_kvm_wall_clock);
+
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
return error;
}
-asmlinkage int sys_modify_ldt(int func, void __user *ptr,
- unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+ unsigned long , bytecount)
{
int ret = -ENOSYS;
ret = write_ldt(ptr, bytecount, 0);
break;
}
- return ret;
+ /*
+ * The SYSCALL_DEFINE() macros give us an 'unsigned long'
+ * return type, but tht ABI for sys_modify_ldt() expects
+ * 'int'. This cast gives us an int-sized value in %rax
+ * for the return code. The 'unsigned' is necessary so
+ * the compiler does not try to sign-extend the negative
+ * return codes into the high half of the register when
+ * taking the value from int->long.
+ */
+ return (unsigned int)ret;
}
#include <linux/init.h>
#include <linux/ioport.h>
-static int found(u64 start, u64 end, void *data)
+static int found(struct resource *res, void *data)
{
return 1;
}
*/
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
.x86_tss = {
- .sp0 = TOP_OF_INIT_STACK,
+ /*
+ * .sp0 is only used when entering ring 0 from a lower
+ * privilege level. Since the init task never runs anything
+ * but ring 0 code, there is no need for a valid value here.
+ * Poison it.
+ */
+ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
/*
* Reload esp0 and cpu_current_top_of_stack. This changes
- * current_thread_info().
+ * current_thread_info(). Refresh the SYSENTER configuration in
+ * case prev or next is vm86.
*/
- load_sp0(tss, next);
+ update_sp0(next_p);
+ refresh_sysenter_cs(next);
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
struct inactive_task_frame *frame;
struct task_struct *me = current;
- p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs = task_pt_regs(p);
fork_frame = container_of(childregs, struct fork_frame, regs);
frame = &fork_frame->frame;
*/
this_cpu_write(current_task, next_p);
- /* Reload esp0 and ss1. This changes current_thread_info(). */
- load_sp0(tss, next);
+ /* Reload sp0. */
+ update_sp0(next_p);
/*
* Now maybe reload the debug registers and handle I/O bitmaps
* If SME is active, this memory will be marked encrypted by the
* kernel when it is accessed (including relocation). However, the
* ramdisk image was loaded decrypted by the bootloader, so make
- * sure that it is encrypted before accessing it.
+ * sure that it is encrypted before accessing it. For SEV the
+ * ramdisk will already be encrypted, so only do this for SME.
*/
- sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
+ if (sme_active())
+ sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
initrd_start = 0;
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
- per_cpu(cpu_current_top_of_stack, cpu) =
- (unsigned long)task_stack_page(idle) + THREAD_SIZE;
+ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
#else
initial_gs = per_cpu_offset(cpu);
#endif
#include <asm/trace/mpx.h>
#include <asm/mpx.h>
#include <asm/vm86.h>
+#include <asm/umip.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
* will catch asm bugs and any attempt to use ist_preempt_enable
* from double_fault.
*/
- BUG_ON((unsigned long)(current_top_of_stack() -
- current_stack_pointer) >= THREAD_SIZE);
+ BUG_ON(!on_thread_stack());
preempt_enable_no_resched();
}
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
+ if (static_cpu_has(X86_FEATURE_UMIP)) {
+ if (user_mode(regs) && fixup_umip_exception(regs))
+ return;
+ }
+
if (v8086_mode(regs)) {
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
--- /dev/null
+/*
+ * umip.c Emulation for instruction protected by the Intel User-Mode
+ * Instruction Prevention feature
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ * Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+ */
+
+#include <linux/uaccess.h>
+#include <asm/umip.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <linux/ratelimit.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "umip: " fmt
+
+/** DOC: Emulation for User-Mode Instruction Prevention (UMIP)
+ *
+ * The feature User-Mode Instruction Prevention present in recent Intel
+ * processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str)
+ * from being executed with CPL > 0. Otherwise, a general protection fault is
+ * issued.
+ *
+ * Rather than relaying to the user space the general protection fault caused by
+ * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be
+ * trapped and emulate the result of such instructions to provide dummy values.
+ * This allows to both conserve the current kernel behavior and not reveal the
+ * system resources that UMIP intends to protect (i.e., the locations of the
+ * global descriptor and interrupt descriptor tables, the segment selectors of
+ * the local descriptor table, the value of the task state register and the
+ * contents of the CR0 register).
+ *
+ * This emulation is needed because certain applications (e.g., WineHQ and
+ * DOSEMU2) rely on this subset of instructions to function.
+ *
+ * The instructions protected by UMIP can be split in two groups. Those which
+ * return a kernel memory address (sgdt and sidt) and those which return a
+ * value (sldt, str and smsw).
+ *
+ * For the instructions that return a kernel memory address, applications
+ * such as WineHQ rely on the result being located in the kernel memory space,
+ * not the actual location of the table. The result is emulated as a hard-coded
+ * value that, lies close to the top of the kernel memory. The limit for the GDT
+ * and the IDT are set to zero.
+ *
+ * Given that sldt and str are not commonly used in programs that run on WineHQ
+ * or DOSEMU2, they are not emulated.
+ *
+ * The instruction smsw is emulated to return the value that the register CR0
+ * has at boot time as set in the head_32.
+ *
+ * Also, emulation is provided only for 32-bit processes; 64-bit processes
+ * that attempt to use the instructions that UMIP protects will receive the
+ * SIGSEGV signal issued as a consequence of the general protection fault.
+ *
+ * Care is taken to appropriately emulate the results when segmentation is
+ * used. That is, rather than relying on USER_DS and USER_CS, the function
+ * insn_get_addr_ref() inspects the segment descriptor pointed by the
+ * registers in pt_regs. This ensures that we correctly obtain the segment
+ * base address and the address and operand sizes even if the user space
+ * application uses a local descriptor table.
+ */
+
+#define UMIP_DUMMY_GDT_BASE 0xfffe0000
+#define UMIP_DUMMY_IDT_BASE 0xffff0000
+
+/*
+ * The SGDT and SIDT instructions store the contents of the global descriptor
+ * table and interrupt table registers, respectively. The destination is a
+ * memory operand of X+2 bytes. X bytes are used to store the base address of
+ * the table and 2 bytes are used to store the limit. In 32-bit processes, the
+ * only processes for which emulation is provided, X has a value of 4.
+ */
+#define UMIP_GDT_IDT_BASE_SIZE 4
+#define UMIP_GDT_IDT_LIMIT_SIZE 2
+
+#define UMIP_INST_SGDT 0 /* 0F 01 /0 */
+#define UMIP_INST_SIDT 1 /* 0F 01 /1 */
+#define UMIP_INST_SMSW 3 /* 0F 01 /4 */
+
+/**
+ * identify_insn() - Identify a UMIP-protected instruction
+ * @insn: Instruction structure with opcode and ModRM byte.
+ *
+ * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected
+ * instruction that can be emulated.
+ *
+ * Returns:
+ *
+ * On success, a constant identifying a specific UMIP-protected instruction that
+ * can be emulated.
+ *
+ * -EINVAL on error or when not an UMIP-protected instruction that can be
+ * emulated.
+ */
+static int identify_insn(struct insn *insn)
+{
+ /* By getting modrm we also get the opcode. */
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ /* All the instructions of interest start with 0x0f. */
+ if (insn->opcode.bytes[0] != 0xf)
+ return -EINVAL;
+
+ if (insn->opcode.bytes[1] == 0x1) {
+ switch (X86_MODRM_REG(insn->modrm.value)) {
+ case 0:
+ return UMIP_INST_SGDT;
+ case 1:
+ return UMIP_INST_SIDT;
+ case 4:
+ return UMIP_INST_SMSW;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* SLDT AND STR are not emulated */
+ return -EINVAL;
+}
+
+/**
+ * emulate_umip_insn() - Emulate UMIP instructions and return dummy values
+ * @insn: Instruction structure with operands
+ * @umip_inst: A constant indicating the instruction to emulate
+ * @data: Buffer into which the dummy result is stored
+ * @data_size: Size of the emulated result
+ *
+ * Emulate an instruction protected by UMIP and provide a dummy result. The
+ * result of the emulation is saved in @data. The size of the results depends
+ * on both the instruction and type of operand (register vs memory address).
+ * The size of the result is updated in @data_size. Caller is responsible
+ * of providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE +
+ * UMIP_GDT_IDT_LIMIT_SIZE bytes.
+ *
+ * Returns:
+ *
+ * 0 on success, -EINVAL on error while emulating.
+ */
+static int emulate_umip_insn(struct insn *insn, int umip_inst,
+ unsigned char *data, int *data_size)
+{
+ unsigned long dummy_base_addr, dummy_value;
+ unsigned short dummy_limit = 0;
+
+ if (!data || !data_size || !insn)
+ return -EINVAL;
+ /*
+ * These two instructions return the base address and limit of the
+ * global and interrupt descriptor table, respectively. According to the
+ * Intel Software Development manual, the base address can be 24-bit,
+ * 32-bit or 64-bit. Limit is always 16-bit. If the operand size is
+ * 16-bit, the returned value of the base address is supposed to be a
+ * zero-extended 24-byte number. However, it seems that a 32-byte number
+ * is always returned irrespective of the operand size.
+ */
+ if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) {
+ /* SGDT and SIDT do not use registers operands. */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ return -EINVAL;
+
+ if (umip_inst == UMIP_INST_SGDT)
+ dummy_base_addr = UMIP_DUMMY_GDT_BASE;
+ else
+ dummy_base_addr = UMIP_DUMMY_IDT_BASE;
+
+ *data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE;
+
+ memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE);
+ memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE);
+
+ } else if (umip_inst == UMIP_INST_SMSW) {
+ dummy_value = CR0_STATE;
+
+ /*
+ * Even though the CR0 register has 4 bytes, the number
+ * of bytes to be copied in the result buffer is determined
+ * by whether the operand is a register or a memory location.
+ * If operand is a register, return as many bytes as the operand
+ * size. If operand is memory, return only the two least
+ * siginificant bytes of CR0.
+ */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ *data_size = insn->opnd_bytes;
+ else
+ *data_size = 2;
+
+ memcpy(data, &dummy_value, *data_size);
+ /* STR and SLDT are not emulated */
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR
+ * @addr: Address that caused the signal
+ * @regs: Register set containing the instruction pointer
+ *
+ * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is
+ * intended to be used to provide a segmentation fault when the result of the
+ * UMIP emulation could not be copied to the user space memory.
+ *
+ * Returns: none
+ */
+static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
+{
+ siginfo_t info;
+ struct task_struct *tsk = current;
+
+ tsk->thread.cr2 = (unsigned long)addr;
+ tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_MAPERR;
+ info.si_addr = addr;
+ force_sig_info(SIGSEGV, &info, tsk);
+
+ if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV)))
+ return;
+
+ pr_err_ratelimited("%s[%d] umip emulation segfault ip:%lx sp:%lx error:%x in %lx\n",
+ tsk->comm, task_pid_nr(tsk), regs->ip,
+ regs->sp, X86_PF_USER | X86_PF_WRITE,
+ regs->ip);
+}
+
+/**
+ * fixup_umip_exception() - Fixup a general protection fault caused by UMIP
+ * @regs: Registers as saved when entering the #GP handler
+ *
+ * The instructions sgdt, sidt, str, smsw, sldt cause a general protection
+ * fault if executed with CPL > 0 (i.e., from user space). If the offending
+ * user-space process is not in long mode, this function fixes the exception
+ * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not
+ * fixed up. Also long mode user-space processes are not fixed up.
+ *
+ * If operands are memory addresses, results are copied to user-space memory as
+ * indicated by the instruction pointed by eIP using the registers indicated in
+ * the instruction operands. If operands are registers, results are copied into
+ * the context that was saved when entering kernel mode.
+ *
+ * Returns:
+ *
+ * True if emulation was successful; false if not.
+ */
+bool fixup_umip_exception(struct pt_regs *regs)
+{
+ int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst;
+ unsigned long seg_base = 0, *reg_addr;
+ /* 10 bytes is the maximum size of the result of UMIP instructions */
+ unsigned char dummy_data[10] = { 0 };
+ unsigned char buf[MAX_INSN_SIZE];
+ void __user *uaddr;
+ struct insn insn;
+ char seg_defs;
+
+ if (!regs)
+ return false;
+
+ /* Do not emulate 64-bit processes. */
+ if (user_64bit_mode(regs))
+ return false;
+
+ /*
+ * If not in user-space long mode, a custom code segment could be in
+ * use. This is true in protected mode (if the process defined a local
+ * descriptor table), or virtual-8086 mode. In most of the cases
+ * seg_base will be zero as in USER_CS.
+ */
+ if (!user_64bit_mode(regs))
+ seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
+
+ if (seg_base == -1L)
+ return false;
+
+ not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
+ sizeof(buf));
+ nr_copied = sizeof(buf) - not_copied;
+
+ /*
+ * The copy_from_user above could have failed if user code is protected
+ * by a memory protection key. Give up on emulation in such a case.
+ * Should we issue a page fault?
+ */
+ if (!nr_copied)
+ return false;
+
+ insn_init(&insn, buf, nr_copied, user_64bit_mode(regs));
+
+ /*
+ * Override the default operand and address sizes with what is specified
+ * in the code segment descriptor. The instruction decoder only sets
+ * the address size it to either 4 or 8 address bytes and does nothing
+ * for the operand bytes. This OK for most of the cases, but we could
+ * have special cases where, for instance, a 16-bit code segment
+ * descriptor is used.
+ * If there is an address override prefix, the instruction decoder
+ * correctly updates these values, even for 16-bit defaults.
+ */
+ seg_defs = insn_get_code_seg_params(regs);
+ if (seg_defs == -EINVAL)
+ return false;
+
+ insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
+ insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
+
+ insn_get_length(&insn);
+ if (nr_copied < insn.length)
+ return false;
+
+ umip_inst = identify_insn(&insn);
+ if (umip_inst < 0)
+ return false;
+
+ if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size))
+ return false;
+
+ /*
+ * If operand is a register, write result to the copy of the register
+ * value that was pushed to the stack when entering into kernel mode.
+ * Upon exit, the value we write will be restored to the actual hardware
+ * register.
+ */
+ if (X86_MODRM_MOD(insn.modrm.value) == 3) {
+ reg_offset = insn_get_modrm_rm_off(&insn, regs);
+
+ /*
+ * Negative values are usually errors. In memory addressing,
+ * the exception is -EDOM. Since we expect a register operand,
+ * all negative values are errors.
+ */
+ if (reg_offset < 0)
+ return false;
+
+ reg_addr = (unsigned long *)((unsigned long)regs + reg_offset);
+ memcpy(reg_addr, dummy_data, dummy_data_size);
+ } else {
+ uaddr = insn_get_addr_ref(&insn, regs);
+ if ((unsigned long)uaddr == -1L)
+ return false;
+
+ nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size);
+ if (nr_copied > 0) {
+ /*
+ * If copy fails, send a signal and tell caller that
+ * fault was fixed up.
+ */
+ force_sig_info_umip_fault(uaddr, regs);
+ return true;
+ }
+ }
+
+ /* increase IP to let the program keep going */
+ regs->ip += insn.length;
+ return true;
+}
int i;
for (i = 0; i < insn->prefixes.nbytes; i++) {
- switch (insn->prefixes.bytes[i]) {
- case 0x26: /* INAT_PFX_ES */
- case 0x2E: /* INAT_PFX_CS */
- case 0x36: /* INAT_PFX_DS */
- case 0x3E: /* INAT_PFX_SS */
- case 0xF0: /* INAT_PFX_LOCK */
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]);
+ switch (attr) {
+ case INAT_MAKE_PREFIX(INAT_PFX_ES):
+ case INAT_MAKE_PREFIX(INAT_PFX_CS):
+ case INAT_MAKE_PREFIX(INAT_PFX_DS):
+ case INAT_MAKE_PREFIX(INAT_PFX_SS):
+ case INAT_MAKE_PREFIX(INAT_PFX_LOCK):
return true;
}
}
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
-verify_cpu:
+ENTRY(verify_cpu)
pushf # Save caller passed flags
push $0 # Kill any dangerous flags
popf
popf # Restore caller passed flags
xorl %eax, %eax
ret
+ENDPROC(verify_cpu)
#include <asm/irq.h>
#include <asm/traps.h>
#include <asm/vm86.h>
+#include <asm/switch_to.h>
/*
* Known problems:
void save_v86_state(struct kernel_vm86_regs *regs, int retval)
{
- struct tss_struct *tss;
struct task_struct *tsk = current;
struct vm86plus_struct __user *user;
struct vm86 *vm86 = current->thread.vm86;
do_exit(SIGSEGV);
}
- tss = &per_cpu(cpu_tss, get_cpu());
+ preempt_disable();
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
- load_sp0(tss, &tsk->thread);
+ update_sp0(tsk);
+ refresh_sysenter_cs(&tsk->thread);
vm86->saved_sp0 = 0;
- put_cpu();
+ preempt_enable();
memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs));
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
{
- struct tss_struct *tss;
struct task_struct *tsk = current;
struct vm86 *vm86 = tsk->thread.vm86;
struct kernel_vm86_regs vm86regs;
vm86->saved_sp0 = tsk->thread.sp0;
lazy_save_gs(vm86->regs32.gs);
- tss = &per_cpu(cpu_tss, get_cpu());
/* make room for real-mode segments */
+ preempt_disable();
tsk->thread.sp0 += 16;
- if (static_cpu_has(X86_FEATURE_SEP))
+ if (static_cpu_has(X86_FEATURE_SEP)) {
tsk->thread.sysenter_cs = 0;
+ refresh_sysenter_cs(&tsk->thread);
+ }
- load_sp0(tss, &tsk->thread);
- put_cpu();
+ update_sp0(tsk);
+ preempt_enable();
if (vm86->flags & VM86_SCREEN_BITMAP)
mark_screen_rdonly(tsk->mm);
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
-lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
+lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
--- /dev/null
+/*
+ * Utility functions for x86 operand and address decoding
+ *
+ * Copyright (C) Intel Corporation 2017
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ratelimit.h>
+#include <linux/mmu_context.h>
+#include <asm/desc_defs.h>
+#include <asm/desc.h>
+#include <asm/inat.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/ldt.h>
+#include <asm/vm86.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "insn: " fmt
+
+enum reg_type {
+ REG_TYPE_RM = 0,
+ REG_TYPE_INDEX,
+ REG_TYPE_BASE,
+};
+
+/**
+ * is_string_insn() - Determine if instruction is a string instruction
+ * @insn: Instruction containing the opcode to inspect
+ *
+ * Returns:
+ *
+ * true if the instruction, determined by the opcode, is any of the
+ * string instructions as defined in the Intel Software Development manual.
+ * False otherwise.
+ */
+static bool is_string_insn(struct insn *insn)
+{
+ insn_get_opcode(insn);
+
+ /* All string instructions have a 1-byte opcode. */
+ if (insn->opcode.nbytes != 1)
+ return false;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0x6c ... 0x6f: /* INS, OUTS */
+ case 0xa4 ... 0xa7: /* MOVS, CMPS */
+ case 0xaa ... 0xaf: /* STOS, LODS, SCAS */
+ return true;
+ default:
+ return false;
+ }
+}
+
+/**
+ * get_seg_reg_override_idx() - obtain segment register override index
+ * @insn: Valid instruction with segment override prefixes
+ *
+ * Inspect the instruction prefixes in @insn and find segment overrides, if any.
+ *
+ * Returns:
+ *
+ * A constant identifying the segment register to use, among CS, SS, DS,
+ * ES, FS, or GS. INAT_SEG_REG_DEFAULT is returned if no segment override
+ * prefixes were found.
+ *
+ * -EINVAL in case of error.
+ */
+static int get_seg_reg_override_idx(struct insn *insn)
+{
+ int idx = INAT_SEG_REG_DEFAULT;
+ int num_overrides = 0, i;
+
+ insn_get_prefixes(insn);
+
+ /* Look for any segment override prefixes. */
+ for (i = 0; i < insn->prefixes.nbytes; i++) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]);
+ switch (attr) {
+ case INAT_MAKE_PREFIX(INAT_PFX_CS):
+ idx = INAT_SEG_REG_CS;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_SS):
+ idx = INAT_SEG_REG_SS;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_DS):
+ idx = INAT_SEG_REG_DS;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_ES):
+ idx = INAT_SEG_REG_ES;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_FS):
+ idx = INAT_SEG_REG_FS;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_GS):
+ idx = INAT_SEG_REG_GS;
+ num_overrides++;
+ break;
+ /* No default action needed. */
+ }
+ }
+
+ /* More than one segment override prefix leads to undefined behavior. */
+ if (num_overrides > 1)
+ return -EINVAL;
+
+ return idx;
+}
+
+/**
+ * check_seg_overrides() - check if segment override prefixes are allowed
+ * @insn: Valid instruction with segment override prefixes
+ * @regoff: Operand offset, in pt_regs, for which the check is performed
+ *
+ * For a particular register used in register-indirect addressing, determine if
+ * segment override prefixes can be used. Specifically, no overrides are allowed
+ * for rDI if used with a string instruction.
+ *
+ * Returns:
+ *
+ * True if segment override prefixes can be used with the register indicated
+ * in @regoff. False if otherwise.
+ */
+static bool check_seg_overrides(struct insn *insn, int regoff)
+{
+ if (regoff == offsetof(struct pt_regs, di) && is_string_insn(insn))
+ return false;
+
+ return true;
+}
+
+/**
+ * resolve_default_seg() - resolve default segment register index for an operand
+ * @insn: Instruction with opcode and address size. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @off: Operand offset, in pt_regs, for which resolution is needed
+ *
+ * Resolve the default segment register index associated with the instruction
+ * operand register indicated by @off. Such index is resolved based on defaults
+ * described in the Intel Software Development Manual.
+ *
+ * Returns:
+ *
+ * If in protected mode, a constant identifying the segment register to use,
+ * among CS, SS, ES or DS. If in long mode, INAT_SEG_REG_IGNORE.
+ *
+ * -EINVAL in case of error.
+ */
+static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off)
+{
+ if (user_64bit_mode(regs))
+ return INAT_SEG_REG_IGNORE;
+ /*
+ * Resolve the default segment register as described in Section 3.7.4
+ * of the Intel Software Development Manual Vol. 1:
+ *
+ * + DS for all references involving r[ABCD]X, and rSI.
+ * + If used in a string instruction, ES for rDI. Otherwise, DS.
+ * + AX, CX and DX are not valid register operands in 16-bit address
+ * encodings but are valid for 32-bit and 64-bit encodings.
+ * + -EDOM is reserved to identify for cases in which no register
+ * is used (i.e., displacement-only addressing). Use DS.
+ * + SS for rSP or rBP.
+ * + CS for rIP.
+ */
+
+ switch (off) {
+ case offsetof(struct pt_regs, ax):
+ case offsetof(struct pt_regs, cx):
+ case offsetof(struct pt_regs, dx):
+ /* Need insn to verify address size. */
+ if (insn->addr_bytes == 2)
+ return -EINVAL;
+
+ case -EDOM:
+ case offsetof(struct pt_regs, bx):
+ case offsetof(struct pt_regs, si):
+ return INAT_SEG_REG_DS;
+
+ case offsetof(struct pt_regs, di):
+ if (is_string_insn(insn))
+ return INAT_SEG_REG_ES;
+ return INAT_SEG_REG_DS;
+
+ case offsetof(struct pt_regs, bp):
+ case offsetof(struct pt_regs, sp):
+ return INAT_SEG_REG_SS;
+
+ case offsetof(struct pt_regs, ip):
+ return INAT_SEG_REG_CS;
+
+ default:
+ return -EINVAL;
+ }
+}
+
+/**
+ * resolve_seg_reg() - obtain segment register index
+ * @insn: Instruction with operands
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Operand offset, in pt_regs, used to deterimine segment register
+ *
+ * Determine the segment register associated with the operands and, if
+ * applicable, prefixes and the instruction pointed by @insn.
+ *
+ * The segment register associated to an operand used in register-indirect
+ * addressing depends on:
+ *
+ * a) Whether running in long mode (in such a case segments are ignored, except
+ * if FS or GS are used).
+ *
+ * b) Whether segment override prefixes can be used. Certain instructions and
+ * registers do not allow override prefixes.
+ *
+ * c) Whether segment overrides prefixes are found in the instruction prefixes.
+ *
+ * d) If there are not segment override prefixes or they cannot be used, the
+ * default segment register associated with the operand register is used.
+ *
+ * The function checks first if segment override prefixes can be used with the
+ * operand indicated by @regoff. If allowed, obtain such overridden segment
+ * register index. Lastly, if not prefixes were found or cannot be used, resolve
+ * the segment register index to use based on the defaults described in the
+ * Intel documentation. In long mode, all segment register indexes will be
+ * ignored, except if overrides were found for FS or GS. All these operations
+ * are done using helper functions.
+ *
+ * The operand register, @regoff, is represented as the offset from the base of
+ * pt_regs.
+ *
+ * As stated, the main use of this function is to determine the segment register
+ * index based on the instruction, its operands and prefixes. Hence, @insn
+ * must be valid. However, if @regoff indicates rIP, we don't need to inspect
+ * @insn at all as in this case CS is used in all cases. This case is checked
+ * before proceeding further.
+ *
+ * Please note that this function does not return the value in the segment
+ * register (i.e., the segment selector) but our defined index. The segment
+ * selector needs to be obtained using get_segment_selector() and passing the
+ * segment register index resolved by this function.
+ *
+ * Returns:
+ *
+ * An index identifying the segment register to use, among CS, SS, DS,
+ * ES, FS, or GS. INAT_SEG_REG_IGNORE is returned if running in long mode.
+ *
+ * -EINVAL in case of error.
+ */
+static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff)
+{
+ int idx;
+
+ /*
+ * In the unlikely event of having to resolve the segment register
+ * index for rIP, do it first. Segment override prefixes should not
+ * be used. Hence, it is not necessary to inspect the instruction,
+ * which may be invalid at this point.
+ */
+ if (regoff == offsetof(struct pt_regs, ip)) {
+ if (user_64bit_mode(regs))
+ return INAT_SEG_REG_IGNORE;
+ else
+ return INAT_SEG_REG_CS;
+ }
+
+ if (!insn)
+ return -EINVAL;
+
+ if (!check_seg_overrides(insn, regoff))
+ return resolve_default_seg(insn, regs, regoff);
+
+ idx = get_seg_reg_override_idx(insn);
+ if (idx < 0)
+ return idx;
+
+ if (idx == INAT_SEG_REG_DEFAULT)
+ return resolve_default_seg(insn, regs, regoff);
+
+ /*
+ * In long mode, segment override prefixes are ignored, except for
+ * overrides for FS and GS.
+ */
+ if (user_64bit_mode(regs)) {
+ if (idx != INAT_SEG_REG_FS &&
+ idx != INAT_SEG_REG_GS)
+ idx = INAT_SEG_REG_IGNORE;
+ }
+
+ return idx;
+}
+
+/**
+ * get_segment_selector() - obtain segment selector
+ * @regs: Register values as seen when entering kernel mode
+ * @seg_reg_idx: Segment register index to use
+ *
+ * Obtain the segment selector from any of the CS, SS, DS, ES, FS, GS segment
+ * registers. In CONFIG_X86_32, the segment is obtained from either pt_regs or
+ * kernel_vm86_regs as applicable. In CONFIG_X86_64, CS and SS are obtained
+ * from pt_regs. DS, ES, FS and GS are obtained by reading the actual CPU
+ * registers. This done for only for completeness as in CONFIG_X86_64 segment
+ * registers are ignored.
+ *
+ * Returns:
+ *
+ * Value of the segment selector, including null when running in
+ * long mode.
+ *
+ * -EINVAL on error.
+ */
+static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
+{
+#ifdef CONFIG_X86_64
+ unsigned short sel;
+
+ switch (seg_reg_idx) {
+ case INAT_SEG_REG_IGNORE:
+ return 0;
+ case INAT_SEG_REG_CS:
+ return (unsigned short)(regs->cs & 0xffff);
+ case INAT_SEG_REG_SS:
+ return (unsigned short)(regs->ss & 0xffff);
+ case INAT_SEG_REG_DS:
+ savesegment(ds, sel);
+ return sel;
+ case INAT_SEG_REG_ES:
+ savesegment(es, sel);
+ return sel;
+ case INAT_SEG_REG_FS:
+ savesegment(fs, sel);
+ return sel;
+ case INAT_SEG_REG_GS:
+ savesegment(gs, sel);
+ return sel;
+ default:
+ return -EINVAL;
+ }
+#else /* CONFIG_X86_32 */
+ struct kernel_vm86_regs *vm86regs = (struct kernel_vm86_regs *)regs;
+
+ if (v8086_mode(regs)) {
+ switch (seg_reg_idx) {
+ case INAT_SEG_REG_CS:
+ return (unsigned short)(regs->cs & 0xffff);
+ case INAT_SEG_REG_SS:
+ return (unsigned short)(regs->ss & 0xffff);
+ case INAT_SEG_REG_DS:
+ return vm86regs->ds;
+ case INAT_SEG_REG_ES:
+ return vm86regs->es;
+ case INAT_SEG_REG_FS:
+ return vm86regs->fs;
+ case INAT_SEG_REG_GS:
+ return vm86regs->gs;
+ case INAT_SEG_REG_IGNORE:
+ /* fall through */
+ default:
+ return -EINVAL;
+ }
+ }
+
+ switch (seg_reg_idx) {
+ case INAT_SEG_REG_CS:
+ return (unsigned short)(regs->cs & 0xffff);
+ case INAT_SEG_REG_SS:
+ return (unsigned short)(regs->ss & 0xffff);
+ case INAT_SEG_REG_DS:
+ return (unsigned short)(regs->ds & 0xffff);
+ case INAT_SEG_REG_ES:
+ return (unsigned short)(regs->es & 0xffff);
+ case INAT_SEG_REG_FS:
+ return (unsigned short)(regs->fs & 0xffff);
+ case INAT_SEG_REG_GS:
+ /*
+ * GS may or may not be in regs as per CONFIG_X86_32_LAZY_GS.
+ * The macro below takes care of both cases.
+ */
+ return get_user_gs(regs);
+ case INAT_SEG_REG_IGNORE:
+ /* fall through */
+ default:
+ return -EINVAL;
+ }
+#endif /* CONFIG_X86_64 */
+}
+
+static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
+ enum reg_type type)
+{
+ int regno = 0;
+
+ static const int regoff[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+#endif
+ };
+ int nr_registers = ARRAY_SIZE(regoff);
+ /*
+ * Don't possibly decode a 32-bit instructions as
+ * reading a 64-bit-only register.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
+ nr_registers -= 8;
+
+ switch (type) {
+ case REG_TYPE_RM:
+ regno = X86_MODRM_RM(insn->modrm.value);
+
+ /*
+ * ModRM.mod == 0 and ModRM.rm == 5 means a 32-bit displacement
+ * follows the ModRM byte.
+ */
+ if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5)
+ return -EDOM;
+
+ if (X86_REX_B(insn->rex_prefix.value))
+ regno += 8;
+ break;
+
+ case REG_TYPE_INDEX:
+ regno = X86_SIB_INDEX(insn->sib.value);
+ if (X86_REX_X(insn->rex_prefix.value))
+ regno += 8;
+
+ /*
+ * If ModRM.mod != 3 and SIB.index = 4 the scale*index
+ * portion of the address computation is null. This is
+ * true only if REX.X is 0. In such a case, the SIB index
+ * is used in the address computation.
+ */
+ if (X86_MODRM_MOD(insn->modrm.value) != 3 && regno == 4)
+ return -EDOM;
+ break;
+
+ case REG_TYPE_BASE:
+ regno = X86_SIB_BASE(insn->sib.value);
+ /*
+ * If ModRM.mod is 0 and SIB.base == 5, the base of the
+ * register-indirect addressing is 0. In this case, a
+ * 32-bit displacement follows the SIB byte.
+ */
+ if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5)
+ return -EDOM;
+
+ if (X86_REX_B(insn->rex_prefix.value))
+ regno += 8;
+ break;
+
+ default:
+ pr_err_ratelimited("invalid register type: %d\n", type);
+ return -EINVAL;
+ }
+
+ if (regno >= nr_registers) {
+ WARN_ONCE(1, "decoded an instruction with an invalid register");
+ return -EINVAL;
+ }
+ return regoff[regno];
+}
+
+/**
+ * get_reg_offset_16() - Obtain offset of register indicated by instruction
+ * @insn: Instruction containing ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ * @offs1: Offset of the first operand register
+ * @offs2: Offset of the second opeand register, if applicable
+ *
+ * Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte
+ * in @insn. This function is to be used with 16-bit address encodings. The
+ * @offs1 and @offs2 will be written with the offset of the two registers
+ * indicated by the instruction. In cases where any of the registers is not
+ * referenced by the instruction, the value will be set to -EDOM.
+ *
+ * Returns:
+ *
+ * 0 on success, -EINVAL on error.
+ */
+static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs,
+ int *offs1, int *offs2)
+{
+ /*
+ * 16-bit addressing can use one or two registers. Specifics of
+ * encodings are given in Table 2-1. "16-Bit Addressing Forms with the
+ * ModR/M Byte" of the Intel Software Development Manual.
+ */
+ static const int regoff1[] = {
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, bx),
+ };
+
+ static const int regoff2[] = {
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+ -EDOM,
+ -EDOM,
+ -EDOM,
+ -EDOM,
+ };
+
+ if (!offs1 || !offs2)
+ return -EINVAL;
+
+ /* Operand is a register, use the generic function. */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+ *offs1 = insn_get_modrm_rm_off(insn, regs);
+ *offs2 = -EDOM;
+ return 0;
+ }
+
+ *offs1 = regoff1[X86_MODRM_RM(insn->modrm.value)];
+ *offs2 = regoff2[X86_MODRM_RM(insn->modrm.value)];
+
+ /*
+ * If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement-
+ * only addressing. This means that no registers are involved in
+ * computing the effective address. Thus, ensure that the first
+ * register offset is invalild. The second register offset is already
+ * invalid under the aforementioned conditions.
+ */
+ if ((X86_MODRM_MOD(insn->modrm.value) == 0) &&
+ (X86_MODRM_RM(insn->modrm.value) == 6))
+ *offs1 = -EDOM;
+
+ return 0;
+}
+
+/**
+ * get_desc() - Obtain pointer to a segment descriptor
+ * @sel: Segment selector
+ *
+ * Given a segment selector, obtain a pointer to the segment descriptor.
+ * Both global and local descriptor tables are supported.
+ *
+ * Returns:
+ *
+ * Pointer to segment descriptor on success.
+ *
+ * NULL on error.
+ */
+static struct desc_struct *get_desc(unsigned short sel)
+{
+ struct desc_ptr gdt_desc = {0, 0};
+ unsigned long desc_base;
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+ struct desc_struct *desc = NULL;
+ struct ldt_struct *ldt;
+
+ /* Bits [15:3] contain the index of the desired entry. */
+ sel >>= 3;
+
+ mutex_lock(¤t->active_mm->context.lock);
+ ldt = current->active_mm->context.ldt;
+ if (ldt && sel < ldt->nr_entries)
+ desc = &ldt->entries[sel];
+
+ mutex_unlock(¤t->active_mm->context.lock);
+
+ return desc;
+ }
+#endif
+ native_store_gdt(&gdt_desc);
+
+ /*
+ * Segment descriptors have a size of 8 bytes. Thus, the index is
+ * multiplied by 8 to obtain the memory offset of the desired descriptor
+ * from the base of the GDT. As bits [15:3] of the segment selector
+ * contain the index, it can be regarded as multiplied by 8 already.
+ * All that remains is to clear bits [2:0].
+ */
+ desc_base = sel & ~(SEGMENT_RPL_MASK | SEGMENT_TI_MASK);
+
+ if (desc_base > gdt_desc.size)
+ return NULL;
+
+ return (struct desc_struct *)(gdt_desc.address + desc_base);
+}
+
+/**
+ * insn_get_seg_base() - Obtain base address of segment descriptor.
+ * @regs: Register values as seen when entering kernel mode
+ * @seg_reg_idx: Index of the segment register pointing to seg descriptor
+ *
+ * Obtain the base address of the segment as indicated by the segment descriptor
+ * pointed by the segment selector. The segment selector is obtained from the
+ * input segment register index @seg_reg_idx.
+ *
+ * Returns:
+ *
+ * In protected mode, base address of the segment. Zero in long mode,
+ * except when FS or GS are used. In virtual-8086 mode, the segment
+ * selector shifted 4 bits to the right.
+ *
+ * -1L in case of error.
+ */
+unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
+{
+ struct desc_struct *desc;
+ short sel;
+
+ sel = get_segment_selector(regs, seg_reg_idx);
+ if (sel < 0)
+ return -1L;
+
+ if (v8086_mode(regs))
+ /*
+ * Base is simply the segment selector shifted 4
+ * bits to the right.
+ */
+ return (unsigned long)(sel << 4);
+
+ if (user_64bit_mode(regs)) {
+ /*
+ * Only FS or GS will have a base address, the rest of
+ * the segments' bases are forced to 0.
+ */
+ unsigned long base;
+
+ if (seg_reg_idx == INAT_SEG_REG_FS)
+ rdmsrl(MSR_FS_BASE, base);
+ else if (seg_reg_idx == INAT_SEG_REG_GS)
+ /*
+ * swapgs was called at the kernel entry point. Thus,
+ * MSR_KERNEL_GS_BASE will have the user-space GS base.
+ */
+ rdmsrl(MSR_KERNEL_GS_BASE, base);
+ else
+ base = 0;
+ return base;
+ }
+
+ /* In protected mode the segment selector cannot be null. */
+ if (!sel)
+ return -1L;
+
+ desc = get_desc(sel);
+ if (!desc)
+ return -1L;
+
+ return get_desc_base(desc);
+}
+
+/**
+ * get_seg_limit() - Obtain the limit of a segment descriptor
+ * @regs: Register values as seen when entering kernel mode
+ * @seg_reg_idx: Index of the segment register pointing to seg descriptor
+ *
+ * Obtain the limit of the segment as indicated by the segment descriptor
+ * pointed by the segment selector. The segment selector is obtained from the
+ * input segment register index @seg_reg_idx.
+ *
+ * Returns:
+ *
+ * In protected mode, the limit of the segment descriptor in bytes.
+ * In long mode and virtual-8086 mode, segment limits are not enforced. Thus,
+ * limit is returned as -1L to imply a limit-less segment.
+ *
+ * Zero is returned on error.
+ */
+static unsigned long get_seg_limit(struct pt_regs *regs, int seg_reg_idx)
+{
+ struct desc_struct *desc;
+ unsigned long limit;
+ short sel;
+
+ sel = get_segment_selector(regs, seg_reg_idx);
+ if (sel < 0)
+ return 0;
+
+ if (user_64bit_mode(regs) || v8086_mode(regs))
+ return -1L;
+
+ if (!sel)
+ return 0;
+
+ desc = get_desc(sel);
+ if (!desc)
+ return 0;
+
+ /*
+ * If the granularity bit is set, the limit is given in multiples
+ * of 4096. This also means that the 12 least significant bits are
+ * not tested when checking the segment limits. In practice,
+ * this means that the segment ends in (limit << 12) + 0xfff.
+ */
+ limit = get_desc_limit(desc);
+ if (desc->g)
+ limit = (limit << 12) + 0xfff;
+
+ return limit;
+}
+
+/**
+ * insn_get_code_seg_params() - Obtain code segment parameters
+ * @regs: Structure with register values as seen when entering kernel mode
+ *
+ * Obtain address and operand sizes of the code segment. It is obtained from the
+ * selector contained in the CS register in regs. In protected mode, the default
+ * address is determined by inspecting the L and D bits of the segment
+ * descriptor. In virtual-8086 mode, the default is always two bytes for both
+ * address and operand sizes.
+ *
+ * Returns:
+ *
+ * A signed 8-bit value containing the default parameters on success.
+ *
+ * -EINVAL on error.
+ */
+char insn_get_code_seg_params(struct pt_regs *regs)
+{
+ struct desc_struct *desc;
+ short sel;
+
+ if (v8086_mode(regs))
+ /* Address and operand size are both 16-bit. */
+ return INSN_CODE_SEG_PARAMS(2, 2);
+
+ sel = get_segment_selector(regs, INAT_SEG_REG_CS);
+ if (sel < 0)
+ return sel;
+
+ desc = get_desc(sel);
+ if (!desc)
+ return -EINVAL;
+
+ /*
+ * The most significant byte of the Type field of the segment descriptor
+ * determines whether a segment contains data or code. If this is a data
+ * segment, return error.
+ */
+ if (!(desc->type & BIT(3)))
+ return -EINVAL;
+
+ switch ((desc->l << 1) | desc->d) {
+ case 0: /*
+ * Legacy mode. CS.L=0, CS.D=0. Address and operand size are
+ * both 16-bit.
+ */
+ return INSN_CODE_SEG_PARAMS(2, 2);
+ case 1: /*
+ * Legacy mode. CS.L=0, CS.D=1. Address and operand size are
+ * both 32-bit.
+ */
+ return INSN_CODE_SEG_PARAMS(4, 4);
+ case 2: /*
+ * IA-32e 64-bit mode. CS.L=1, CS.D=0. Address size is 64-bit;
+ * operand size is 32-bit.
+ */
+ return INSN_CODE_SEG_PARAMS(4, 8);
+ case 3: /* Invalid setting. CS.L=1, CS.D=1 */
+ /* fall through */
+ default:
+ return -EINVAL;
+ }
+}
+
+/**
+ * insn_get_modrm_rm_off() - Obtain register in r/m part of the ModRM byte
+ * @insn: Instruction containing the ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * Returns:
+ *
+ * The register indicated by the r/m part of the ModRM byte. The
+ * register is obtained as an offset from the base of pt_regs. In specific
+ * cases, the returned value can be -EDOM to indicate that the particular value
+ * of ModRM does not refer to a register and shall be ignored.
+ */
+int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs)
+{
+ return get_reg_offset(insn, regs, REG_TYPE_RM);
+}
+
+/**
+ * get_seg_base_limit() - obtain base address and limit of a segment
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Operand offset, in pt_regs, used to resolve segment descriptor
+ * @base: Obtained segment base
+ * @limit: Obtained segment limit
+ *
+ * Obtain the base address and limit of the segment associated with the operand
+ * @regoff and, if any or allowed, override prefixes in @insn. This function is
+ * different from insn_get_seg_base() as the latter does not resolve the segment
+ * associated with the instruction operand. If a limit is not needed (e.g.,
+ * when running in long mode), @limit can be NULL.
+ *
+ * Returns:
+ *
+ * 0 on success. @base and @limit will contain the base address and of the
+ * resolved segment, respectively.
+ *
+ * -EINVAL on error.
+ */
+static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs,
+ int regoff, unsigned long *base,
+ unsigned long *limit)
+{
+ int seg_reg_idx;
+
+ if (!base)
+ return -EINVAL;
+
+ seg_reg_idx = resolve_seg_reg(insn, regs, regoff);
+ if (seg_reg_idx < 0)
+ return seg_reg_idx;
+
+ *base = insn_get_seg_base(regs, seg_reg_idx);
+ if (*base == -1L)
+ return -EINVAL;
+
+ if (!limit)
+ return 0;
+
+ *limit = get_seg_limit(regs, seg_reg_idx);
+ if (!(*limit))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * get_eff_addr_reg() - Obtain effective address from register operand
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Obtained operand offset, in pt_regs, with the effective address
+ * @eff_addr: Obtained effective address
+ *
+ * Obtain the effective address stored in the register operand as indicated by
+ * the ModRM byte. This function is to be used only with register addressing
+ * (i.e., ModRM.mod is 3). The effective address is saved in @eff_addr. The
+ * register operand, as an offset from the base of pt_regs, is saved in @regoff;
+ * such offset can then be used to resolve the segment associated with the
+ * operand. This function can be used with any of the supported address sizes
+ * in x86.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the effective address stored in the
+ * operand indicated by ModRM. @regoff will have such operand as an offset from
+ * the base of pt_regs.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs,
+ int *regoff, long *eff_addr)
+{
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ if (X86_MODRM_MOD(insn->modrm.value) != 3)
+ return -EINVAL;
+
+ *regoff = get_reg_offset(insn, regs, REG_TYPE_RM);
+ if (*regoff < 0)
+ return -EINVAL;
+
+ /* Ignore bytes that are outside the address size. */
+ if (insn->addr_bytes == 2)
+ *eff_addr = regs_get_register(regs, *regoff) & 0xffff;
+ else if (insn->addr_bytes == 4)
+ *eff_addr = regs_get_register(regs, *regoff) & 0xffffffff;
+ else /* 64-bit address */
+ *eff_addr = regs_get_register(regs, *regoff);
+
+ return 0;
+}
+
+/**
+ * get_eff_addr_modrm() - Obtain referenced effective address via ModRM
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Obtained operand offset, in pt_regs, associated with segment
+ * @eff_addr: Obtained effective address
+ *
+ * Obtain the effective address referenced by the ModRM byte of @insn. After
+ * identifying the registers involved in the register-indirect memory reference,
+ * its value is obtained from the operands in @regs. The computed address is
+ * stored @eff_addr. Also, the register operand that indicates the associated
+ * segment is stored in @regoff, this parameter can later be used to determine
+ * such segment.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the referenced effective address. @regoff
+ * will have a register, as an offset from the base of pt_regs, that can be used
+ * to resolve the associated segment.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs,
+ int *regoff, long *eff_addr)
+{
+ long tmp;
+
+ if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
+ return -EINVAL;
+
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ if (X86_MODRM_MOD(insn->modrm.value) > 2)
+ return -EINVAL;
+
+ *regoff = get_reg_offset(insn, regs, REG_TYPE_RM);
+
+ /*
+ * -EDOM means that we must ignore the address_offset. In such a case,
+ * in 64-bit mode the effective address relative to the rIP of the
+ * following instruction.
+ */
+ if (*regoff == -EDOM) {
+ if (user_64bit_mode(regs))
+ tmp = regs->ip + insn->length;
+ else
+ tmp = 0;
+ } else if (*regoff < 0) {
+ return -EINVAL;
+ } else {
+ tmp = regs_get_register(regs, *regoff);
+ }
+
+ if (insn->addr_bytes == 4) {
+ int addr32 = (int)(tmp & 0xffffffff) + insn->displacement.value;
+
+ *eff_addr = addr32 & 0xffffffff;
+ } else {
+ *eff_addr = tmp + insn->displacement.value;
+ }
+
+ return 0;
+}
+
+/**
+ * get_eff_addr_modrm_16() - Obtain referenced effective address via ModRM
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Obtained operand offset, in pt_regs, associated with segment
+ * @eff_addr: Obtained effective address
+ *
+ * Obtain the 16-bit effective address referenced by the ModRM byte of @insn.
+ * After identifying the registers involved in the register-indirect memory
+ * reference, its value is obtained from the operands in @regs. The computed
+ * address is stored @eff_addr. Also, the register operand that indicates
+ * the associated segment is stored in @regoff, this parameter can later be used
+ * to determine such segment.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the referenced effective address. @regoff
+ * will have a register, as an offset from the base of pt_regs, that can be used
+ * to resolve the associated segment.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_modrm_16(struct insn *insn, struct pt_regs *regs,
+ int *regoff, short *eff_addr)
+{
+ int addr_offset1, addr_offset2, ret;
+ short addr1 = 0, addr2 = 0, displacement;
+
+ if (insn->addr_bytes != 2)
+ return -EINVAL;
+
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ if (X86_MODRM_MOD(insn->modrm.value) > 2)
+ return -EINVAL;
+
+ ret = get_reg_offset_16(insn, regs, &addr_offset1, &addr_offset2);
+ if (ret < 0)
+ return -EINVAL;
+
+ /*
+ * Don't fail on invalid offset values. They might be invalid because
+ * they cannot be used for this particular value of ModRM. Instead, use
+ * them in the computation only if they contain a valid value.
+ */
+ if (addr_offset1 != -EDOM)
+ addr1 = regs_get_register(regs, addr_offset1) & 0xffff;
+
+ if (addr_offset2 != -EDOM)
+ addr2 = regs_get_register(regs, addr_offset2) & 0xffff;
+
+ displacement = insn->displacement.value & 0xffff;
+ *eff_addr = addr1 + addr2 + displacement;
+
+ /*
+ * The first operand register could indicate to use of either SS or DS
+ * registers to obtain the segment selector. The second operand
+ * register can only indicate the use of DS. Thus, the first operand
+ * will be used to obtain the segment selector.
+ */
+ *regoff = addr_offset1;
+
+ return 0;
+}
+
+/**
+ * get_eff_addr_sib() - Obtain referenced effective address via SIB
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Obtained operand offset, in pt_regs, associated with segment
+ * @eff_addr: Obtained effective address
+ *
+ * Obtain the effective address referenced by the SIB byte of @insn. After
+ * identifying the registers involved in the indexed, register-indirect memory
+ * reference, its value is obtained from the operands in @regs. The computed
+ * address is stored @eff_addr. Also, the register operand that indicates the
+ * associated segment is stored in @regoff, this parameter can later be used to
+ * determine such segment.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the referenced effective address.
+ * @base_offset will have a register, as an offset from the base of pt_regs,
+ * that can be used to resolve the associated segment.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs,
+ int *base_offset, long *eff_addr)
+{
+ long base, indx;
+ int indx_offset;
+
+ if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
+ return -EINVAL;
+
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ if (X86_MODRM_MOD(insn->modrm.value) > 2)
+ return -EINVAL;
+
+ insn_get_sib(insn);
+
+ if (!insn->sib.nbytes)
+ return -EINVAL;
+
+ *base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
+ indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
+
+ /*
+ * Negative values in the base and index offset means an error when
+ * decoding the SIB byte. Except -EDOM, which means that the registers
+ * should not be used in the address computation.
+ */
+ if (*base_offset == -EDOM)
+ base = 0;
+ else if (*base_offset < 0)
+ return -EINVAL;
+ else
+ base = regs_get_register(regs, *base_offset);
+
+ if (indx_offset == -EDOM)
+ indx = 0;
+ else if (indx_offset < 0)
+ return -EINVAL;
+ else
+ indx = regs_get_register(regs, indx_offset);
+
+ if (insn->addr_bytes == 4) {
+ int addr32, base32, idx32;
+
+ base32 = base & 0xffffffff;
+ idx32 = indx & 0xffffffff;
+
+ addr32 = base32 + idx32 * (1 << X86_SIB_SCALE(insn->sib.value));
+ addr32 += insn->displacement.value;
+
+ *eff_addr = addr32 & 0xffffffff;
+ } else {
+ *eff_addr = base + indx * (1 << X86_SIB_SCALE(insn->sib.value));
+ *eff_addr += insn->displacement.value;
+ }
+
+ return 0;
+}
+
+/**
+ * get_addr_ref_16() - Obtain the 16-bit address referred by instruction
+ * @insn: Instruction containing ModRM byte and displacement
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * This function is to be used with 16-bit address encodings. Obtain the memory
+ * address referred by the instruction's ModRM and displacement bytes. Also, the
+ * segment used as base is determined by either any segment override prefixes in
+ * @insn or the default segment of the registers involved in the address
+ * computation. In protected mode, segment limits are enforced.
+ *
+ * Returns:
+ *
+ * Linear address referenced by the instruction operands on success.
+ *
+ * -1L on error.
+ */
+static void __user *get_addr_ref_16(struct insn *insn, struct pt_regs *regs)
+{
+ unsigned long linear_addr = -1L, seg_base, seg_limit;
+ int ret, regoff;
+ short eff_addr;
+ long tmp;
+
+ insn_get_modrm(insn);
+ insn_get_displacement(insn);
+
+ if (insn->addr_bytes != 2)
+ goto out;
+
+ if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+ ret = get_eff_addr_reg(insn, regs, ®off, &tmp);
+ if (ret)
+ goto out;
+
+ eff_addr = tmp;
+ } else {
+ ret = get_eff_addr_modrm_16(insn, regs, ®off, &eff_addr);
+ if (ret)
+ goto out;
+ }
+
+ ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit);
+ if (ret)
+ goto out;
+
+ /*
+ * Before computing the linear address, make sure the effective address
+ * is within the limits of the segment. In virtual-8086 mode, segment
+ * limits are not enforced. In such a case, the segment limit is -1L to
+ * reflect this fact.
+ */
+ if ((unsigned long)(eff_addr & 0xffff) > seg_limit)
+ goto out;
+
+ linear_addr = (unsigned long)(eff_addr & 0xffff) + seg_base;
+
+ /* Limit linear address to 20 bits */
+ if (v8086_mode(regs))
+ linear_addr &= 0xfffff;
+
+out:
+ return (void __user *)linear_addr;
+}
+
+/**
+ * get_addr_ref_32() - Obtain a 32-bit linear address
+ * @insn: Instruction with ModRM, SIB bytes and displacement
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * This function is to be used with 32-bit address encodings to obtain the
+ * linear memory address referred by the instruction's ModRM, SIB,
+ * displacement bytes and segment base address, as applicable. If in protected
+ * mode, segment limits are enforced.
+ *
+ * Returns:
+ *
+ * Linear address referenced by instruction and registers on success.
+ *
+ * -1L on error.
+ */
+static void __user *get_addr_ref_32(struct insn *insn, struct pt_regs *regs)
+{
+ unsigned long linear_addr = -1L, seg_base, seg_limit;
+ int eff_addr, regoff;
+ long tmp;
+ int ret;
+
+ if (insn->addr_bytes != 4)
+ goto out;
+
+ if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+ ret = get_eff_addr_reg(insn, regs, ®off, &tmp);
+ if (ret)
+ goto out;
+
+ eff_addr = tmp;
+
+ } else {
+ if (insn->sib.nbytes) {
+ ret = get_eff_addr_sib(insn, regs, ®off, &tmp);
+ if (ret)
+ goto out;
+
+ eff_addr = tmp;
+ } else {
+ ret = get_eff_addr_modrm(insn, regs, ®off, &tmp);
+ if (ret)
+ goto out;
+
+ eff_addr = tmp;
+ }
+ }
+
+ ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit);
+ if (ret)
+ goto out;
+
+ /*
+ * In protected mode, before computing the linear address, make sure
+ * the effective address is within the limits of the segment.
+ * 32-bit addresses can be used in long and virtual-8086 modes if an
+ * address override prefix is used. In such cases, segment limits are
+ * not enforced. When in virtual-8086 mode, the segment limit is -1L
+ * to reflect this situation.
+ *
+ * After computed, the effective address is treated as an unsigned
+ * quantity.
+ */
+ if (!user_64bit_mode(regs) && ((unsigned int)eff_addr > seg_limit))
+ goto out;
+
+ /*
+ * Even though 32-bit address encodings are allowed in virtual-8086
+ * mode, the address range is still limited to [0x-0xffff].
+ */
+ if (v8086_mode(regs) && (eff_addr & ~0xffff))
+ goto out;
+
+ /*
+ * Data type long could be 64 bits in size. Ensure that our 32-bit
+ * effective address is not sign-extended when computing the linear
+ * address.
+ */
+ linear_addr = (unsigned long)(eff_addr & 0xffffffff) + seg_base;
+
+ /* Limit linear address to 20 bits */
+ if (v8086_mode(regs))
+ linear_addr &= 0xfffff;
+
+out:
+ return (void __user *)linear_addr;
+}
+
+/**
+ * get_addr_ref_64() - Obtain a 64-bit linear address
+ * @insn: Instruction struct with ModRM and SIB bytes and displacement
+ * @regs: Structure with register values as seen when entering kernel mode
+ *
+ * This function is to be used with 64-bit address encodings to obtain the
+ * linear memory address referred by the instruction's ModRM, SIB,
+ * displacement bytes and segment base address, as applicable.
+ *
+ * Returns:
+ *
+ * Linear address referenced by instruction and registers on success.
+ *
+ * -1L on error.
+ */
+#ifndef CONFIG_X86_64
+static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs)
+{
+ return (void __user *)-1L;
+}
+#else
+static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs)
+{
+ unsigned long linear_addr = -1L, seg_base;
+ int regoff, ret;
+ long eff_addr;
+
+ if (insn->addr_bytes != 8)
+ goto out;
+
+ if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+ ret = get_eff_addr_reg(insn, regs, ®off, &eff_addr);
+ if (ret)
+ goto out;
+
+ } else {
+ if (insn->sib.nbytes) {
+ ret = get_eff_addr_sib(insn, regs, ®off, &eff_addr);
+ if (ret)
+ goto out;
+ } else {
+ ret = get_eff_addr_modrm(insn, regs, ®off, &eff_addr);
+ if (ret)
+ goto out;
+ }
+
+ }
+
+ ret = get_seg_base_limit(insn, regs, regoff, &seg_base, NULL);
+ if (ret)
+ goto out;
+
+ linear_addr = (unsigned long)eff_addr + seg_base;
+
+out:
+ return (void __user *)linear_addr;
+}
+#endif /* CONFIG_X86_64 */
+
+/**
+ * insn_get_addr_ref() - Obtain the linear address referred by instruction
+ * @insn: Instruction structure containing ModRM byte and displacement
+ * @regs: Structure with register values as seen when entering kernel mode
+ *
+ * Obtain the linear address referred by the instruction's ModRM, SIB and
+ * displacement bytes, and segment base, as applicable. In protected mode,
+ * segment limits are enforced.
+ *
+ * Returns:
+ *
+ * Linear address referenced by instruction and registers on success.
+ *
+ * -1L on error.
+ */
+void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
+{
+ if (!insn || !regs)
+ return (void __user *)-1L;
+
+ switch (insn->addr_bytes) {
+ case 2:
+ return get_addr_ref_16(insn, regs);
+ case 4:
+ return get_addr_ref_32(insn, regs);
+ case 8:
+ return get_addr_ref_64(insn, regs);
+ default:
+ return (void __user *)-1L;
+ }
+}
#include <asm/trace/exceptions.h>
/*
- * Page fault error code bits:
- *
- * bit 0 == 0: no page found 1: protection fault
- * bit 1 == 0: read access 1: write access
- * bit 2 == 0: kernel-mode access 1: user-mode access
- * bit 3 == 1: use of reserved bit detected
- * bit 4 == 1: fault was an instruction fetch
- * bit 5 == 1: protection keys block access
- */
-enum x86_pf_error_code {
-
- PF_PROT = 1 << 0,
- PF_WRITE = 1 << 1,
- PF_USER = 1 << 2,
- PF_RSVD = 1 << 3,
- PF_INSTR = 1 << 4,
- PF_PK = 1 << 5,
-};
-
-/*
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
* If it was a exec (instruction fetch) fault on NX page, then
* do not ignore the fault:
*/
- if (error_code & PF_INSTR)
+ if (error_code & X86_PF_INSTR)
return 0;
instr = (void *)convert_ip_to_linear(current, regs);
* siginfo so userspace can discover which protection key was set
* on the PTE.
*
- * If we get here, we know that the hardware signaled a PF_PK
+ * If we get here, we know that the hardware signaled a X86_PF_PK
* fault and that there was a VMA once we got in the fault
* handler. It does *not* guarantee that the VMA we find here
* was the one that we faulted on.
/*
* force_sig_info_fault() is called from a number of
* contexts, some of which have a VMA and some of which
- * do not. The PF_PK handing happens after we have a
+ * do not. The X86_PF_PK handing happens after we have a
* valid VMA, so we should never reach this without a
* valid VMA.
*/
if (!oops_may_print())
return;
- if (error_code & PF_INSTR) {
+ if (error_code & X86_PF_INSTR) {
unsigned int level;
pgd_t *pgd;
pte_t *pte;
*/
if (current->thread.sig_on_uaccess_err && signal) {
tsk->thread.trap_nr = X86_TRAP_PF;
- tsk->thread.error_code = error_code | PF_USER;
+ tsk->thread.error_code = error_code | X86_PF_USER;
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
struct task_struct *tsk = current;
/* User mode accesses just cause a SIGSEGV */
- if (error_code & PF_USER) {
+ if (error_code & X86_PF_USER) {
/*
* It's possible to have interrupts off here:
*/
* Instruction fetch faults in the vsyscall page might need
* emulation.
*/
- if (unlikely((error_code & PF_INSTR) &&
+ if (unlikely((error_code & X86_PF_INSTR) &&
((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
* are always protection faults.
*/
if (address >= TASK_SIZE_MAX)
- error_code |= PF_PROT;
+ error_code |= X86_PF_PROT;
if (likely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return false;
- if (error_code & PF_PK)
+ if (error_code & X86_PF_PK)
return true;
/* this checks permission keys on the VMA: */
- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
- (error_code & PF_INSTR), foreign))
+ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+ (error_code & X86_PF_INSTR), foreign))
return true;
return false;
}
int code = BUS_ADRERR;
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & PF_USER)) {
+ if (!(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 *pkey, unsigned int fault)
{
- if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+ if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, 0, 0);
return;
}
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & PF_USER)) {
+ if (!(error_code & X86_PF_USER)) {
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
return;
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
- if ((error_code & PF_WRITE) && !pte_write(*pte))
+ if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0;
- if ((error_code & PF_INSTR) && !pte_exec(*pte))
+ if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0;
/*
* Note: We do not do lazy flushing on protection key
- * changes, so no spurious fault will ever set PF_PK.
+ * changes, so no spurious fault will ever set X86_PF_PK.
*/
- if ((error_code & PF_PK))
+ if ((error_code & X86_PF_PK))
return 1;
return 1;
* change, so user accesses are not expected to cause spurious
* faults.
*/
- if (error_code != (PF_WRITE | PF_PROT)
- && error_code != (PF_INSTR | PF_PROT))
+ if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
+ error_code != (X86_PF_INSTR | X86_PF_PROT))
return 0;
pgd = init_mm.pgd + pgd_index(address);
* always an unconditional error and can never result in
* a follow-up action to resolve the fault, like a COW.
*/
- if (error_code & PF_PK)
+ if (error_code & X86_PF_PK)
return 1;
/*
* Make sure to check the VMA so that we do not perform
- * faults just to hit a PF_PK as soon as we fill in a
+ * faults just to hit a X86_PF_PK as soon as we fill in a
* page.
*/
- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
- (error_code & PF_INSTR), foreign))
+ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+ (error_code & X86_PF_INSTR), foreign))
return 1;
- if (error_code & PF_WRITE) {
+ if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
return 1;
}
/* read, present: */
- if (unlikely(error_code & PF_PROT))
+ if (unlikely(error_code & X86_PF_PROT))
return 1;
/* read, not present: */
if (!static_cpu_has(X86_FEATURE_SMAP))
return false;
- if (error_code & PF_USER)
+ if (error_code & X86_PF_USER)
return false;
if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) {
- if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+ if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
if (unlikely(kprobes_fault(regs)))
return;
- if (unlikely(error_code & PF_RSVD))
+ if (unlikely(error_code & X86_PF_RSVD))
pgtable_bad(regs, error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
*/
if (user_mode(regs)) {
local_irq_enable();
- error_code |= PF_USER;
+ error_code |= X86_PF_USER;
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (error_code & PF_WRITE)
+ if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (error_code & PF_INSTR)
+ if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
/*
* space check, thus avoiding the deadlock:
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
- if ((error_code & PF_USER) == 0 &&
+ if (!(error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
bad_area(regs, error_code, address);
return;
}
- if (error_code & PF_USER) {
+ if (error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
void register_page_bootmem_memmap(unsigned long section_nr,
- struct page *start_page, unsigned long size)
+ struct page *start_page, unsigned long nr_pages)
{
unsigned long addr = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + size);
+ unsigned long end = (unsigned long)(start_page + nr_pages);
unsigned long next;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
- unsigned int nr_pages;
+ unsigned int nr_pmd_pages;
struct page *page;
for (; addr < end; addr = next) {
if (pmd_none(*pmd))
continue;
- nr_pages = 1 << (get_order(PMD_SIZE));
+ nr_pmd_pages = 1 << get_order(PMD_SIZE);
page = pmd_page(*pmd);
- while (nr_pages--)
+ while (nr_pmd_pages--)
get_page_bootmem(section_nr, page++,
SECTION_INFO);
}
#include "physaddr.h"
+struct ioremap_mem_flags {
+ bool system_ram;
+ bool desc_other;
+};
+
/*
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
return err;
}
-static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
- void *arg)
+static bool __ioremap_check_ram(struct resource *res)
{
+ unsigned long start_pfn, stop_pfn;
unsigned long i;
- for (i = 0; i < nr_pages; ++i)
- if (pfn_valid(start_pfn + i) &&
- !PageReserved(pfn_to_page(start_pfn + i)))
- return 1;
+ if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
+ return false;
- return 0;
+ start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ stop_pfn = (res->end + 1) >> PAGE_SHIFT;
+ if (stop_pfn > start_pfn) {
+ for (i = 0; i < (stop_pfn - start_pfn); ++i)
+ if (pfn_valid(start_pfn + i) &&
+ !PageReserved(pfn_to_page(start_pfn + i)))
+ return true;
+ }
+
+ return false;
+}
+
+static int __ioremap_check_desc_other(struct resource *res)
+{
+ return (res->desc != IORES_DESC_NONE);
+}
+
+static int __ioremap_res_check(struct resource *res, void *arg)
+{
+ struct ioremap_mem_flags *flags = arg;
+
+ if (!flags->system_ram)
+ flags->system_ram = __ioremap_check_ram(res);
+
+ if (!flags->desc_other)
+ flags->desc_other = __ioremap_check_desc_other(res);
+
+ return flags->system_ram && flags->desc_other;
+}
+
+/*
+ * To avoid multiple resource walks, this function walks resources marked as
+ * IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a
+ * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
+ */
+static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
+ struct ioremap_mem_flags *flags)
+{
+ u64 start, end;
+
+ start = (u64)addr;
+ end = start + size - 1;
+ memset(flags, 0, sizeof(*flags));
+
+ walk_mem_res(start, end, flags, __ioremap_res_check);
}
/*
unsigned long size, enum page_cache_mode pcm, void *caller)
{
unsigned long offset, vaddr;
- resource_size_t pfn, last_pfn, last_addr;
+ resource_size_t last_addr;
const resource_size_t unaligned_phys_addr = phys_addr;
const unsigned long unaligned_size = size;
+ struct ioremap_mem_flags mem_flags;
struct vm_struct *area;
enum page_cache_mode new_pcm;
pgprot_t prot;
return NULL;
}
+ __ioremap_check_mem(phys_addr, size, &mem_flags);
+
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- pfn = phys_addr >> PAGE_SHIFT;
- last_pfn = last_addr >> PAGE_SHIFT;
- if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
- __ioremap_check_ram) == 1) {
+ if (mem_flags.system_ram) {
WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
&phys_addr, &last_addr);
return NULL;
pcm = new_pcm;
}
+ /*
+ * If the page being mapped is in memory and SEV is active then
+ * make sure the memory encryption attribute is enabled in the
+ * resulting mapping.
+ */
prot = PAGE_KERNEL_IO;
+ if (sev_active() && mem_flags.desc_other)
+ prot = pgprot_encrypted(prot);
+
switch (pcm) {
case _PAGE_CACHE_MODE_UC:
default:
* areas should be mapped decrypted. And since the encryption key can
* change across reboots, persistent memory should also be mapped
* decrypted.
+ *
+ * If SEV is active, that implies that BIOS/UEFI also ran encrypted so
+ * only persistent memory should be mapped decrypted.
*/
static bool memremap_should_map_decrypted(resource_size_t phys_addr,
unsigned long size)
case E820_TYPE_ACPI:
case E820_TYPE_NVS:
case E820_TYPE_UNUSABLE:
+ /* For SEV, these areas are encrypted */
+ if (sev_active())
+ break;
+ /* Fallthrough */
+
case E820_TYPE_PRAM:
return true;
default:
bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
unsigned long flags)
{
- if (!sme_active())
+ if (!mem_encrypt_active())
return true;
if (flags & MEMREMAP_ENC)
if (flags & MEMREMAP_DEC)
return false;
- if (memremap_is_setup_data(phys_addr, size) ||
- memremap_is_efi_data(phys_addr, size) ||
- memremap_should_map_decrypted(phys_addr, size))
- return false;
+ if (sme_active()) {
+ if (memremap_is_setup_data(phys_addr, size) ||
+ memremap_is_efi_data(phys_addr, size))
+ return false;
+ }
- return true;
+ return !memremap_should_map_decrypted(phys_addr, size);
}
/*
unsigned long size,
pgprot_t prot)
{
- if (!sme_active())
+ bool encrypted_prot;
+
+ if (!mem_encrypt_active())
return prot;
- if (early_memremap_is_setup_data(phys_addr, size) ||
- memremap_is_efi_data(phys_addr, size) ||
- memremap_should_map_decrypted(phys_addr, size))
- prot = pgprot_decrypted(prot);
- else
- prot = pgprot_encrypted(prot);
+ encrypted_prot = true;
+
+ if (sme_active()) {
+ if (early_memremap_is_setup_data(phys_addr, size) ||
+ memremap_is_efi_data(phys_addr, size))
+ encrypted_prot = false;
+ }
+
+ if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size))
+ encrypted_prot = false;
- return prot;
+ return encrypted_prot ? pgprot_encrypted(prot)
+ : pgprot_decrypted(prot);
}
bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
extern struct range pfn_mapped[E820_MAX_ENTRIES];
+static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
+
static int __init map_range(struct range *range)
{
unsigned long start;
unsigned long end)
{
pgd_t *pgd;
+ /* See comment in kasan_init() */
+ unsigned long pgd_end = end & PGDIR_MASK;
- for (; start < end; start += PGDIR_SIZE) {
+ for (; start < pgd_end; start += PGDIR_SIZE) {
pgd = pgd_offset_k(start);
/*
* With folded p4d, pgd_clear() is nop, use p4d_clear()
else
pgd_clear(pgd);
}
+
+ pgd = pgd_offset_k(start);
+ for (; start < end; start += P4D_SIZE)
+ p4d_clear(p4d_offset(pgd, start));
+}
+
+static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
+{
+ unsigned long p4d;
+
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ return (p4d_t *)pgd;
+
+ p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
+ p4d += __START_KERNEL_map - phys_base;
+ return (p4d_t *)p4d + p4d_index(addr);
+}
+
+static void __init kasan_early_p4d_populate(pgd_t *pgd,
+ unsigned long addr,
+ unsigned long end)
+{
+ pgd_t pgd_entry;
+ p4d_t *p4d, p4d_entry;
+ unsigned long next;
+
+ if (pgd_none(*pgd)) {
+ pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d));
+ set_pgd(pgd, pgd_entry);
+ }
+
+ p4d = early_p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (!p4d_none(*p4d))
+ continue;
+
+ p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud));
+ set_p4d(p4d, p4d_entry);
+ } while (p4d++, addr = next, addr != end && p4d_none(*p4d));
}
static void __init kasan_map_early_shadow(pgd_t *pgd)
{
- int i;
- unsigned long start = KASAN_SHADOW_START;
+ /* See comment in kasan_init() */
+ unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK;
unsigned long end = KASAN_SHADOW_END;
+ unsigned long next;
- for (i = pgd_index(start); start < end; i++) {
- switch (CONFIG_PGTABLE_LEVELS) {
- case 4:
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
- _KERNPG_TABLE);
- break;
- case 5:
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
- _KERNPG_TABLE);
- break;
- default:
- BUILD_BUG();
- }
- start += PGDIR_SIZE;
- }
+ pgd += pgd_index(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ kasan_early_p4d_populate(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
}
#ifdef CONFIG_KASAN_INLINE
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);
- for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
+ for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
kasan_map_early_shadow(early_top_pgt);
#endif
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+
+ /*
+ * We use the same shadow offset for 4- and 5-level paging to
+ * facilitate boot-time switching between paging modes.
+ * As result in 5-level paging mode KASAN_SHADOW_START and
+ * KASAN_SHADOW_END are not aligned to PGD boundary.
+ *
+ * KASAN_SHADOW_START doesn't share PGD with anything else.
+ * We claim whole PGD entry to make things easier.
+ *
+ * KASAN_SHADOW_END lands in the last PGD entry and it collides with
+ * bunch of things like kernel code, modules, EFI mapping, etc.
+ * We need to take extra steps to not overwrite them.
+ */
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ void *ptr;
+
+ ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
+ memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table));
+ set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)],
+ __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE));
+ }
+
load_cr3(early_top_pgt);
__flush_tlb_all();
- clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+ clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END);
- kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
+ kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK),
kasan_mem_to_shadow((void *)PAGE_OFFSET));
for (i = 0; i < E820_MAX_ENTRIES; i++) {
#include <asm/msr.h>
#include <asm/cmdline.h>
+#include "mm_internal.h"
+
static char sme_cmdline_arg[] __initdata = "mem_encrypt";
static char sme_cmdline_on[] __initdata = "on";
static char sme_cmdline_off[] __initdata = "off";
*/
u64 sme_me_mask __section(.data) = 0;
EXPORT_SYMBOL(sme_me_mask);
+DEFINE_STATIC_KEY_FALSE(sev_enable_key);
+EXPORT_SYMBOL_GPL(sev_enable_key);
+
+static bool sev_enabled __section(.data);
/* Buffer used for early in-place encryption by BSP, no locking needed */
static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
if (!sme_me_mask)
return;
- local_flush_tlb();
wbinvd();
/*
/* Update the protection map with memory encryption mask */
for (i = 0; i < ARRAY_SIZE(protection_map); i++)
protection_map[i] = pgprot_encrypted(protection_map[i]);
+
+ if (sev_active())
+ swiotlb_force = SWIOTLB_FORCE;
+}
+
+static void *sev_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp, unsigned long attrs)
+{
+ unsigned long dma_mask;
+ unsigned int order;
+ struct page *page;
+ void *vaddr = NULL;
+
+ dma_mask = dma_alloc_coherent_mask(dev, gfp);
+ order = get_order(size);
+
+ /*
+ * Memory will be memset to zero after marking decrypted, so don't
+ * bother clearing it before.
+ */
+ gfp &= ~__GFP_ZERO;
+
+ page = alloc_pages_node(dev_to_node(dev), gfp, order);
+ if (page) {
+ dma_addr_t addr;
+
+ /*
+ * Since we will be clearing the encryption bit, check the
+ * mask with it already cleared.
+ */
+ addr = __sme_clr(phys_to_dma(dev, page_to_phys(page)));
+ if ((addr + size) > dma_mask) {
+ __free_pages(page, get_order(size));
+ } else {
+ vaddr = page_address(page);
+ *dma_handle = addr;
+ }
+ }
+
+ if (!vaddr)
+ vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp);
+
+ if (!vaddr)
+ return NULL;
+
+ /* Clear the SME encryption bit for DMA use if not swiotlb area */
+ if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) {
+ set_memory_decrypted((unsigned long)vaddr, 1 << order);
+ memset(vaddr, 0, PAGE_SIZE << order);
+ *dma_handle = __sme_clr(*dma_handle);
+ }
+
+ return vaddr;
}
+static void sev_free(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_handle, unsigned long attrs)
+{
+ /* Set the SME encryption bit for re-use if not swiotlb area */
+ if (!is_swiotlb_buffer(dma_to_phys(dev, dma_handle)))
+ set_memory_encrypted((unsigned long)vaddr,
+ 1 << get_order(size));
+
+ swiotlb_free_coherent(dev, size, vaddr, dma_handle);
+}
+
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+ pgprot_t old_prot, new_prot;
+ unsigned long pfn, pa, size;
+ pte_t new_pte;
+
+ switch (level) {
+ case PG_LEVEL_4K:
+ pfn = pte_pfn(*kpte);
+ old_prot = pte_pgprot(*kpte);
+ break;
+ case PG_LEVEL_2M:
+ pfn = pmd_pfn(*(pmd_t *)kpte);
+ old_prot = pmd_pgprot(*(pmd_t *)kpte);
+ break;
+ case PG_LEVEL_1G:
+ pfn = pud_pfn(*(pud_t *)kpte);
+ old_prot = pud_pgprot(*(pud_t *)kpte);
+ break;
+ default:
+ return;
+ }
+
+ new_prot = old_prot;
+ if (enc)
+ pgprot_val(new_prot) |= _PAGE_ENC;
+ else
+ pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+ /* If prot is same then do nothing. */
+ if (pgprot_val(old_prot) == pgprot_val(new_prot))
+ return;
+
+ pa = pfn << page_level_shift(level);
+ size = page_level_size(level);
+
+ /*
+ * We are going to perform in-place en-/decryption and change the
+ * physical page attribute from C=1 to C=0 or vice versa. Flush the
+ * caches to ensure that data gets accessed with the correct C-bit.
+ */
+ clflush_cache_range(__va(pa), size);
+
+ /* Encrypt/decrypt the contents in-place */
+ if (enc)
+ sme_early_encrypt(pa, size);
+ else
+ sme_early_decrypt(pa, size);
+
+ /* Change the page encryption mask. */
+ new_pte = pfn_pte(pfn, new_prot);
+ set_pte_atomic(kpte, new_pte);
+}
+
+static int __init early_set_memory_enc_dec(unsigned long vaddr,
+ unsigned long size, bool enc)
+{
+ unsigned long vaddr_end, vaddr_next;
+ unsigned long psize, pmask;
+ int split_page_size_mask;
+ int level, ret;
+ pte_t *kpte;
+
+ vaddr_next = vaddr;
+ vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (level == PG_LEVEL_4K) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+ continue;
+ }
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ /*
+ * Check whether we can change the large page in one go.
+ * We request a split when the address is not aligned and
+ * the number of pages to set/clear encryption bit is smaller
+ * than the number of pages in the large page.
+ */
+ if (vaddr == (vaddr & pmask) &&
+ ((vaddr_end - vaddr) >= psize)) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & pmask) + psize;
+ continue;
+ }
+
+ /*
+ * The virtual address is part of a larger page, create the next
+ * level page table mapping (4K or 2M). If it is part of a 2M
+ * page then we request a split of the large page into 4K
+ * chunks. A 1GB large page is split into 2M pages, resp.
+ */
+ if (level == PG_LEVEL_2M)
+ split_page_size_mask = 0;
+ else
+ split_page_size_mask = 1 << PG_LEVEL_2M;
+
+ kernel_physical_mapping_init(__pa(vaddr & pmask),
+ __pa((vaddr_end & pmask) + psize),
+ split_page_size_mask);
+ }
+
+ ret = 0;
+
+out:
+ __flush_tlb_all();
+ return ret;
+}
+
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, false);
+}
+
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, true);
+}
+
+/*
+ * SME and SEV are very similar but they are not the same, so there are
+ * times that the kernel will need to distinguish between SME and SEV. The
+ * sme_active() and sev_active() functions are used for this. When a
+ * distinction isn't needed, the mem_encrypt_active() function can be used.
+ *
+ * The trampoline code is a good example for this requirement. Before
+ * paging is activated, SME will access all memory as decrypted, but SEV
+ * will access all memory as encrypted. So, when APs are being brought
+ * up under SME the trampoline area cannot be encrypted, whereas under SEV
+ * the trampoline area must be encrypted.
+ */
+bool sme_active(void)
+{
+ return sme_me_mask && !sev_enabled;
+}
+EXPORT_SYMBOL_GPL(sme_active);
+
+bool sev_active(void)
+{
+ return sme_me_mask && sev_enabled;
+}
+EXPORT_SYMBOL_GPL(sev_active);
+
+static const struct dma_map_ops sev_dma_ops = {
+ .alloc = sev_alloc,
+ .free = sev_free,
+ .map_page = swiotlb_map_page,
+ .unmap_page = swiotlb_unmap_page,
+ .map_sg = swiotlb_map_sg_attrs,
+ .unmap_sg = swiotlb_unmap_sg_attrs,
+ .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = swiotlb_sync_single_for_device,
+ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = swiotlb_sync_sg_for_device,
+ .mapping_error = swiotlb_dma_mapping_error,
+};
+
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void)
{
/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
swiotlb_update_mem_attributes();
- pr_info("AMD Secure Memory Encryption (SME) active\n");
+ /*
+ * With SEV, DMA operations cannot use encryption. New DMA ops
+ * are required in order to mark the DMA areas as decrypted or
+ * to use bounce buffers.
+ */
+ if (sev_active())
+ dma_ops = &sev_dma_ops;
+
+ /*
+ * With SEV, we need to unroll the rep string I/O instructions.
+ */
+ if (sev_active())
+ static_branch_enable(&sev_enable_key);
+
+ pr_info("AMD %s active\n",
+ sev_active() ? "Secure Encrypted Virtualization (SEV)"
+ : "Secure Memory Encryption (SME)");
}
void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
{
const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
unsigned int eax, ebx, ecx, edx;
+ unsigned long feature_mask;
bool active_by_default;
unsigned long me_mask;
char buffer[16];
u64 msr;
- /* Check for the SME support leaf */
+ /* Check for the SME/SEV support leaf */
eax = 0x80000000;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (eax < 0x8000001f)
return;
+#define AMD_SME_BIT BIT(0)
+#define AMD_SEV_BIT BIT(1)
/*
- * Check for the SME feature:
- * CPUID Fn8000_001F[EAX] - Bit 0
- * Secure Memory Encryption support
- * CPUID Fn8000_001F[EBX] - Bits 5:0
- * Pagetable bit position used to indicate encryption
+ * Set the feature mask (SME or SEV) based on whether we are
+ * running under a hypervisor.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
*/
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
- if (!(eax & 1))
+ if (!(eax & feature_mask))
return;
me_mask = 1UL << (ebx & 0x3f);
- /* Check if SME is enabled */
- msr = __rdmsr(MSR_K8_SYSCFG);
- if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ /* Check if memory encryption is enabled */
+ if (feature_mask == AMD_SME_BIT) {
+ /* For SME, check the SYSCFG MSR */
+ msr = __rdmsr(MSR_K8_SYSCFG);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ return;
+ } else {
+ /* For SEV, check the SEV MSR */
+ msr = __rdmsr(MSR_AMD64_SEV);
+ if (!(msr & MSR_AMD64_SEV_ENABLED))
+ return;
+
+ /* SEV state cannot be controlled by a command line option */
+ sme_me_mask = me_mask;
+ sev_enabled = true;
return;
+ }
/*
* Fixups have not been applied to phys_base yet and we're running
#include <linux/sched/sysctl.h>
#include <asm/insn.h>
+#include <asm/insn-eval.h>
#include <asm/mman.h>
#include <asm/mmu_context.h>
#include <asm/mpx.h>
return addr;
}
-enum reg_type {
- REG_TYPE_RM = 0,
- REG_TYPE_INDEX,
- REG_TYPE_BASE,
-};
-
-static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
- enum reg_type type)
-{
- int regno = 0;
-
- static const int regoff[] = {
- offsetof(struct pt_regs, ax),
- offsetof(struct pt_regs, cx),
- offsetof(struct pt_regs, dx),
- offsetof(struct pt_regs, bx),
- offsetof(struct pt_regs, sp),
- offsetof(struct pt_regs, bp),
- offsetof(struct pt_regs, si),
- offsetof(struct pt_regs, di),
-#ifdef CONFIG_X86_64
- offsetof(struct pt_regs, r8),
- offsetof(struct pt_regs, r9),
- offsetof(struct pt_regs, r10),
- offsetof(struct pt_regs, r11),
- offsetof(struct pt_regs, r12),
- offsetof(struct pt_regs, r13),
- offsetof(struct pt_regs, r14),
- offsetof(struct pt_regs, r15),
-#endif
- };
- int nr_registers = ARRAY_SIZE(regoff);
- /*
- * Don't possibly decode a 32-bit instructions as
- * reading a 64-bit-only register.
- */
- if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
- nr_registers -= 8;
-
- switch (type) {
- case REG_TYPE_RM:
- regno = X86_MODRM_RM(insn->modrm.value);
- if (X86_REX_B(insn->rex_prefix.value))
- regno += 8;
- break;
-
- case REG_TYPE_INDEX:
- regno = X86_SIB_INDEX(insn->sib.value);
- if (X86_REX_X(insn->rex_prefix.value))
- regno += 8;
- break;
-
- case REG_TYPE_BASE:
- regno = X86_SIB_BASE(insn->sib.value);
- if (X86_REX_B(insn->rex_prefix.value))
- regno += 8;
- break;
-
- default:
- pr_err("invalid register type");
- BUG();
- break;
- }
-
- if (regno >= nr_registers) {
- WARN_ONCE(1, "decoded an instruction with an invalid register");
- return -EINVAL;
- }
- return regoff[regno];
-}
-
-/*
- * return the address being referenced be instruction
- * for rm=3 returning the content of the rm reg
- * for rm!=3 calculates the address using SIB and Disp
- */
-static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs)
-{
- unsigned long addr, base, indx;
- int addr_offset, base_offset, indx_offset;
- insn_byte_t sib;
-
- insn_get_modrm(insn);
- insn_get_sib(insn);
- sib = insn->sib.value;
-
- if (X86_MODRM_MOD(insn->modrm.value) == 3) {
- addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
- if (addr_offset < 0)
- goto out_err;
- addr = regs_get_register(regs, addr_offset);
- } else {
- if (insn->sib.nbytes) {
- base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
- if (base_offset < 0)
- goto out_err;
-
- indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
- if (indx_offset < 0)
- goto out_err;
-
- base = regs_get_register(regs, base_offset);
- indx = regs_get_register(regs, indx_offset);
- addr = base + indx * (1 << X86_SIB_SCALE(sib));
- } else {
- addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
- if (addr_offset < 0)
- goto out_err;
- addr = regs_get_register(regs, addr_offset);
- }
- addr += insn->displacement.value;
- }
- return (void __user *)addr;
-out_err:
- return (void __user *)-1;
-}
-
static int mpx_insn_decode(struct insn *insn,
struct pt_regs *regs)
{
info->si_signo = SIGSEGV;
info->si_errno = 0;
info->si_code = SEGV_BNDERR;
- info->si_addr = mpx_get_addr_ref(&insn, regs);
+ info->si_addr = insn_get_addr_ref(&insn, regs);
/*
* We were not able to extract an address from the instruction,
* probably because there was something invalid in it.
unsigned long start;
int ret;
- /* Nothing to do if the SME is not active */
- if (!sme_active())
+ /* Nothing to do if memory encryption is not active */
+ if (!mem_encrypt_active())
return 0;
/* Should not be working on unaligned addresses */
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/ucs2_string.h>
+#include <linux/mem_encrypt.h>
#include <asm/setup.h>
#include <asm/page.h>
* as trim_bios_range() will reserve the first page and isolate it away
* from memory allocators anyway.
*/
- if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, _PAGE_RW)) {
+ pf = _PAGE_RW;
+ if (sev_active())
+ pf |= _PAGE_ENC;
+
+ if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, pf)) {
pr_err("Failed to create 1:1 mapping for the first page!\n");
return 1;
}
if (!(md->attribute & EFI_MEMORY_WB))
flags |= _PAGE_PCD;
+ if (sev_active())
+ flags |= _PAGE_ENC;
+
pfn = md->phys_addr >> PAGE_SHIFT;
if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags))
pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
if (!(md->attribute & EFI_MEMORY_RO))
pf |= _PAGE_RW;
+ if (sev_active())
+ pf |= _PAGE_ENC;
+
return efi_update_mappings(md, pf);
}
(md->type != EFI_RUNTIME_SERVICES_CODE))
pf |= _PAGE_RW;
+ if (sev_active())
+ pf |= _PAGE_ENC;
+
efi_update_mappings(md, pf);
}
}
/*
* If SME is active, the trampoline area will need to be in
* decrypted memory in order to bring up other processors
- * successfully.
+ * successfully. This is not needed for SEV.
*/
- set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
+ if (sme_active())
+ set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
memcpy(base, real_mode_blob, size);
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <os.h>
mm->arch.ldt.entry_count = 0;
}
-int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+ unsigned long , bytecount)
{
- return do_modify_ldt_skas(func, ptr, bytecount);
+ /* See non-um modify_ldt() for why we do this cast */
+ return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount);
}
#ifdef CONFIG_X86_MCE
{ machine_check, xen_machine_check, true },
#endif
- { nmi, xen_nmi, true },
+ { nmi, xen_xennmi, true },
{ overflow, xen_overflow, false },
#ifdef CONFIG_IA32_EMULATION
{ entry_INT80_compat, xen_entry_INT80_compat, false },
}
}
-static void xen_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+static void xen_load_sp0(unsigned long sp0)
{
struct multicall_space mcs;
mcs = xen_mc_entry(0);
- MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
+ MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
- tss->x86_tss.sp0 = thread->sp0;
+ this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
}
void xen_set_iopl_mask(unsigned mask)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
-#if CONFIG_PGTABLE_LEVELS == 4
+#ifdef CONFIG_X86_64
__visible pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
-#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+#endif /* CONFIG_X86_64 */
static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
bool last, unsigned long limit)
{
- int i, nr, flush = 0;
+ int flush = 0;
+ pud_t *pud;
- nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
- for (i = 0; i < nr; i++) {
- pud_t *pud;
- if (p4d_none(p4d[i]))
- continue;
+ if (p4d_none(*p4d))
+ return flush;
- pud = pud_offset(&p4d[i], 0);
- if (PTRS_PER_PUD > 1)
- flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
- flush |= xen_pud_walk(mm, pud, func,
- last && i == nr - 1, limit);
- }
+ pud = pud_offset(p4d, 0);
+ if (PTRS_PER_PUD > 1)
+ flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
+ flush |= xen_pud_walk(mm, pud, func, last, limit);
return flush;
}
continue;
p4d = p4d_offset(&pgd[i], 0);
- if (PTRS_PER_P4D > 1)
- flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
}
{
pgd_t *pgd;
p4d_t *p4d;
- unsigned int i;
bool unpin;
unpin = (vaddr == 2 * PGDIR_SIZE);
vaddr &= PMD_MASK;
pgd = pgd_offset_k(vaddr);
p4d = p4d_offset(pgd, 0);
- for (i = 0; i < PTRS_PER_P4D; i++) {
- if (p4d_none(p4d[i]))
- continue;
- xen_cleanmfnmap_p4d(p4d + i, unpin);
- }
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- set_pgd(pgd, __pgd(0));
- xen_cleanmfnmap_free_pgtbl(p4d, unpin);
- }
+ if (!p4d_none(*p4d))
+ xen_cleanmfnmap_p4d(p4d, unpin);
}
static void __init xen_pagetable_p2m_free(void)
xen_release_ptpage(pfn, PT_PMD);
}
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
*/
void __init xen_relocate_p2m(void)
{
- phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
+ phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
- int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
+ int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
pte_t *pt;
pmd_t *pmd;
pud_t *pud;
- p4d_t *p4d = NULL;
pgd_t *pgd;
unsigned long *new_p2m;
int save_pud;
n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
- if (PTRS_PER_P4D > 1)
- n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
- else
- n_p4d = 0;
- n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;
+ n_frames = n_pte + n_pt + n_pmd + n_pud;
new_area = xen_find_free_area(PFN_PHYS(n_frames));
if (!new_area) {
* To avoid any possible virtual address collision, just use
* 2 * PUD_SIZE for the new area.
*/
- p4d_phys = new_area;
- pud_phys = p4d_phys + PFN_PHYS(n_p4d);
+ pud_phys = new_area;
pmd_phys = pud_phys + PFN_PHYS(n_pud);
pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
pgd = __va(read_cr3_pa());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
- idx_p4d = 0;
save_pud = n_pud;
- do {
- if (n_p4d > 0) {
- p4d = early_memremap(p4d_phys, PAGE_SIZE);
- clear_page(p4d);
- n_pud = min(save_pud, PTRS_PER_P4D);
- }
- for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
- pud = early_memremap(pud_phys, PAGE_SIZE);
- clear_page(pud);
- for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
- idx_pmd++) {
- pmd = early_memremap(pmd_phys, PAGE_SIZE);
- clear_page(pmd);
- for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
- idx_pt++) {
- pt = early_memremap(pt_phys, PAGE_SIZE);
- clear_page(pt);
- for (idx_pte = 0;
- idx_pte < min(n_pte, PTRS_PER_PTE);
- idx_pte++) {
- set_pte(pt + idx_pte,
- pfn_pte(p2m_pfn, PAGE_KERNEL));
- p2m_pfn++;
- }
- n_pte -= PTRS_PER_PTE;
- early_memunmap(pt, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pt_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
- PFN_DOWN(pt_phys));
- set_pmd(pmd + idx_pt,
- __pmd(_PAGE_TABLE | pt_phys));
- pt_phys += PAGE_SIZE;
+ for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+ pud = early_memremap(pud_phys, PAGE_SIZE);
+ clear_page(pud);
+ for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+ idx_pmd++) {
+ pmd = early_memremap(pmd_phys, PAGE_SIZE);
+ clear_page(pmd);
+ for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+ idx_pt++) {
+ pt = early_memremap(pt_phys, PAGE_SIZE);
+ clear_page(pt);
+ for (idx_pte = 0;
+ idx_pte < min(n_pte, PTRS_PER_PTE);
+ idx_pte++) {
+ set_pte(pt + idx_pte,
+ pfn_pte(p2m_pfn, PAGE_KERNEL));
+ p2m_pfn++;
}
- n_pt -= PTRS_PER_PMD;
- early_memunmap(pmd, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pmd_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
- PFN_DOWN(pmd_phys));
- set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
- pmd_phys += PAGE_SIZE;
+ n_pte -= PTRS_PER_PTE;
+ early_memunmap(pt, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pt_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+ PFN_DOWN(pt_phys));
+ set_pmd(pmd + idx_pt,
+ __pmd(_PAGE_TABLE | pt_phys));
+ pt_phys += PAGE_SIZE;
}
- n_pmd -= PTRS_PER_PUD;
- early_memunmap(pud, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pud_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
- if (n_p4d > 0)
- set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
- else
- set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
- pud_phys += PAGE_SIZE;
- }
- if (n_p4d > 0) {
- save_pud -= PTRS_PER_P4D;
- early_memunmap(p4d, PAGE_SIZE);
- make_lowmem_page_readonly(__va(p4d_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
- set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
- p4d_phys += PAGE_SIZE;
+ n_pt -= PTRS_PER_PMD;
+ early_memunmap(pmd, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pmd_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+ PFN_DOWN(pmd_phys));
+ set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+ pmd_phys += PAGE_SIZE;
}
- } while (++idx_p4d < n_p4d);
+ n_pmd -= PTRS_PER_PUD;
+ early_memunmap(pud, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pud_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+ set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+ pud_phys += PAGE_SIZE;
+ }
/* Now copy the old p2m info to the new area. */
memcpy(new_p2m, xen_p2m_addr, size);
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
pv_mmu_ops.set_p4d = xen_set_p4d;
#endif
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
pv_mmu_ops.release_pte = xen_release_pte;
pv_mmu_ops.release_pmd = xen_release_pmd;
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
pv_mmu_ops.alloc_pud = xen_alloc_pud;
pv_mmu_ops.release_pud = xen_release_pud;
#endif
.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
.pud_val = PV_CALLEE_SAVE(xen_pud_val),
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
.set_p4d = xen_set_p4d_hyper,
.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,
-#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+#endif /* CONFIG_X86_64 */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
* single-threaded.
*/
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/smp.h>
#endif
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
+ /*
+ * Bring up the CPU in cpu_bringup_and_idle() with the stack
+ * pointing just below where pt_regs would be if it were a normal
+ * kernel entry.
+ */
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
ctxt->user_regs.ss = __KERNEL_DS;
+ ctxt->user_regs.cs = __KERNEL_CS;
+ ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle);
xen_copy_trap_info(ctxt->trap_ctxt);
ctxt->gdt_frames[0] = gdt_mfn;
ctxt->gdt_ents = GDT_ENTRIES;
+ /*
+ * Set SS:SP that Xen will use when entering guest kernel mode
+ * from guest user mode. Subsequent calls to load_sp0() can
+ * change this value.
+ */
ctxt->kernel_ss = __KERNEL_DS;
- ctxt->kernel_sp = idle->thread.sp0;
+ ctxt->kernel_sp = task_top_of_stack(idle);
#ifdef CONFIG_X86_32
ctxt->event_callback_cs = __KERNEL_CS;
(unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_eip =
(unsigned long)xen_failsafe_callback;
- ctxt->user_regs.cs = __KERNEL_CS;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
- ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
BUG();
xen_pv_trap xendebug
xen_pv_trap int3
xen_pv_trap xenint3
-xen_pv_trap nmi
+xen_pv_trap xennmi
xen_pv_trap overflow
xen_pv_trap bounds
xen_pv_trap invalid_op
#include <asm/boot.h>
#include <asm/asm.h>
#include <asm/page_types.h>
+#include <asm/unwind_hints.h>
#include <xen/interface/elfnote.h>
#include <xen/interface/features.h>
#ifdef CONFIG_XEN_PV
__INIT
ENTRY(startup_xen)
+ UNWIND_HINT_EMPTY
cld
/* Clear .bss */
mov $init_thread_union+THREAD_SIZE, %_ASM_SP
jmp xen_start_kernel
-
+END(startup_xen)
__FINIT
#endif
.pushsection .text
.balign PAGE_SIZE
ENTRY(hypercall_page)
- .skip PAGE_SIZE
+ .rept (PAGE_SIZE / 32)
+ UNWIND_HINT_EMPTY
+ .skip 32
+ .endr
#define HYPERCALL(n) \
.equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
.type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
#include <asm/xen-hypercalls.h>
#undef HYPERCALL
-
+END(hypercall_page)
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
#define BUG_TABLE
#endif
-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
#define ORC_UNWIND_TABLE \
. = ALIGN(4); \
.orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \
#endif
/*
+ * Memory encryption operates on a page basis. Since we need to clear
+ * the memory encryption mask for this section, it needs to be aligned
+ * on a page boundary and be a page-size multiple in length.
+ *
+ * Note: We use a separate section so that only this section gets
+ * decrypted to avoid exposing more than we wish.
+ */
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#define PERCPU_DECRYPTED_SECTION \
+ . = ALIGN(PAGE_SIZE); \
+ *(.data..percpu..decrypted) \
+ . = ALIGN(PAGE_SIZE);
+#else
+#define PERCPU_DECRYPTED_SECTION
+#endif
+
+
+/*
* Default discarded sections.
*
* Some archs want to discard exit text/data at runtime rather than
. = ALIGN(cacheline); \
*(.data..percpu) \
*(.data..percpu..shared_aligned) \
+ PERCPU_DECRYPTED_SECTION \
VMLINUX_SYMBOL(__per_cpu_end) = .;
/**
walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *));
extern int
+walk_mem_res(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *));
+extern int
walk_system_ram_res(u64 start, u64 end, void *arg,
- int (*func)(u64, u64, void *));
+ int (*func)(struct resource *, void *));
extern int
walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
- void *arg, int (*func)(u64, u64, void *));
+ void *arg, int (*func)(struct resource *, void *));
/* True if any part of r1 overlaps r2 */
static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
};
int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
- int (*func)(u64, u64, void *));
+ int (*func)(struct resource *, void *));
extern int kexec_add_buffer(struct kexec_buf *kbuf);
int kexec_locate_mem_hole(struct kexec_buf *kbuf);
#endif /* CONFIG_KEXEC_FILE */
#define sme_me_mask 0ULL
+static inline bool sme_active(void) { return false; }
+static inline bool sev_active(void) { return false; }
+
#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */
-static inline bool sme_active(void)
+static inline bool mem_encrypt_active(void)
{
- return !!sme_me_mask;
+ return sme_me_mask;
}
static inline u64 sme_get_me_mask(void)
void vmemmap_free(unsigned long start, unsigned long end);
#endif
void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
- unsigned long size);
+ unsigned long nr_pages);
enum mf_flags {
MF_COUNT_INCREASED = 1 << 0,
#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)
#ifdef CONFIG_SPARSEMEM_EXTREME
-extern struct mem_section *mem_section[NR_SECTION_ROOTS];
+extern struct mem_section **mem_section;
#else
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
+#ifdef CONFIG_SPARSEMEM_EXTREME
+ if (!mem_section)
+ return NULL;
+#endif
if (!mem_section[SECTION_NR_TO_ROOT(nr)])
return NULL;
return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
DEFINE_PER_CPU_SECTION(type, name, "..read_mostly")
/*
+ * Declaration/definition used for per-CPU variables that should be accessed
+ * as decrypted when memory encryption is enabled in the guest.
+ */
+#if defined(CONFIG_VIRTUALIZATION) && defined(CONFIG_AMD_MEM_ENCRYPT)
+
+#define DECLARE_PER_CPU_DECRYPTED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, "..decrypted")
+
+#define DEFINE_PER_CPU_DECRYPTED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, "..decrypted")
+#else
+#define DEFINE_PER_CPU_DECRYPTED(type, name) DEFINE_PER_CPU(type, name)
+#endif
+
+/*
* Intermodule exports for per-CPU variables. sparse forgets about
* address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to
* noop if __CHECKER__.
* mutex protecting text section modification (dynamic code patching).
* some users need to sleep (allocating memory...) while they hold this lock.
*
+ * Note: Also protects SMP-alternatives modification on x86.
+ *
* NOT exported to modules - patching kernel text is a really delicate matter.
*/
DEFINE_MUTEX(text_mutex);
return 1;
}
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+static int locate_mem_hole_callback(struct resource *res, void *arg)
{
struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+ u64 start = res->start, end = res->end;
unsigned long sz = end - start + 1;
/* Returning 0 will take to next memory range */
* func returning non-zero, then zero will be returned.
*/
int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
- int (*func)(u64, u64, void *))
+ int (*func)(struct resource *, void *))
{
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return walk_iomem_res_desc(crashk_res.desc,
res->start = p->start;
if (res->end > p->end)
res->end = p->end;
+ res->flags = p->flags;
+ res->desc = p->desc;
return 0;
}
+static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
+ bool first_level_children_only,
+ void *arg,
+ int (*func)(struct resource *, void *))
+{
+ u64 orig_end = res->end;
+ int ret = -1;
+
+ while ((res->start < res->end) &&
+ !find_next_iomem_res(res, desc, first_level_children_only)) {
+ ret = (*func)(res, arg);
+ if (ret)
+ break;
+
+ res->start = res->end + 1;
+ res->end = orig_end;
+ }
+
+ return ret;
+}
+
/*
* Walks through iomem resources and calls func() with matching resource
* ranges. This walks through whole tree and not just first level children.
* <linux/ioport.h> and set it in 'desc' of a target resource entry.
*/
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
- u64 end, void *arg, int (*func)(u64, u64, void *))
+ u64 end, void *arg, int (*func)(struct resource *, void *))
{
struct resource res;
- u64 orig_end;
- int ret = -1;
res.start = start;
res.end = end;
res.flags = flags;
- orig_end = res.end;
-
- while ((res.start < res.end) &&
- (!find_next_iomem_res(&res, desc, false))) {
-
- ret = (*func)(res.start, res.end, arg);
- if (ret)
- break;
-
- res.start = res.end + 1;
- res.end = orig_end;
- }
- return ret;
+ return __walk_iomem_res_desc(&res, desc, false, arg, func);
}
/*
* ranges.
*/
int walk_system_ram_res(u64 start, u64 end, void *arg,
- int (*func)(u64, u64, void *))
+ int (*func)(struct resource *, void *))
{
struct resource res;
- u64 orig_end;
- int ret = -1;
res.start = start;
res.end = end;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- orig_end = res.end;
- while ((res.start < res.end) &&
- (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) {
- ret = (*func)(res.start, res.end, arg);
- if (ret)
- break;
- res.start = res.end + 1;
- res.end = orig_end;
- }
- return ret;
+
+ return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
+ arg, func);
+}
+
+/*
+ * This function calls the @func callback against all memory ranges, which
+ * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
+ */
+int walk_mem_res(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ struct resource res;
+
+ res.start = start;
+ res.end = end;
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
+ return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
+ arg, func);
}
#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
{
return 1;
}
+
/*
* This generic page_is_ram() returns true if specified address is
* registered as System RAM in iomem_resource list.
that runtime stack traces are more reliable.
This is also a prerequisite for generation of ORC unwind data, which
- is needed for CONFIG_ORC_UNWINDER.
+ is needed for CONFIG_UNWINDER_ORC.
For more information, see
tools/objtool/Documentation/stack-validation.txt.
if (no_iotlb_memory)
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
- if (sme_active())
- pr_warn_once("SME is active and system is using DMA bounce buffers\n");
+ if (mem_encrypt_active())
+ pr_warn_once("%s is active and system is using DMA bounce buffers\n",
+ sme_active() ? "SME" : "SEV");
mask = dma_get_seg_boundary(hwdev);
return 1;
}
+static void gup_pgd_range(unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pgd_t *pgdp;
+
+ pgdp = pgd_offset(current->mm, addr);
+ do {
+ pgd_t pgd = READ_ONCE(*pgdp);
+
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ return;
+ if (unlikely(pgd_huge(pgd))) {
+ if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
+ pages, nr))
+ return;
+ } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
+ if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
+ PGDIR_SHIFT, next, write, pages, nr))
+ return;
+ } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
+ return;
+ } while (pgdp++, addr = next, addr != end);
+}
+
+#ifndef gup_fast_permitted
+/*
+ * Check if it's allowed to use __get_user_pages_fast() for the range, or
+ * we need to fall back to the slow version:
+ */
+bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
+{
+ unsigned long len, end;
+
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+ return end >= start;
+}
+#endif
+
/*
* Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
* the regular GUP. It will only return non-negative values.
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
- struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
- unsigned long next, flags;
- pgd_t *pgdp;
+ unsigned long flags;
int nr = 0;
start &= PAGE_MASK;
* block IPIs that come from THPs splitting.
*/
- local_irq_save(flags);
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = READ_ONCE(*pgdp);
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- break;
- if (unlikely(pgd_huge(pgd))) {
- if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
- pages, &nr))
- break;
- } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
- if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, write, pages, &nr))
- break;
- } else if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
- break;
- } while (pgdp++, addr = next, addr != end);
- local_irq_restore(flags);
+ if (gup_fast_permitted(start, nr_pages, write)) {
+ local_irq_save(flags);
+ gup_pgd_range(addr, end, write, pages, &nr);
+ local_irq_restore(flags);
+ }
return nr;
}
-#ifndef gup_fast_permitted
-/*
- * Check if it's allowed to use __get_user_pages_fast() for the range, or
- * we need to fall back to the slow version:
- */
-bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
-{
- unsigned long len, end;
-
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
- return end >= start;
-}
-#endif
-
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
+ unsigned long addr, len, end;
int nr = 0, ret = 0;
start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+
+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ (void __user *)start, len)))
+ return 0;
if (gup_fast_permitted(start, nr_pages, write)) {
- nr = __get_user_pages_fast(start, nr_pages, write, pages);
+ local_irq_disable();
+ gup_pgd_range(addr, end, write, pages, &nr);
+ local_irq_enable();
ret = nr;
}
* 1) mem_section - memory sections, mem_map's for valid memory
*/
#ifdef CONFIG_SPARSEMEM_EXTREME
-struct mem_section *mem_section[NR_SECTION_ROOTS]
- ____cacheline_internodealigned_in_smp;
+struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
____cacheline_internodealigned_in_smp;
int __section_nr(struct mem_section* ms)
{
unsigned long root_nr;
- struct mem_section* root;
+ struct mem_section *root = NULL;
for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
break;
}
- VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
+ VM_BUG_ON(!root);
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
{
unsigned long pfn;
+#ifdef CONFIG_SPARSEMEM_EXTREME
+ if (unlikely(!mem_section)) {
+ unsigned long size, align;
+
+ size = sizeof(struct mem_section) * NR_SECTION_ROOTS;
+ align = 1 << (INTERNODE_CACHE_SHIFT);
+ mem_section = memblock_virt_alloc(size, align);
+ }
+#endif
+
start &= PAGE_SECTION_MASK;
mminit_validate_memmodel_limits(&start, &end);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
{
unsigned long usemap_snr, pgdat_snr;
- static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
- static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
+ static unsigned long old_usemap_snr;
+ static unsigned long old_pgdat_snr;
struct pglist_data *pgdat = NODE_DATA(nid);
int usemap_nid;
+ /* First call */
+ if (!old_usemap_snr) {
+ old_usemap_snr = NR_MEM_SECTIONS;
+ old_pgdat_snr = NR_MEM_SECTIONS;
+ }
+
usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
if (usemap_snr == pgdat_snr)
__objtool_obj := $(objtree)/tools/objtool/objtool
-objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check)
+objtool_args = $(if $(CONFIG_UNWINDER_ORC),orc generate,check)
ifndef CONFIG_FRAME_POINTER
objtool_args += --no-fp
if (insn->dead_end)
return 0;
- insn = next_insn;
- if (!insn) {
+ if (!next_insn) {
+ if (state.cfa.base == CFI_UNDEFINED)
+ return 0;
WARN("%s: unexpected end of section", sec->name);
return 1;
}
+
+ insn = next_insn;
}
return 0;
printf("\n");
- exit(1);
+ exit(129);
}
static void handle_options(int *argc, const char ***argv)
break;
} else {
fprintf(stderr, "Unknown option: %s\n", cmd);
- fprintf(stderr, "\n Usage: %s\n",
- objtool_usage_string);
- exit(1);
+ cmd_usage();
}
(*argv)++;
"int3\n\t"
"vmcode_int80:\n\t"
"int $0x80\n\t"
+ "vmcode_umip:\n\t"
+ /* addressing via displacements */
+ "smsw (2052)\n\t"
+ "sidt (2054)\n\t"
+ "sgdt (2060)\n\t"
+ /* addressing via registers */
+ "mov $2066, %bx\n\t"
+ "smsw (%bx)\n\t"
+ "mov $2068, %bx\n\t"
+ "sidt (%bx)\n\t"
+ "mov $2074, %bx\n\t"
+ "sgdt (%bx)\n\t"
+ /* register operands, only for smsw */
+ "smsw %ax\n\t"
+ "mov %ax, (2080)\n\t"
+ "int3\n\t"
+ "vmcode_umip_str:\n\t"
+ "str %eax\n\t"
+ "vmcode_umip_sldt:\n\t"
+ "sldt %eax\n\t"
+ "int3\n\t"
".size vmcode, . - vmcode\n\t"
"end_vmcode:\n\t"
".code32\n\t"
extern unsigned char vmcode[], end_vmcode[];
extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[],
- vmcode_sti[], vmcode_int3[], vmcode_int80[];
+ vmcode_sti[], vmcode_int3[], vmcode_int80[], vmcode_umip[],
+ vmcode_umip_str[], vmcode_umip_sldt[];
/* Returns false if the test was skipped. */
static bool do_test(struct vm86plus_struct *v86, unsigned long eip,
return true;
}
+void do_umip_tests(struct vm86plus_struct *vm86, unsigned char *test_mem)
+{
+ struct table_desc {
+ unsigned short limit;
+ unsigned long base;
+ } __attribute__((packed));
+
+ /* Initialize variables with arbitrary values */
+ struct table_desc gdt1 = { .base = 0x3c3c3c3c, .limit = 0x9999 };
+ struct table_desc gdt2 = { .base = 0x1a1a1a1a, .limit = 0xaeae };
+ struct table_desc idt1 = { .base = 0x7b7b7b7b, .limit = 0xf1f1 };
+ struct table_desc idt2 = { .base = 0x89898989, .limit = 0x1313 };
+ unsigned short msw1 = 0x1414, msw2 = 0x2525, msw3 = 3737;
+
+ /* UMIP -- exit with INT3 unless kernel emulation did not trap #GP */
+ do_test(vm86, vmcode_umip - vmcode, VM86_TRAP, 3, "UMIP tests");
+
+ /* Results from displacement-only addressing */
+ msw1 = *(unsigned short *)(test_mem + 2052);
+ memcpy(&idt1, test_mem + 2054, sizeof(idt1));
+ memcpy(&gdt1, test_mem + 2060, sizeof(gdt1));
+
+ /* Results from register-indirect addressing */
+ msw2 = *(unsigned short *)(test_mem + 2066);
+ memcpy(&idt2, test_mem + 2068, sizeof(idt2));
+ memcpy(&gdt2, test_mem + 2074, sizeof(gdt2));
+
+ /* Results when using register operands */
+ msw3 = *(unsigned short *)(test_mem + 2080);
+
+ printf("[INFO]\tResult from SMSW:[0x%04x]\n", msw1);
+ printf("[INFO]\tResult from SIDT: limit[0x%04x]base[0x%08lx]\n",
+ idt1.limit, idt1.base);
+ printf("[INFO]\tResult from SGDT: limit[0x%04x]base[0x%08lx]\n",
+ gdt1.limit, gdt1.base);
+
+ if (msw1 != msw2 || msw1 != msw3)
+ printf("[FAIL]\tAll the results of SMSW should be the same.\n");
+ else
+ printf("[PASS]\tAll the results from SMSW are identical.\n");
+
+ if (memcmp(&gdt1, &gdt2, sizeof(gdt1)))
+ printf("[FAIL]\tAll the results of SGDT should be the same.\n");
+ else
+ printf("[PASS]\tAll the results from SGDT are identical.\n");
+
+ if (memcmp(&idt1, &idt2, sizeof(idt1)))
+ printf("[FAIL]\tAll the results of SIDT should be the same.\n");
+ else
+ printf("[PASS]\tAll the results from SIDT are identical.\n");
+
+ sethandler(SIGILL, sighandler, 0);
+ do_test(vm86, vmcode_umip_str - vmcode, VM86_SIGNAL, 0,
+ "STR instruction");
+ clearhandler(SIGILL);
+
+ sethandler(SIGILL, sighandler, 0);
+ do_test(vm86, vmcode_umip_sldt - vmcode, VM86_SIGNAL, 0,
+ "SLDT instruction");
+ clearhandler(SIGILL);
+}
+
int main(void)
{
struct vm86plus_struct v86;
v86.regs.eax = (unsigned int)-1;
do_test(&v86, vmcode_int80 - vmcode, VM86_INTx, 0x80, "int80");
+ /* UMIP -- should exit with INTx 0x80 unless UMIP was not disabled */
+ do_umip_tests(&v86, addr);
+
/* Execute a null pointer */
v86.regs.cs = 0;
v86.regs.ss = 0;
return;
}
- if (ar != expected_ar) {
+ /* The SDM says "bits 19:16 are undefined". Thanks. */
+ ar &= ~0xF0000;
+
+ /*
+ * NB: Different Linux versions do different things with the
+ * accessed bit in set_thread_area().
+ */
+ if (ar != expected_ar &&
+ (ldt || ar != (expected_ar | AR_ACCESSED))) {
printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
(ldt ? "LDT" : "GDT"), index, ar, expected_ar);
nerrs++;
}
}
-static bool install_valid_mode(const struct user_desc *desc, uint32_t ar,
- bool oldmode)
+static bool install_valid_mode(const struct user_desc *d, uint32_t ar,
+ bool oldmode, bool ldt)
{
- int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
- desc, sizeof(*desc));
- if (ret < -1)
- errno = -ret;
+ struct user_desc desc = *d;
+ int ret;
+
+ if (!ldt) {
+#ifndef __i386__
+ /* No point testing set_thread_area in a 64-bit build */
+ return false;
+#endif
+ if (!gdt_entry_num)
+ return false;
+ desc.entry_number = gdt_entry_num;
+
+ ret = syscall(SYS_set_thread_area, &desc);
+ } else {
+ ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
+ &desc, sizeof(desc));
+
+ if (ret < -1)
+ errno = -ret;
+
+ if (ret != 0 && errno == ENOSYS) {
+ printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+ return false;
+ }
+ }
+
if (ret == 0) {
- uint32_t limit = desc->limit;
- if (desc->limit_in_pages)
+ uint32_t limit = desc.limit;
+ if (desc.limit_in_pages)
limit = (limit << 12) + 4095;
- check_valid_segment(desc->entry_number, 1, ar, limit, true);
+ check_valid_segment(desc.entry_number, ldt, ar, limit, true);
return true;
- } else if (errno == ENOSYS) {
- printf("[OK]\tmodify_ldt returned -ENOSYS\n");
- return false;
} else {
- if (desc->seg_32bit) {
- printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
+ if (desc.seg_32bit) {
+ printf("[FAIL]\tUnexpected %s failure %d\n",
+ ldt ? "modify_ldt" : "set_thread_area",
errno);
nerrs++;
return false;
} else {
- printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
+ printf("[OK]\t%s rejected 16 bit segment\n",
+ ldt ? "modify_ldt" : "set_thread_area");
return false;
}
}
static bool install_valid(const struct user_desc *desc, uint32_t ar)
{
- return install_valid_mode(desc, ar, false);
+ bool ret = install_valid_mode(desc, ar, false, true);
+
+ if (desc->contents <= 1 && desc->seg_32bit &&
+ !desc->seg_not_present) {
+ /* Should work in the GDT, too. */
+ install_valid_mode(desc, ar, false, false);
+ }
+
+ return ret;
}
static void install_invalid(const struct user_desc *desc, bool oldmode)
install_invalid(&desc, false);
desc.seg_not_present = 0;
- desc.read_exec_only = 0;
desc.seg_32bit = 1;
+ desc.read_exec_only = 0;
+ desc.limit = 0xfffff;
+
install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB);
+
+ desc.limit_in_pages = 1;
+
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB | AR_G);
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P | AR_DB | AR_G);
+ desc.contents = 1;
+ desc.read_exec_only = 0;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN | AR_S | AR_P | AR_DB | AR_G);
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN | AR_S | AR_P | AR_DB | AR_G);
+
+ desc.limit = 0;
install_invalid(&desc, true);
}
#define u64 uint64_t
#ifdef __i386__
-#define SYS_mprotect_key 380
-#define SYS_pkey_alloc 381
-#define SYS_pkey_free 382
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key 380
+#endif
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc 381
+# define SYS_pkey_free 382
+#endif
#define REG_IP_IDX REG_EIP
#define si_pkey_offset 0x14
+
#else
-#define SYS_mprotect_key 329
-#define SYS_pkey_alloc 330
-#define SYS_pkey_free 331
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key 329
+#endif
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc 330
+# define SYS_pkey_free 331
+#endif
#define REG_IP_IDX REG_RIP
#define si_pkey_offset 0x20
+
#endif
void dump_mem(void *dumpme, int len_bytes)