From: Paolo Bonzini Date: Tue, 31 Mar 2020 14:44:53 +0000 (-0400) Subject: Merge tag 'kvmarm-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm... X-Git-Tag: v5.10.7~2921^2~13 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=cf39d37539068d53e015d8b4f1dcf42c65306b0d;hp=-c;p=platform%2Fkernel%2Flinux-rpi.git Merge tag 'kvmarm-5.7' of git://git./linux/kernel/git/kvmarm/kvmarm into HEAD KVM/arm updates for Linux 5.7 - GICv4.1 support - 32bit host removal --- cf39d37539068d53e015d8b4f1dcf42c65306b0d diff --combined Documentation/admin-guide/kernel-parameters.txt index b0beae9,c07815d..144c130 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@@ -136,6 -136,10 +136,10 @@@ dynamic table installation which will install SSDT tables to /sys/firmware/acpi/tables/dynamic. + acpi_no_watchdog [HW,ACPI,WDT] + Ignore the ACPI-based watchdog interface (WDAT) and let + a native driver control the watchdog device instead. + acpi_rsdp= [ACPI,EFI,KEXEC] Pass the RSDP address to the kernel, mostly used on machines running EFI runtime service to boot the @@@ -3795,11 -3799,6 +3799,11 @@@ before loading. See Documentation/admin-guide/blockdev/ramdisk.rst. + prot_virt= [S390] enable hosting protected virtual machines + isolated from the hypervisor (if hardware supports + that). + Format: + psi= [KNL] Enable or disable pressure stall information tracking. Format: diff --combined MAINTAINERS index 97a7064,e84a94e..d87d009 --- a/MAINTAINERS +++ b/MAINTAINERS @@@ -3649,6 -3649,7 +3649,7 @@@ F: sound/pci/oxygen C-SKY ARCHITECTURE M: Guo Ren + L: linux-csky@vger.kernel.org T: git https://github.com/c-sky/csky-linux.git S: Supported F: arch/csky/ @@@ -3909,7 -3910,7 +3910,7 @@@ S: Supporte F: Documentation/filesystems/ceph.txt F: fs/ceph/ - CERTIFICATE HANDLING: + CERTIFICATE HANDLING M: David Howells M: David Woodhouse L: keyrings@vger.kernel.org @@@ -3919,7 -3920,7 +3920,7 @@@ F: certs F: scripts/sign-file.c F: scripts/extract-cert.c - CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: + CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM L: devel@driverdev.osuosl.org S: Obsolete F: drivers/staging/wusbcore/ @@@ -5932,12 -5933,12 +5933,12 @@@ S: Maintaine F: drivers/media/dvb-frontends/ec100* ECRYPT FILE SYSTEM - M: Tyler Hicks + M: Tyler Hicks L: ecryptfs@vger.kernel.org W: http://ecryptfs.org W: https://launchpad.net/ecryptfs T: git git://git.kernel.org/pub/scm/linux/kernel/git/tyhicks/ecryptfs.git - S: Supported + S: Odd Fixes F: Documentation/filesystems/ecryptfs.txt F: fs/ecryptfs/ @@@ -7047,7 -7048,7 +7048,7 @@@ L: kvm@vger.kernel.or S: Supported F: drivers/uio/uio_pci_generic.c - GENERIC VDSO LIBRARY: + GENERIC VDSO LIBRARY M: Andy Lutomirski M: Thomas Gleixner M: Vincenzo Frascino @@@ -8392,7 -8393,7 +8393,7 @@@ M: Joonas Lahtinen L: intel-gfx@lists.freedesktop.org W: https://01.org/linuxgraphics/ - B: https://01.org/linuxgraphics/documentation/how-report-bugs + B: https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs C: irc://chat.freenode.net/intel-gfx Q: http://patchwork.freedesktop.org/project/intel-gfx/ T: git git://anongit.freedesktop.org/drm-intel @@@ -9163,7 -9164,7 +9164,7 @@@ F: virt/kvm/ F: tools/kvm/ F: tools/testing/selftests/kvm/ - KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64) + KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64) M: Marc Zyngier R: James Morse R: Julien Thierry @@@ -9172,9 -9173,6 +9173,6 @@@ L: linux-arm-kernel@lists.infradead.or L: kvmarm@lists.cs.columbia.edu T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git S: Maintained - F: arch/arm/include/uapi/asm/kvm* - F: arch/arm/include/asm/kvm* - F: arch/arm/kvm/ F: arch/arm64/include/uapi/asm/kvm* F: arch/arm64/include/asm/kvm* F: arch/arm64/kvm/ @@@ -9209,7 -9207,6 +9207,7 @@@ L: kvm@vger.kernel.or W: http://www.ibm.com/developerworks/linux/linux390/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git S: Supported +F: Documentation/virt/kvm/s390* F: arch/s390/include/uapi/asm/kvm* F: arch/s390/include/asm/gmap.h F: arch/s390/include/asm/kvm* @@@ -9279,7 -9276,7 +9277,7 @@@ F: include/keys/trusted-type. F: security/keys/trusted.c F: include/keys/trusted.h - KEYS/KEYRINGS: + KEYS/KEYRINGS M: David Howells M: Jarkko Sakkinen L: keyrings@vger.kernel.org @@@ -11115,14 -11112,12 +11113,12 @@@ S: Maintaine F: drivers/usb/image/microtek.* MIPS - M: Ralf Baechle - M: Paul Burton + M: Thomas Bogendoerfer L: linux-mips@vger.kernel.org W: http://www.linux-mips.org/ - T: git git://git.linux-mips.org/pub/scm/ralf/linux.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git Q: http://patchwork.linux-mips.org/project/linux-mips/list/ - S: Supported + S: Maintained F: Documentation/devicetree/bindings/mips/ F: Documentation/mips/ F: arch/mips/ @@@ -11485,7 -11480,7 +11481,7 @@@ F: drivers/scsi/mac_scsi. F: drivers/scsi/sun3_scsi.* F: drivers/scsi/sun3_scsi_vme.c - NCSI LIBRARY: + NCSI LIBRARY M: Samuel Mendoza-Jonas S: Maintained F: net/ncsi/ @@@ -12741,7 -12736,7 +12737,7 @@@ M: Tom Joseph @@@ -13513,7 -13508,7 +13509,7 @@@ L: linuxppc-dev@lists.ozlabs.or S: Maintained F: drivers/block/ps3vram.c - PSAMPLE PACKET SAMPLING SUPPORT: + PSAMPLE PACKET SAMPLING SUPPORT M: Yotam Gigi S: Maintained F: net/psample @@@ -14583,10 -14578,10 +14579,10 @@@ F: drivers/media/pci/saa7146 F: include/media/drv-intf/saa7146* SAFESETID SECURITY MODULE - M: Micah Morton - S: Supported - F: security/safesetid/ - F: Documentation/admin-guide/LSM/SafeSetID.rst + M: Micah Morton + S: Supported + F: security/safesetid/ + F: Documentation/admin-guide/LSM/SafeSetID.rst SAMSUNG AUDIO (ASoC) DRIVERS M: Krzysztof Kozlowski @@@ -16553,8 -16548,8 +16549,8 @@@ M: Michael Jamet M: Yehezkel Bernat L: linux-usb@vger.kernel.org - T: git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git S: Maintained + T: git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git F: Documentation/admin-guide/thunderbolt.rst F: drivers/thunderbolt/ F: include/linux/thunderbolt.h @@@ -17081,7 -17076,7 +17077,7 @@@ S: Maintaine F: Documentation/admin-guide/ufs.rst F: fs/ufs/ - UHID USERSPACE HID IO DRIVER: + UHID USERSPACE HID IO DRIVER M: David Herrmann L: linux-input@vger.kernel.org S: Maintained @@@ -17095,18 -17090,18 +17091,18 @@@ S: Maintaine F: drivers/usb/common/ulpi.c F: include/linux/ulpi/ - ULTRA-WIDEBAND (UWB) SUBSYSTEM: + ULTRA-WIDEBAND (UWB) SUBSYSTEM L: devel@driverdev.osuosl.org S: Obsolete F: drivers/staging/uwb/ - UNICODE SUBSYSTEM: + UNICODE SUBSYSTEM M: Gabriel Krisman Bertazi L: linux-fsdevel@vger.kernel.org S: Supported F: fs/unicode/ - UNICORE32 ARCHITECTURE: + UNICORE32 ARCHITECTURE M: Guan Xuetao W: http://mprc.pku.edu.cn/~guanxuetao/linux S: Maintained @@@ -17393,11 -17388,14 +17389,14 @@@ F: drivers/usb F: include/linux/usb.h F: include/linux/usb/ - USB TYPEC PI3USB30532 MUX DRIVER - M: Hans de Goede + USB TYPEC BUS FOR ALTERNATE MODES + M: Heikki Krogerus L: linux-usb@vger.kernel.org S: Maintained - F: drivers/usb/typec/mux/pi3usb30532.c + F: Documentation/ABI/testing/sysfs-bus-typec + F: Documentation/driver-api/usb/typec_bus.rst + F: drivers/usb/typec/altmodes/ + F: include/linux/usb/typec_altmode.h USB TYPEC CLASS M: Heikki Krogerus @@@ -17408,14 -17406,11 +17407,11 @@@ F: Documentation/driver-api/usb/typec.r F: drivers/usb/typec/ F: include/linux/usb/typec.h - USB TYPEC BUS FOR ALTERNATE MODES - M: Heikki Krogerus + USB TYPEC PI3USB30532 MUX DRIVER + M: Hans de Goede L: linux-usb@vger.kernel.org S: Maintained - F: Documentation/ABI/testing/sysfs-bus-typec - F: Documentation/driver-api/usb/typec_bus.rst - F: drivers/usb/typec/altmodes/ - F: include/linux/usb/typec_altmode.h + F: drivers/usb/typec/mux/pi3usb30532.c USB TYPEC PORT CONTROLLER DRIVERS M: Guenter Roeck @@@ -17792,7 -17787,7 +17788,7 @@@ F: include/linux/vbox_utils. F: include/uapi/linux/vbox*.h F: drivers/virt/vboxguest/ - VIRTUAL BOX SHARED FOLDER VFS DRIVER: + VIRTUAL BOX SHARED FOLDER VFS DRIVER M: Hans de Goede L: linux-fsdevel@vger.kernel.org S: Maintained diff --combined arch/arm64/kvm/hyp/switch.c index f3e0ab9,925086b..600010cd6 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@@ -17,6 -17,7 +17,6 @@@ #include #include #include -#include #include #include #include @@@ -624,7 -625,7 +624,7 @@@ static void __hyp_text __pmu_switch_to_ } /* Switch to the guest for VHE systems running in EL2 */ - int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) + static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; struct kvm_cpu_context *guest_ctxt; @@@ -677,7 -678,42 +677,42 @@@ return exit_code; } - NOKPROBE_SYMBOL(kvm_vcpu_run_vhe); + NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe); + + int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) + { + int ret; + + local_daif_mask(); + + /* + * Having IRQs masked via PMR when entering the guest means the GIC + * will not signal the CPU of interrupts of lower priority, and the + * only way to get out will be via guest exceptions. + * Naturally, we want to avoid this. + * + * local_daif_mask() already sets GIC_PRIO_PSR_I_SET, we just need a + * dsb to ensure the redistributor is forwards EL2 IRQs to the CPU. + */ + pmr_sync(); + + ret = __kvm_vcpu_run_vhe(vcpu); + + /* + * local_daif_restore() takes care to properly restore PSTATE.DAIF + * and the GIC PMR if the host is using IRQ priorities. + */ + local_daif_restore(DAIF_PROCCTX_NOIRQ); + + /* + * When we exit from the guest we change a number of CPU configuration + * parameters, such as traps. Make sure these changes take effect + * before running the host or additional guests. + */ + isb(); + + return ret; + } /* Switch to the guest for legacy non-VHE systems */ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu) diff --combined arch/s390/boot/Makefile index 30f1811,0ff9261..45b33b8 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@@ -37,7 -37,7 +37,7 @@@ CFLAGS_sclp_early_core.o += -I$(srctree obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o obj-y += version.o pgm_check_info.o ctype.o text_dma.o -obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) += uv.o +obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) @@@ -70,7 -70,7 +70,7 @@@ $(obj)/compressed/vmlinux: $(obj)/start $(obj)/startup.a: $(OBJECTS) FORCE $(call if_changed,ar) - install: $(CONFIGURE) $(obj)/bzImage + install: sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ System.map "$(INSTALL_PATH)" diff --combined arch/s390/include/asm/page.h index 4ebcf89,1019efd..62440a8 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@@ -42,7 -42,7 +42,7 @@@ void __storage_key_init_range(unsigned static inline void storage_key_init_range(unsigned long start, unsigned long end) { - if (PAGE_DEFAULT_KEY) + if (PAGE_DEFAULT_KEY != 0) __storage_key_init_range(start, end); } @@@ -153,11 -153,6 +153,11 @@@ static inline int devmem_is_allowed(uns #define HAVE_ARCH_FREE_PAGE #define HAVE_ARCH_ALLOC_PAGE +#if IS_ENABLED(CONFIG_PGSTE) +int arch_make_page_accessible(struct page *page); +#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE +#endif + #endif /* !__ASSEMBLY__ */ #define __PAGE_OFFSET 0x0UL diff --combined arch/x86/kvm/svm.c index 2125c6a,24c0b2b..05cb45b --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@@ -57,11 -57,13 +57,13 @@@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); + #ifdef MODULE static const struct x86_cpu_id svm_cpu_id[] = { X86_FEATURE_MATCH(X86_FEATURE_SVM), {} }; MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); + #endif #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 @@@ -519,31 -521,10 +521,31 @@@ static void recalc_intercepts(struct vc h = &svm->nested.hsave->control; g = &svm->nested; - c->intercept_cr = h->intercept_cr | g->intercept_cr; - c->intercept_dr = h->intercept_dr | g->intercept_dr; - c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; - c->intercept = h->intercept | g->intercept; + c->intercept_cr = h->intercept_cr; + c->intercept_dr = h->intercept_dr; + c->intercept_exceptions = h->intercept_exceptions; + c->intercept = h->intercept; + + if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { + /* We only want the cr8 intercept bits of L1 */ + c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ); + c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE); + + /* + * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not + * affect any interrupt we may want to inject; therefore, + * interrupt window vmexits are irrelevant to L0. + */ + c->intercept &= ~(1ULL << INTERCEPT_VINTR); + } + + /* We don't want to see VMMCALLs from a nested guest */ + c->intercept &= ~(1ULL << INTERCEPT_VMMCALL); + + c->intercept_cr |= g->intercept_cr; + c->intercept_dr |= g->intercept_dr; + c->intercept_exceptions |= g->intercept_exceptions; + c->intercept |= g->intercept; } static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) @@@ -648,11 -629,6 +650,11 @@@ static inline void clr_intercept(struc recalc_intercepts(svm); } +static inline bool is_intercept(struct vcpu_svm *svm, int bit) +{ + return (svm->vmcb->control.intercept & (1ULL << bit)) != 0; +} + static inline bool vgif_enabled(struct vcpu_svm *svm) { return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK); @@@ -1232,7 -1208,6 +1234,7 @@@ static int avic_ga_log_notifier(u32 ga_ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); + trace_kvm_avic_ga_log(vm_id, vcpu_id); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { @@@ -1394,29 -1369,6 +1396,29 @@@ static void svm_hardware_teardown(void iopm_base = 0; } +static __init void svm_set_cpu_caps(void) +{ + kvm_set_cpu_caps(); + + supported_xss = 0; + + /* CPUID 0x80000001 and 0x8000000A (SVM features) */ + if (nested) { + kvm_cpu_cap_set(X86_FEATURE_SVM); + + if (nrips) + kvm_cpu_cap_set(X86_FEATURE_NRIPS); + + if (npt_enabled) + kvm_cpu_cap_set(X86_FEATURE_NPT); + } + + /* CPUID 0x80000008 */ + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || + boot_cpu_has(X86_FEATURE_AMD_SSBD)) + kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); +} + static __init int svm_hardware_setup(void) { int cpu; @@@ -1435,8 -1387,6 +1437,8 @@@ init_msrpm_offsets(); + supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@@ -1484,11 -1434,16 +1486,11 @@@ if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; - if (npt_enabled && !npt) { - printk(KERN_INFO "kvm: Nested Paging disabled\n"); + if (npt_enabled && !npt) npt_enabled = false; - } - if (npt_enabled) { - printk(KERN_INFO "kvm: Nested Paging enabled\n"); - kvm_enable_tdp(); - } else - kvm_disable_tdp(); + kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL); + pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); if (nrips) { if (!boot_cpu_has(X86_FEATURE_NRIPS)) @@@ -1524,8 -1479,6 +1526,8 @@@ pr_info("Virtual GIF supported\n"); } + svm_set_cpu_caps(); + return 0; err: @@@ -1993,6 -1946,19 +1995,6 @@@ static void __unregister_enc_region_loc kfree(region); } -static struct kvm *svm_vm_alloc(void) -{ - struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm), - GFP_KERNEL_ACCOUNT | __GFP_ZERO, - PAGE_KERNEL); - return &kvm_svm->kvm; -} - -static void svm_vm_free(struct kvm *kvm) -{ - vfree(to_kvm_svm(kvm)); -} - static void sev_vm_destroy(struct kvm *kvm) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@@ -2220,7 -2186,7 +2222,7 @@@ static void svm_vcpu_reset(struct kvm_v } init_vmcb(svm); - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true); + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false); kvm_rdx_write(vcpu, eax); if (kvm_vcpu_apicv_active(vcpu) && !init_event) @@@ -2230,8 -2196,9 +2232,9 @@@ static int avic_init_vcpu(struct vcpu_svm *svm) { int ret; + struct kvm_vcpu *vcpu = &svm->vcpu; - if (!kvm_vcpu_apicv_active(&svm->vcpu)) + if (!avic || !irqchip_in_kernel(vcpu->kvm)) return 0; ret = avic_init_backing_page(&svm->vcpu); @@@ -2453,38 -2420,14 +2456,38 @@@ static void svm_cache_reg(struct kvm_vc } } +static inline void svm_enable_vintr(struct vcpu_svm *svm) +{ + struct vmcb_control_area *control; + + /* The following fields are ignored when AVIC is enabled */ + WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu)); + + /* + * This is just a dummy VINTR to actually cause a vmexit to happen. + * Actual injection of virtual interrupts happens through EVENTINJ. + */ + control = &svm->vmcb->control; + control->int_vector = 0x0; + control->int_ctl &= ~V_INTR_PRIO_MASK; + control->int_ctl |= V_IRQ_MASK | + ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); + mark_dirty(svm->vmcb, VMCB_INTR); +} + static void svm_set_vintr(struct vcpu_svm *svm) { set_intercept(svm, INTERCEPT_VINTR); + if (is_intercept(svm, INTERCEPT_VINTR)) + svm_enable_vintr(svm); } static void svm_clear_vintr(struct vcpu_svm *svm) { clr_intercept(svm, INTERCEPT_VINTR); + + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + mark_dirty(svm->vmcb, VMCB_INTR); } static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) @@@ -3040,6 -2983,15 +3043,6 @@@ static u64 nested_svm_get_tdp_pdptr(str return pdpte; } -static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, - unsigned long root) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.nested_cr3 = __sme_set(root); - mark_dirty(svm->vmcb, VMCB_NPT); -} - static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@@ -3075,7 -3027,8 +3078,7 @@@ static void nested_svm_init_mmu_context vcpu->arch.mmu = &vcpu->arch.guest_mmu; kvm_init_shadow_mmu(vcpu); - vcpu->arch.mmu->set_cr3 = nested_svm_set_tdp_cr3; - vcpu->arch.mmu->get_cr3 = nested_svm_get_tdp_cr3; + vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu); @@@ -3136,36 -3089,43 +3139,36 @@@ static int nested_svm_check_exception(s return vmexit; } -/* This function returns true if it is save to enable the irq window */ -static inline bool nested_svm_intr(struct vcpu_svm *svm) +static void nested_svm_intr(struct vcpu_svm *svm) { - if (!is_guest_mode(&svm->vcpu)) - return true; - - if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - return true; - - if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) - return false; - - /* - * if vmexit was already requested (by intercepted exception - * for instance) do not overwrite it with "external interrupt" - * vmexit. - */ - if (svm->nested.exit_required) - return false; - svm->vmcb->control.exit_code = SVM_EXIT_INTR; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; - if (svm->nested.intercept & 1ULL) { - /* - * The #vmexit can't be emulated here directly because this - * code path runs with irqs and preemption disabled. A - * #vmexit emulation might sleep. Only signal request for - * the #vmexit here. - */ - svm->nested.exit_required = true; - trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); - return false; + /* nested_svm_vmexit this gets called afterwards from handle_exit */ + svm->nested.exit_required = true; + trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); +} + +static bool nested_exit_on_intr(struct vcpu_svm *svm) +{ + return (svm->nested.intercept & 1ULL); +} + +static int svm_check_nested_events(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + bool block_nested_events = + kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required; + + if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) { + if (block_nested_events) + return -EBUSY; + nested_svm_intr(svm); + return 0; } - return true; + return 0; } /* This function returns true if it is save to enable the nmi window */ @@@ -3284,6 -3244,9 +3287,6 @@@ static int nested_svm_exit_special(stru return NESTED_EXIT_CONTINUE; } -/* - * If this function returns true, this #vmexit was already handled - */ static int nested_svm_intercept(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; @@@ -3558,9 -3521,6 +3561,9 @@@ static bool nested_svm_vmrun_msrpm(stru static bool nested_vmcb_checks(struct vmcb *vmcb) { + if ((vmcb->save.efer & EFER_SVME) == 0) + return false; + if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) return false; @@@ -3577,10 -3537,6 +3580,10 @@@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb, struct kvm_host_map *map) { + bool evaluate_pending_interrupts = + is_intercept(svm, INTERCEPT_VINTR) || + is_intercept(svm, INTERCEPT_IRET); + if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; else @@@ -3640,6 -3596,15 +3643,6 @@@ else svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; - if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { - /* We only want the cr8 intercept bits of the guest */ - clr_cr_intercept(svm, INTERCEPT_CR8_READ); - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); - } - - /* We don't want to see VMMCALLs from a nested guest */ - clr_intercept(svm, INTERCEPT_VMMCALL); - svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset; svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; @@@ -3667,21 -3632,7 +3670,21 @@@ svm->nested.vmcb = vmcb_gpa; + /* + * If L1 had a pending IRQ/NMI before executing VMRUN, + * which wasn't delivered because it was disallowed (e.g. + * interrupts disabled), L0 needs to evaluate if this pending + * event should cause an exit from L2 to L1 or be delivered + * directly to L2. + * + * Usually this would be handled by the processor noticing an + * IRQ/NMI window request. However, VMRUN can unblock interrupts + * by implicitly setting GIF, so force L0 to perform pending event + * evaluation by requesting a KVM_REQ_EVENT. + */ enable_gif(svm); + if (unlikely(evaluate_pending_interrupts)) + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); mark_all_dirty(svm->vmcb); } @@@ -3883,8 -3834,11 +3886,8 @@@ static int clgi_interception(struct vcp disable_gif(svm); /* After a CLGI no interrupts should come */ - if (!kvm_vcpu_apicv_active(&svm->vcpu)) { + if (!kvm_vcpu_apicv_active(&svm->vcpu)) svm_clear_vintr(svm); - svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; - mark_dirty(svm->vmcb, VMCB_INTR); - } return ret; } @@@ -5170,6 -5124,19 +5173,6 @@@ static void svm_inject_nmi(struct kvm_v ++vcpu->stat.nmi_injections; } -static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) -{ - struct vmcb_control_area *control; - - /* The following fields are ignored when AVIC is enabled */ - control = &svm->vmcb->control; - control->int_vector = irq; - control->int_ctl &= ~V_INTR_PRIO_MASK; - control->int_ctl |= V_IRQ_MASK | - ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); - mark_dirty(svm->vmcb, VMCB_INTR); -} - static void svm_set_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@@ -5558,15 -5525,18 +5561,15 @@@ static int svm_interrupt_allowed(struc { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - int ret; if (!gif_set(svm) || (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) return 0; - ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); - - if (is_guest_mode(vcpu)) - return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); - - return ret; + if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)) + return !!(svm->vcpu.arch.hflags & HF_HIF_MASK); + else + return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); } static void enable_irq_window(struct kvm_vcpu *vcpu) @@@ -5581,7 -5551,7 +5584,7 @@@ * enabled, the STGI interception will not occur. Enable the irq * window under the assumption that the hardware will set the GIF. */ - if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) { + if (vgif_enabled(svm) || gif_set(svm)) { /* * IRQ window is not needed when AVIC is enabled, * unless we have pending ExtINT since it cannot be injected @@@ -5590,6 -5560,7 +5593,6 @@@ */ svm_toggle_avic_for_irq_window(vcpu, false); svm_set_vintr(svm); - svm_inject_irq(svm, 0x0); } } @@@ -5975,30 -5946,24 +5978,30 @@@ static void svm_vcpu_run(struct kvm_vcp } STACK_FRAME_NON_STANDARD(svm_vcpu_run); -static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) +static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); + bool update_guest_cr3 = true; + unsigned long cr3; - svm->vmcb->save.cr3 = __sme_set(root); - mark_dirty(svm->vmcb, VMCB_CR); -} - -static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) -{ - struct vcpu_svm *svm = to_svm(vcpu); + cr3 = __sme_set(root); + if (npt_enabled) { + svm->vmcb->control.nested_cr3 = cr3; + mark_dirty(svm->vmcb, VMCB_NPT); - svm->vmcb->control.nested_cr3 = __sme_set(root); - mark_dirty(svm->vmcb, VMCB_NPT); + /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ + if (is_guest_mode(vcpu)) + update_guest_cr3 = false; + else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + cr3 = vcpu->arch.cr3; + else /* CR3 is already up-to-date. */ + update_guest_cr3 = false; + } - /* Also sync guest cr3 here in case we live migrate */ - svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); - mark_dirty(svm->vmcb, VMCB_CR); + if (update_guest_cr3) { + svm->vmcb->save.cr3 = cr3; + mark_dirty(svm->vmcb, VMCB_CR); + } } static int is_disabled(void) @@@ -6060,19 -6025,12 +6063,19 @@@ static void svm_cpuid_update(struct kvm boot_cpu_has(X86_FEATURE_XSAVES); /* Update nrips enabled cache */ - svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); + svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && + guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); if (!kvm_vcpu_apicv_active(vcpu)) return; - guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); + /* + * AVIC does not work with an x2APIC mode guest. If the X2APIC feature + * is exposed to the guest, disable AVIC. + */ + if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) + kvm_request_apicv_update(vcpu->kvm, false, + APICV_INHIBIT_REASON_X2APIC); /* * Currently, AVIC does not work with nested virtualization. @@@ -6083,11 -6041,88 +6086,11 @@@ APICV_INHIBIT_REASON_NESTED); } -#define F feature_bit - -static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -{ - switch (func) { - case 0x1: - if (avic) - entry->ecx &= ~F(X2APIC); - break; - case 0x80000001: - if (nested) - entry->ecx |= (1 << 2); /* Set SVM bit */ - break; - case 0x80000008: - if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) - entry->ebx |= F(VIRT_SSBD); - break; - case 0x8000000A: - entry->eax = 1; /* SVM revision 1 */ - entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper - ASID emulation to nested SVM */ - entry->ecx = 0; /* Reserved */ - entry->edx = 0; /* Per default do not support any - additional features */ - - /* Support next_rip if host supports it */ - if (boot_cpu_has(X86_FEATURE_NRIPS)) - entry->edx |= F(NRIPS); - - /* Support NPT for the guest if enabled */ - if (npt_enabled) - entry->edx |= F(NPT); - - } -} - -static int svm_get_lpage_level(void) -{ - return PT_PDPE_LEVEL; -} - -static bool svm_rdtscp_supported(void) -{ - return boot_cpu_has(X86_FEATURE_RDTSCP); -} - -static bool svm_invpcid_supported(void) -{ - return false; -} - -static bool svm_mpx_supported(void) -{ - return false; -} - -static bool svm_xsaves_supported(void) -{ - return boot_cpu_has(X86_FEATURE_XSAVES); -} - -static bool svm_umip_emulated(void) -{ - return false; -} - -static bool svm_pt_supported(void) -{ - return false; -} - static bool svm_has_wbinvd_exit(void) { return true; } -static bool svm_pku_supported(void) -{ - return false; -} - #define PRE_EX(exit) { .exit_code = (exit), \ .stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ @@@ -6154,8 -6189,7 +6157,8 @@@ static const struct __x86_intercept static int svm_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, - enum x86_intercept_stage stage) + enum x86_intercept_stage stage, + struct x86_exception *exception) { struct vcpu_svm *svm = to_svm(vcpu); int vmexit, ret = X86EMUL_CONTINUE; @@@ -7339,8 -7373,7 +7342,8 @@@ static bool svm_check_apicv_inhibit_rea BIT(APICV_INHIBIT_REASON_HYPERV) | BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | - BIT(APICV_INHIBIT_REASON_PIT_REINJ); + BIT(APICV_INHIBIT_REASON_PIT_REINJ) | + BIT(APICV_INHIBIT_REASON_X2APIC); return supported & BIT(bit); } @@@ -7365,7 -7398,8 +7368,7 @@@ static struct kvm_x86_ops svm_x86_ops _ .vcpu_free = svm_free_vcpu, .vcpu_reset = svm_vcpu_reset, - .vm_alloc = svm_vm_alloc, - .vm_free = svm_vm_free, + .vm_size = sizeof(struct kvm_svm), .vm_init = svm_vm_init, .vm_destroy = svm_vm_destroy, @@@ -7387,6 -7421,7 +7390,6 @@@ .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, - .set_cr3 = svm_set_cr3, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, .get_idt = svm_get_idt, @@@ -7439,14 -7474,26 +7442,14 @@@ .get_exit_info = svm_get_exit_info, - .get_lpage_level = svm_get_lpage_level, - .cpuid_update = svm_cpuid_update, - .rdtscp_supported = svm_rdtscp_supported, - .invpcid_supported = svm_invpcid_supported, - .mpx_supported = svm_mpx_supported, - .xsaves_supported = svm_xsaves_supported, - .umip_emulated = svm_umip_emulated, - .pt_supported = svm_pt_supported, - .pku_supported = svm_pku_supported, - - .set_supported_cpuid = svm_set_supported_cpuid, - .has_wbinvd_exit = svm_has_wbinvd_exit, .read_l1_tsc_offset = svm_read_l1_tsc_offset, .write_l1_tsc_offset = svm_write_l1_tsc_offset, - .set_tdp_cr3 = set_tdp_cr3, + .load_mmu_pgd = svm_load_mmu_pgd, .check_intercept = svm_check_intercept, .handle_exit_irqoff = svm_handle_exit_irqoff, @@@ -7476,8 -7523,6 +7479,8 @@@ .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, .apic_init_signal_blocked = svm_apic_init_signal_blocked, + + .check_nested_events = svm_check_nested_events, }; static int __init svm_init(void) diff --combined arch/x86/kvm/vmx/vmx.c index 3aba51d,40b1e61..a7dd678 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@@ -64,11 -64,13 +64,13 @@@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); + #ifdef MODULE static const struct x86_cpu_id vmx_cpu_id[] = { X86_FEATURE_MATCH(X86_FEATURE_VMX), {} }; MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); + #endif bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); @@@ -433,6 -435,7 +435,6 @@@ static const struct kvm_vmx_segment_fie VMX_SEGMENT_FIELD(LDTR), }; -u64 host_efer; static unsigned long host_idt_base; /* @@@ -653,16 -656,53 +655,16 @@@ static int vmx_set_guest_msr(struct vcp return ret; } -void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) -{ - vmcs_clear(loaded_vmcs->vmcs); - if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) - vmcs_clear(loaded_vmcs->shadow_vmcs); - loaded_vmcs->cpu = -1; - loaded_vmcs->launched = 0; -} - #ifdef CONFIG_KEXEC_CORE -/* - * This bitmap is used to indicate whether the vmclear - * operation is enabled on all cpus. All disabled by - * default. - */ -static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; - -static inline void crash_enable_local_vmclear(int cpu) -{ - cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline void crash_disable_local_vmclear(int cpu) -{ - cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline int crash_local_vmclear_enabled(int cpu) -{ - return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - static void crash_vmclear_local_loaded_vmcss(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; - if (!crash_local_vmclear_enabled(cpu)) - return; - list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) vmcs_clear(v->vmcs); } -#else -static inline void crash_enable_local_vmclear(int cpu) { } -static inline void crash_disable_local_vmclear(int cpu) { } #endif /* CONFIG_KEXEC_CORE */ static void __loaded_vmcs_clear(void *arg) @@@ -674,24 -714,19 +676,24 @@@ return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; - crash_disable_local_vmclear(cpu); + + vmcs_clear(loaded_vmcs->vmcs); + if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) + vmcs_clear(loaded_vmcs->shadow_vmcs); + list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); /* - * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link - * is before setting loaded_vmcs->vcpu to -1 which is done in - * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist - * then adds the vmcs into percpu list before it is deleted. + * Ensure all writes to loaded_vmcs, including deleting it from its + * current percpu list, complete before setting loaded_vmcs->vcpu to + * -1, otherwise a different cpu can see vcpu == -1 first and add + * loaded_vmcs to its percpu list before it's deleted from this cpu's + * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). */ smp_wmb(); - loaded_vmcs_init(loaded_vmcs); - crash_enable_local_vmclear(cpu); + loaded_vmcs->cpu = -1; + loaded_vmcs->launched = 0; } void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) @@@ -775,7 -810,7 +777,7 @@@ void update_exception_bitmap(struct kvm if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) - eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + eb &= ~(1u << PF_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a * certain exception bitmap, we must trap the same exceptions and pass @@@ -1026,7 -1061,7 +1028,7 @@@ static unsigned long segment_base(u16 s static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) { - return (pt_mode == PT_MODE_HOST_GUEST) && + return vmx_pt_mode_is_host_guest() && !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); } @@@ -1060,7 -1095,7 +1062,7 @@@ static inline void pt_save_msr(struct p static void pt_guest_enter(struct vcpu_vmx *vmx) { - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) return; /* @@@ -1077,7 -1112,7 +1079,7 @@@ static void pt_guest_exit(struct vcpu_vmx *vmx) { - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) return; if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { @@@ -1310,17 -1345,18 +1312,17 @@@ void vmx_vcpu_load_vmcs(struct kvm_vcp if (!already_loaded) { loaded_vmcs_clear(vmx->loaded_vmcs); local_irq_disable(); - crash_disable_local_vmclear(cpu); /* - * Read loaded_vmcs->cpu should be before fetching - * loaded_vmcs->loaded_vmcss_on_cpu_link. - * See the comments in __loaded_vmcs_clear(). + * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to + * this cpu's percpu list, otherwise it may not yet be deleted + * from its previous cpu's percpu list. Pairs with the + * smb_wmb() in __loaded_vmcs_clear(). */ smp_rmb(); list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); - crash_enable_local_vmclear(cpu); local_irq_enable(); } @@@ -1653,6 -1689,16 +1655,6 @@@ static void vmx_queue_exception(struct vmx_clear_hlt(vcpu); } -static bool vmx_rdtscp_supported(void) -{ - return cpu_has_vmx_rdtscp(); -} - -static bool vmx_invpcid_supported(void) -{ - return cpu_has_vmx_invpcid(); -} - /* * Swap MSR entry in host/guest MSR entry array. */ @@@ -1860,24 -1906,24 +1862,24 @@@ static int vmx_get_msr(struct kvm_vcpu &msr_info->data); break; case MSR_IA32_RTIT_CTL: - if (pt_mode != PT_MODE_HOST_GUEST) + if (!vmx_pt_mode_is_host_guest()) return 1; msr_info->data = vmx->pt_desc.guest.ctl; break; case MSR_IA32_RTIT_STATUS: - if (pt_mode != PT_MODE_HOST_GUEST) + if (!vmx_pt_mode_is_host_guest()) return 1; msr_info->data = vmx->pt_desc.guest.status; break; case MSR_IA32_RTIT_CR3_MATCH: - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) return 1; msr_info->data = vmx->pt_desc.guest.cr3_match; break; case MSR_IA32_RTIT_OUTPUT_BASE: - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, @@@ -1886,7 -1932,7 +1888,7 @@@ msr_info->data = vmx->pt_desc.guest.output_base; break; case MSR_IA32_RTIT_OUTPUT_MASK: - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, @@@ -1896,7 -1942,7 +1898,7 @@@ break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_num_address_ranges))) return 1; @@@ -2102,7 -2148,7 +2104,7 @@@ static int vmx_set_msr(struct kvm_vcpu return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_RTIT_CTL: - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || vmx_rtit_ctl_check(vcpu, data) || vmx->nested.vmxon) return 1; @@@ -2218,33 -2264,18 +2220,33 @@@ static __init int vmx_disabled_by_bios( !boot_cpu_has(X86_FEATURE_VMX); } -static void kvm_cpu_vmxon(u64 addr) +static int kvm_cpu_vmxon(u64 vmxon_pointer) { + u64 msr; + cr4_set_bits(X86_CR4_VMXE); intel_pt_handle_vmx(1); - asm volatile ("vmxon %0" : : "m"(addr)); + asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + _ASM_EXTABLE(1b, %l[fault]) + : : [vmxon_pointer] "m"(vmxon_pointer) + : : fault); + return 0; + +fault: + WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", + rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + intel_pt_handle_vmx(0); + cr4_clear_bits(X86_CR4_VMXE); + + return -EFAULT; } static int hardware_enable(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + int r; if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; @@@ -2261,10 -2292,18 +2263,10 @@@ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - /* - * Now we can enable the vmclear operation in kdump - * since the loaded_vmcss_on_cpu list on this cpu - * has been initialized. - * - * Though the cpu is not in VMX operation now, there - * is no problem to enable the vmclear operation - * for the loaded_vmcss_on_cpu list is empty! - */ - crash_enable_local_vmclear(cpu); + r = kvm_cpu_vmxon(phys_addr); + if (r) + return r; - kvm_cpu_vmxon(phys_addr); if (enable_ept) ept_sync_global(); @@@ -2564,12 -2603,9 +2566,12 @@@ int alloc_loaded_vmcs(struct loaded_vmc if (!loaded_vmcs->vmcs) return -ENOMEM; + vmcs_clear(loaded_vmcs->vmcs); + loaded_vmcs->shadow_vmcs = NULL; loaded_vmcs->hv_timer_soft_disabled = false; - loaded_vmcs_init(loaded_vmcs); + loaded_vmcs->cpu = -1; + loaded_vmcs->launched = 0; if (cpu_has_vmx_msr_bitmap()) { loaded_vmcs->msr_bitmap = (unsigned long *) @@@ -2951,8 -2987,9 +2953,8 @@@ void vmx_set_cr0(struct kvm_vcpu *vcpu static int get_ept_level(struct kvm_vcpu *vcpu) { - /* Nested EPT currently only supports 4-level walks. */ if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) - return 4; + return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu)); if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) return 5; return 4; @@@ -2972,7 -3009,7 +2974,7 @@@ u64 construct_eptp(struct kvm_vcpu *vcp return eptp; } -void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; @@@ -3989,7 -4026,7 +3991,7 @@@ static void vmx_compute_secondary_exec_ u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); if (!cpu_need_virtualize_apic_accesses(vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; @@@ -4044,7 -4081,7 +4046,7 @@@ } } - if (vmx_rdtscp_supported()) { + if (cpu_has_vmx_rdtscp()) { bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); if (!rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; @@@ -4059,7 -4096,7 +4061,7 @@@ } } - if (vmx_invpcid_supported()) { + if (cpu_has_vmx_invpcid()) { /* Exposing INVPCID only when PCID is exposed */ bool invpcid_enabled = guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && @@@ -4230,7 -4267,7 +4232,7 @@@ static void init_vmcs(struct vcpu_vmx * if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); - if (pt_mode == PT_MODE_HOST_GUEST) { + if (vmx_pt_mode_is_host_guest()) { memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); /* Bit[6~0] are forced to 1, writes are ignored. */ vmx->pt_desc.guest.output_mask = 0x7F; @@@ -4458,13 -4495,8 +4460,13 @@@ static int vmx_nmi_allowed(struct kvm_v static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - return (!to_vmx(vcpu)->nested.nested_run_pending && - vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + if (to_vmx(vcpu)->nested.nested_run_pending) + return false; + + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + return true; + + return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@@ -4520,6 -4552,7 +4522,6 @@@ static bool rmode_exception(struct kvm_ case GP_VECTOR: case MF_VECTOR: return true; - break; } return false; } @@@ -5296,6 -5329,7 +5298,6 @@@ static void vmx_enable_tdp(void VMX_EPT_RWX_MASK, 0ull); ept_set_mmio_spte_mask(); - kvm_enable_tdp(); } /* @@@ -5828,23 -5862,8 +5830,23 @@@ static int vmx_handle_exit(struct kvm_v if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); - if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) - return nested_vmx_reflect_vmexit(vcpu, exit_reason); + if (is_guest_mode(vcpu)) { + /* + * The host physical addresses of some pages of guest memory + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC + * Page). The CPU may write to these pages via their host + * physical address while L2 is running, bypassing any + * address-translation-based dirty tracking (e.g. EPT write + * protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. + */ + nested_mark_vmcs12_pages_dirty(vcpu); + + if (nested_vmx_exit_reflected(vcpu, exit_reason)) + return nested_vmx_reflect_vmexit(vcpu, exit_reason); + } if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { dump_vmcs(); @@@ -6204,13 -6223,15 +6206,13 @@@ static void handle_exception_nmi_irqoff vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* if exit due to PF check for async PF */ - if (is_page_fault(vmx->exit_intr_info)) + if (is_page_fault(vmx->exit_intr_info)) { vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); - /* Handle machine checks before interrupts are enabled */ - if (is_machine_check(vmx->exit_intr_info)) + } else if (is_machine_check(vmx->exit_intr_info)) { kvm_machine_check(); - /* We need to handle NMIs before interrupts are enabled */ - if (is_nmi(vmx->exit_intr_info)) { + } else if (is_nmi(vmx->exit_intr_info)) { kvm_before_interrupt(&vmx->vcpu); asm("int $2"); kvm_after_interrupt(&vmx->vcpu); @@@ -6296,6 -6317,11 +6298,6 @@@ static bool vmx_has_emulated_msr(int in } } -static bool vmx_pt_supported(void) -{ - return pt_mode == PT_MODE_HOST_GUEST; -} - static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@@ -6541,8 -6567,7 +6543,8 @@@ static void vmx_vcpu_run(struct kvm_vcp pt_guest_enter(vmx); - atomic_switch_perf_msrs(vmx); + if (vcpu_to_pmu(vcpu)->version) + atomic_switch_perf_msrs(vmx); atomic_switch_umwait_control_msr(vmx); if (enable_preemption_timer) @@@ -6659,6 -6684,20 +6661,6 @@@ vmx_complete_interrupts(vmx); } -static struct kvm *vmx_vm_alloc(void) -{ - struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx), - GFP_KERNEL_ACCOUNT | __GFP_ZERO, - PAGE_KERNEL); - return &kvm_vmx->kvm; -} - -static void vmx_vm_free(struct kvm *kvm) -{ - kfree(kvm->arch.hyperv.hv_pa_pg); - vfree(to_kvm_vmx(kvm)); -} - static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@@ -6861,24 -6900,17 +6863,24 @@@ static u64 vmx_get_mt_mask(struct kvm_v u8 cache; u64 ipat = 0; - /* For VT-d and EPT combination - * 1. MMIO: always map as UC - * 2. EPT with VT-d: - * a. VT-d without snooping control feature: can't guarantee the - * result, try to trust guest. - * b. VT-d with snooping control feature: snooping control feature of - * VT-d engine can guarantee the cache correctness. Just set it - * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep - * consistent with host MTRR + /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in + * memory aliases with conflicting memory types and sometimes MCEs. + * We have to be careful as to what are honored and when. + * + * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to + * UC. The effective memory type is UC or WC depending on guest PAT. + * This was historically the source of MCEs and we want to be + * conservative. + * + * When there is no need to deal with noncoherent DMA (e.g., no VT-d + * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The + * EPT memory type is set to WB. The effective memory type is forced + * WB. + * + * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The + * EPT memory type is used to emulate guest CD/MTRR. */ + if (is_mmio) { cache = MTRR_TYPE_UNCACHABLE; goto exit; @@@ -6905,6 -6937,15 +6907,6 @@@ exit return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } -static int vmx_get_lpage_level(void) -{ - if (enable_ept && !cpu_has_vmx_ept_1g_page()) - return PT_DIRECTORY_LEVEL; - else - /* For shadow and EPT supported 1GB page */ - return PT_PDPE_LEVEL; -} - static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) { /* @@@ -7095,37 -7136,10 +7097,37 @@@ static void vmx_cpuid_update(struct kvm } } -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +static __init void vmx_set_cpu_caps(void) { - if (func == 1 && nested) - entry->ecx |= feature_bit(VMX); + kvm_set_cpu_caps(); + + /* CPUID 0x1 */ + if (nested) + kvm_cpu_cap_set(X86_FEATURE_VMX); + + /* CPUID 0x7 */ + if (kvm_mpx_supported()) + kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); + if (cpu_has_vmx_invpcid()) + kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); + if (vmx_pt_mode_is_host_guest()) + kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + + /* PKU is not yet implemented for shadow paging. */ + if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE)) + kvm_cpu_cap_check_and_set(X86_FEATURE_PKU); + + if (vmx_umip_emulated()) + kvm_cpu_cap_set(X86_FEATURE_UMIP); + + /* CPUID 0xD.1 */ + supported_xss = 0; + if (!vmx_xsaves_supported()) + kvm_cpu_cap_clear(X86_FEATURE_XSAVES); + + /* CPUID 0x80000001 */ + if (!cpu_has_vmx_rdtscp()) + kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) @@@ -7163,15 -7177,16 +7165,16 @@@ static int vmx_check_intercept_io(struc else intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); + /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; } static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, - enum x86_intercept_stage stage) + enum x86_intercept_stage stage, + struct x86_exception *exception) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; switch (info->intercept) { /* @@@ -7180,8 -7195,8 +7183,8 @@@ */ case x86_intercept_rdtscp: if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { - ctxt->exception.vector = UD_VECTOR; - ctxt->exception.error_code_valid = false; + exception->vector = UD_VECTOR; + exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; } break; @@@ -7192,6 -7207,20 +7195,20 @@@ case x86_intercept_outs: return vmx_check_intercept_io(vcpu, info); + case x86_intercept_lgdt: + case x86_intercept_lidt: + case x86_intercept_lldt: + case x86_intercept_ltr: + case x86_intercept_sgdt: + case x86_intercept_sidt: + case x86_intercept_sldt: + case x86_intercept_str: + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) + return X86EMUL_CONTINUE; + + /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ + break; + /* TODO: check more intercepts... */ default: break; @@@ -7278,8 -7307,7 +7295,8 @@@ static void vmx_sched_in(struct kvm_vcp static void vmx_slot_enable_log_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_mmu_slot_leaf_clear_dirty(kvm, slot); + if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); kvm_mmu_slot_largepage_remove_write_access(kvm, slot); } @@@ -7633,7 -7661,9 +7650,7 @@@ static __init int hardware_setup(void { unsigned long host_bndcfgs; struct desc_ptr dt; - int r, i; - - rdmsrl_safe(MSR_EFER, &host_efer); + int r, i, ept_lpage_level; store_idt(&dt); host_idt_base = dt.address; @@@ -7652,10 -7682,6 +7669,10 @@@ WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); } + if (!cpu_has_vmx_mpx()) + supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; @@@ -7689,6 -7715,9 +7706,6 @@@ if (!cpu_has_vmx_tpr_shadow()) kvm_x86_ops->update_cr8_intercept = NULL; - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); - #if IS_ENABLED(CONFIG_HYPERV) if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH && enable_ept) { @@@ -7721,16 -7750,8 +7738,16 @@@ if (enable_ept) vmx_enable_tdp(); + + if (!enable_ept) + ept_lpage_level = 0; + else if (cpu_has_vmx_ept_1g_page()) + ept_lpage_level = PT_PDPE_LEVEL; + else if (cpu_has_vmx_ept_2m_page()) + ept_lpage_level = PT_DIRECTORY_LEVEL; else - kvm_disable_tdp(); + ept_lpage_level = PT_PAGE_TABLE_LEVEL; + kvm_configure_mmu(enable_ept, ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT @@@ -7794,8 -7815,6 +7811,8 @@@ return r; } + vmx_set_cpu_caps(); + r = alloc_kvm_area(); if (r) nested_vmx_hardware_unsetup(); @@@ -7829,8 -7848,9 +7846,8 @@@ static struct kvm_x86_ops vmx_x86_ops _ .cpu_has_accelerated_tpr = report_flexpriority, .has_emulated_msr = vmx_has_emulated_msr, + .vm_size = sizeof(struct kvm_vmx), .vm_init = vmx_vm_init, - .vm_alloc = vmx_vm_alloc, - .vm_free = vmx_vm_free, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, @@@ -7852,6 -7872,7 +7869,6 @@@ .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, - .set_cr3 = vmx_set_cr3, .set_cr4 = vmx_set_cr4, .set_efer = vmx_set_efer, .get_idt = vmx_get_idt, @@@ -7907,17 -7928,29 +7924,17 @@@ .get_exit_info = vmx_get_exit_info, - .get_lpage_level = vmx_get_lpage_level, - .cpuid_update = vmx_cpuid_update, - .rdtscp_supported = vmx_rdtscp_supported, - .invpcid_supported = vmx_invpcid_supported, - - .set_supported_cpuid = vmx_set_supported_cpuid, - .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .read_l1_tsc_offset = vmx_read_l1_tsc_offset, .write_l1_tsc_offset = vmx_write_l1_tsc_offset, - .set_tdp_cr3 = vmx_set_cr3, + .load_mmu_pgd = vmx_load_mmu_pgd, .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .mpx_supported = vmx_mpx_supported, - .xsaves_supported = vmx_xsaves_supported, - .umip_emulated = vmx_umip_emulated, - .pt_supported = vmx_pt_supported, - .pku_supported = vmx_pku_supported, .request_immediate_exit = vmx_request_immediate_exit, diff --combined arch/x86/kvm/x86.c index 6fa014c,5de2006..1b6d9ac --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@@ -22,7 -22,6 +22,7 @@@ #include "i8254.h" #include "tss.h" #include "kvm_cache_regs.h" +#include "kvm_emulate.h" #include "x86.h" #include "cpuid.h" #include "pmu.h" @@@ -82,7 -81,7 +82,7 @@@ u64 __read_mostly kvm_mce_cap_supporte EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ - container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) + ((struct kvm_vcpu *)(ctxt)->vcpu) /* EFER defaults: * - enable syscall per default because its emulated by KVM @@@ -181,17 -180,7 +181,17 @@@ struct kvm_shared_msrs static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; static struct kvm_shared_msrs __percpu *shared_msrs; +#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ + | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ + | XFEATURE_MASK_PKRU) + +u64 __read_mostly host_efer; +EXPORT_SYMBOL_GPL(host_efer); + static u64 __read_mostly host_xss; +u64 __read_mostly supported_xss; +EXPORT_SYMBOL_GPL(supported_xss); struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, @@@ -237,25 -226,10 +237,25 @@@ }; u64 __read_mostly host_xcr0; +u64 __read_mostly supported_xcr0; +EXPORT_SYMBOL_GPL(supported_xcr0); struct kmem_cache *x86_fpu_cache; EXPORT_SYMBOL_GPL(x86_fpu_cache); +static struct kmem_cache *x86_emulator_cache; + +static struct kmem_cache *kvm_alloc_emulator_cache(void) +{ + unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); + unsigned int size = sizeof(struct x86_emulate_ctxt); + + return kmem_cache_create_usercopy("x86_emulator", size, + __alignof__(struct x86_emulate_ctxt), + SLAB_ACCOUNT, useroffset, + size - useroffset, NULL); +} + static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) @@@ -376,7 -350,6 +376,7 @@@ int kvm_set_apic_base(struct kvm_vcpu * } kvm_lapic_set_base(vcpu, msr_info->data); + kvm_recalculate_apic_map(vcpu->kvm); return 0; } EXPORT_SYMBOL_GPL(kvm_set_apic_base); @@@ -930,10 -903,10 +930,10 @@@ static u64 kvm_host_cr4_reserved_bits(s { u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); - if (cpuid_ecx(0x7) & feature_bit(LA57)) + if (kvm_cpu_cap_has(X86_FEATURE_LA57)) reserved_bits &= ~X86_CR4_LA57; - if (kvm_x86_ops->umip_emulated()) + if (kvm_cpu_cap_has(X86_FEATURE_UMIP)) reserved_bits &= ~X86_CR4_UMIP; return reserved_bits; @@@ -1585,12 -1558,8 +1585,12 @@@ static int handle_fastpath_set_x2apic_i ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) { + data &= ~(1 << 12); + kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); - return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data); + kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); + trace_kvm_apic_write(APIC_ICR, (u32)data); + return 0; } return 1; @@@ -1599,12 -1568,11 +1599,12 @@@ enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); - u64 data = kvm_read_edx_eax(vcpu); + u64 data; int ret = 0; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): + data = kvm_read_edx_eax(vcpu); ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); break; default: @@@ -2555,7 -2523,7 +2555,7 @@@ static void kvmclock_sync_fn(struct wor static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd(vcpu)) + if (guest_cpuid_is_amd_or_hygon(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; @@@ -2830,11 -2798,12 +2830,11 @@@ int kvm_set_msr_common(struct kvm_vcpu !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return 1; /* - * We do support PT if kvm_x86_ops->pt_supported(), but we do - * not support IA32_XSS[bit 8]. Guests will have to use - * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT - * MSRs. + * KVM supports exposing PT to the guest, but does not support + * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than + * XSAVES/XRSTORS to save/restore PT MSRs. */ - if (data != 0) + if (data & ~supported_xss) return 1; vcpu->arch.ia32_xss = data; break; @@@ -3108,6 -3077,7 +3108,6 @@@ int kvm_get_msr_common(struct kvm_vcpu break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - break; case MSR_IA32_TSCDEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; @@@ -3190,6 -3160,7 +3190,6 @@@ return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); - break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current * silicon. It is however accessed by winxp in very narrow @@@ -3493,7 -3464,7 +3493,7 @@@ long kvm_arch_dev_ioctl(struct file *fi r = 0; break; } - case KVM_X86_GET_MCE_CAP_SUPPORTED: { + case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; if (copy_to_user(argp, &kvm_mce_cap_supported, sizeof(kvm_mce_cap_supported))) @@@ -3525,9 -3496,9 +3525,9 @@@ case KVM_GET_MSRS: r = msr_io(NULL, argp, do_get_msr_feature, 1); break; - } default: r = -EINVAL; + break; } out: return r; @@@ -4130,7 -4101,8 +4130,7 @@@ static int kvm_vcpu_ioctl_x86_set_xsave * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. */ - if (xstate_bv & ~kvm_supported_xcr0() || - mxcsr & ~mxcsr_feature_mask) + if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { @@@ -4789,13 -4761,77 +4789,13 @@@ static int kvm_vm_ioctl_reinject(struc return 0; } -/** - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot - * @kvm: kvm instance - * @log: slot id and address to which we copy the log - * - * Steps 1-4 below provide general overview of dirty page logging. See - * kvm_get_dirty_log_protect() function description for additional details. - * - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we - * always flush the TLB (step 4) even if previous step failed and the dirty - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API - * does not preclude user space subsequent dirty log read. Flushing TLB ensures - * writes will be marked dirty for next log read. - * - * 1. Take a snapshot of the bit and clear it if needed. - * 2. Write protect the corresponding page. - * 3. Copy the snapshot to the userspace. - * 4. Flush TLB's if needed. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - /* * Flush potentially hardware-cached dirty pages to dirty_bitmap. */ if (kvm_x86_ops->flush_log_dirty) kvm_x86_ops->flush_log_dirty(kvm); - - r = kvm_get_dirty_log_protect(kvm, log, &flush); - - /* - * All the TLBs can be flushed out of mmu lock, see the comments in - * kvm_mmu_slot_remove_write_access(). - */ - lockdep_assert_held(&kvm->slots_lock); - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; -} - -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) -{ - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - - /* - * Flush potentially hardware-cached dirty pages to dirty_bitmap. - */ - if (kvm_x86_ops->flush_log_dirty) - kvm_x86_ops->flush_log_dirty(kvm); - - r = kvm_clear_dirty_log_protect(kvm, log, &flush); - - /* - * All the TLBs can be flushed out of mmu lock, see the comments in - * kvm_mmu_slot_remove_write_access(). - */ - lockdep_assert_held(&kvm->slots_lock); - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; } int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, @@@ -5224,28 -5260,28 +5224,28 @@@ static void kvm_init_msr_list(void continue; break; case MSR_TSC_AUX: - if (!kvm_x86_ops->rdtscp_supported()) + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) continue; break; case MSR_IA32_RTIT_CTL: case MSR_IA32_RTIT_STATUS: - if (!kvm_x86_ops->pt_supported()) + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) continue; break; case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) continue; break; case MSR_IA32_RTIT_OUTPUT_BASE: case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) continue; break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: { - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; @@@ -5702,7 -5738,7 +5702,7 @@@ static int emulator_read_write_onepage( int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; /* * If the exit was due to a NPF we may already have a GPA. @@@ -5711,9 -5747,10 +5711,9 @@@ * operation using rep will only have the initial GPA from the NPF * occurred. */ - if (vcpu->arch.gpa_available && - emulator_can_use_gpa(ctxt) && - (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) { - gpa = vcpu->arch.gpa_val; + if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) && + (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) { + gpa = ctxt->gpa_val; ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write); } else { ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@@ -5933,9 -5970,11 +5933,9 @@@ static int emulator_pio_in_out(struct k return 0; } -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int ret; if (vcpu->arch.pio.count) @@@ -5955,30 -5994,17 +5955,30 @@@ data_avail return 0; } -static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, - const void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); + +} +static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, const void *val, + unsigned int count) +{ memcpy(vcpu->arch.pio_data, val, size * count); trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, + const void *val, unsigned int count) +{ + return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count); +} + static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { return kvm_x86_ops->get_segment_base(vcpu, seg); @@@ -6241,15 -6267,13 +6241,15 @@@ static int emulator_intercept(struct x8 struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); + return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage, + &ctxt->exception); } static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, + bool exact_only) { - return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit); + return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only); } static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) @@@ -6376,7 -6400,7 +6376,7 @@@ static void toggle_interruptibility(str static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) return kvm_propagate_fault(vcpu, &ctxt->exception); @@@ -6388,31 -6412,13 +6388,31 @@@ return false; } +static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt; + + ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT); + if (!ctxt) { + pr_err("kvm: failed to allocate vcpu's emulator\n"); + return NULL; + } + + ctxt->vcpu = vcpu; + ctxt->ops = &emulate_ops; + vcpu->arch.emulate_ctxt = ctxt; + + return ctxt; +} + static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int cs_db, cs_l; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + ctxt->gpa_available = false; ctxt->eflags = kvm_get_rflags(vcpu); ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; @@@ -6432,7 -6438,7 +6432,7 @@@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); @@@ -6488,11 -6494,10 +6488,11 @@@ static bool reexecute_instruction(struc gpa_t gpa = cr2_or_gpa; kvm_pfn_t pfn; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu))) + if (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; if (!vcpu->arch.mmu->direct_map) { @@@ -6580,11 -6585,10 +6580,11 @@@ static bool retry_instruction(struct x8 */ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu))) + if (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; if (x86_page_table_writing_insn(ctxt)) @@@ -6747,7 -6751,7 +6747,7 @@@ int x86_emulate_instruction(struct kvm_ int emulation_type, void *insn, int insn_len) { int r; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; @@@ -6837,19 -6841,8 +6837,19 @@@ } restart: - /* Save the faulting GPA (cr2) in the address field */ - ctxt->exception.address = cr2_or_gpa; + if (emulation_type & EMULTYPE_PF) { + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2_or_gpa; + + /* With shadow page tables, cr2 contains a GVA or nGPA. */ + if (vcpu->arch.mmu->direct_map) { + ctxt->gpa_available = true; + ctxt->gpa_val = cr2_or_gpa; + } + } else { + /* Sanitize the address out of an abundance of paranoia. */ + ctxt->exception.address = 0; + } r = x86_emulate_insn(ctxt); @@@ -6950,8 -6943,8 +6950,8 @@@ static int kvm_fast_pio_out(struct kvm_ unsigned short port) { unsigned long val = kvm_rax_read(vcpu); - int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, - size, port, &val, 1); + int ret = emulator_pio_out(vcpu, size, port, &val, 1); + if (ret) return ret; @@@ -6987,10 -6980,11 +6987,10 @@@ static int complete_fast_pio_in(struct val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform + * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform * the copy and tracing */ - emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, - vcpu->arch.pio.port, &val, 1); + emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); @@@ -7005,7 -6999,8 +7005,7 @@@ static int kvm_fast_pio_in(struct kvm_v /* For size less than 4 we merge, else we zero extend */ val = (size < 4) ? kvm_rax_read(vcpu) : 0; - ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, - &val, 1); + ret = emulator_pio_in(vcpu, size, port, &val, 1); if (ret) { kvm_rax_write(vcpu, val); return ret; @@@ -7195,15 -7190,15 +7195,15 @@@ static void kvm_timer_init(void if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { #ifdef CONFIG_CPU_FREQ - struct cpufreq_policy policy; + struct cpufreq_policy *policy; int cpu; - memset(&policy, 0, sizeof(policy)); cpu = get_cpu(); - cpufreq_get_policy(&policy, cpu); - if (policy.cpuinfo.max_freq) - max_tsc_khz = policy.cpuinfo.max_freq; + policy = cpufreq_cpu_get(cpu); + if (policy && policy->cpuinfo.max_freq) + max_tsc_khz = policy->cpuinfo.max_freq; put_cpu(); + cpufreq_cpu_put(policy); #endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); @@@ -7313,12 -7308,12 +7313,12 @@@ int kvm_arch_init(void *opaque } if (!ops->cpu_has_kvm_support()) { - printk(KERN_ERR "kvm: no hardware support\n"); + pr_err_ratelimited("kvm: no hardware support\n"); r = -EOPNOTSUPP; goto out; } if (ops->disabled_by_bios()) { - printk(KERN_ERR "kvm: disabled by bios\n"); + pr_err_ratelimited("kvm: disabled by bios\n"); r = -EOPNOTSUPP; goto out; } @@@ -7343,16 -7338,10 +7343,16 @@@ goto out; } + x86_emulator_cache = kvm_alloc_emulator_cache(); + if (!x86_emulator_cache) { + pr_err("kvm: failed to allocate cache for x86 emulator\n"); + goto out_free_x86_fpu_cache; + } + shared_msrs = alloc_percpu(struct kvm_shared_msrs); if (!shared_msrs) { printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); - goto out_free_x86_fpu_cache; + goto out_free_x86_emulator_cache; } r = kvm_mmu_module_init(); @@@ -7368,10 -7357,8 +7368,10 @@@ perf_register_guest_info_callbacks(&kvm_guest_cbs); - if (boot_cpu_has(X86_FEATURE_XSAVE)) + if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; + } kvm_lapic_init(); if (pi_inject_timer == -1) @@@ -7387,8 -7374,6 +7387,8 @@@ out_free_percpu: free_percpu(shared_msrs); +out_free_x86_emulator_cache: + kmem_cache_destroy(x86_emulator_cache); out_free_x86_fpu_cache: kmem_cache_destroy(x86_fpu_cache); out: @@@ -7646,7 -7631,7 +7646,7 @@@ static void update_cr8_intercept(struc kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) +static int inject_pending_event(struct kvm_vcpu *vcpu) { int r; @@@ -7682,7 -7667,7 +7682,7 @@@ * from L2 to L1. */ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + r = kvm_x86_ops->check_nested_events(vcpu); if (r != 0) return r; } @@@ -7744,7 -7729,7 +7744,7 @@@ * KVM_REQ_EVENT only on certain events and not unconditionally? */ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + r = kvm_x86_ops->check_nested_events(vcpu); if (r != 0) return r; } @@@ -8054,26 -8039,19 +8054,26 @@@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv */ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { + unsigned long old, new, expected; + if (!kvm_x86_ops->check_apicv_inhibit_reasons || !kvm_x86_ops->check_apicv_inhibit_reasons(bit)) return; - if (activate) { - if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) || - !kvm_apicv_activated(kvm)) - return; - } else { - if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) || - kvm_apicv_activated(kvm)) - return; - } + old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); + do { + expected = new = old; + if (activate) + __clear_bit(bit, &new); + else + __set_bit(bit, &new); + if (new == old) + break; + old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new); + } while (old != expected); + + if (!!old == !!new) + return; trace_kvm_apicv_update_request(activate, bit); if (kvm_x86_ops->pre_update_apicv_exec_ctrl) @@@ -8198,8 -8176,8 +8198,8 @@@ static int vcpu_enter_guest(struct kvm_ } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); - if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu)) - kvm_mmu_load_cr3(vcpu); + if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) + kvm_mmu_load_pgd(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb(vcpu, true); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { @@@ -8284,7 -8262,7 +8284,7 @@@ goto out; } - if (inject_pending_event(vcpu, req_int_win) != 0) + if (inject_pending_event(vcpu) != 0) req_immediate_exit = true; else { /* Enable SMI/NMI/IRQ window open exits if needed. @@@ -8465,6 -8443,7 +8465,6 @@@ if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); - vcpu->arch.gpa_available = false; r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath); return r; @@@ -8505,6 -8484,7 +8505,6 @@@ static inline int vcpu_block(struct kv break; default: return -EINTR; - break; } return 1; } @@@ -8512,7 -8492,7 +8512,7 @@@ static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) - kvm_x86_ops->check_nested_events(vcpu, false); + kvm_x86_ops->check_nested_events(vcpu); return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); @@@ -8773,7 -8753,7 +8773,7 @@@ static void __get_regs(struct kvm_vcpu * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ - emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); + emulator_writeback_register_cache(vcpu->arch.emulate_ctxt); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } regs->rax = kvm_rax_read(vcpu); @@@ -8959,7 -8939,7 +8959,7 @@@ out int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); @@@ -9291,6 -9271,7 +9291,6 @@@ int kvm_arch_vcpu_create(struct kvm_vcp struct page *page; int r; - vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else @@@ -9328,14 -9309,11 +9328,14 @@@ GFP_KERNEL_ACCOUNT)) goto fail_free_mce_banks; + if (!alloc_emulate_ctxt(vcpu)) + goto free_wbinvd_dirty_mask; + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu->arch.user_fpu) { pr_err("kvm: failed to allocate userspace's fpu\n"); - goto free_wbinvd_dirty_mask; + goto free_emulate_ctxt; } vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, @@@ -9377,8 -9355,6 +9377,8 @@@ free_guest_fpu kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); free_user_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); +free_emulate_ctxt: + kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_wbinvd_dirty_mask: free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: @@@ -9413,9 -9389,11 +9413,9 @@@ void kvm_arch_vcpu_postcreate(struct kv mutex_unlock(&vcpu->mutex); - if (!kvmclock_periodic_sync) - return; - - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); + if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) @@@ -9429,7 -9407,6 +9429,7 @@@ kvm_x86_ops->vcpu_free(vcpu); + kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); @@@ -9630,18 -9607,10 +9630,18 @@@ int kvm_arch_hardware_setup(void { int r; + rdmsrl_safe(MSR_EFER, &host_efer); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + r = kvm_x86_ops->hardware_setup(); if (r != 0) return r; + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + supported_xss = 0; + cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); if (kvm_has_tsc_control) { @@@ -9658,6 -9627,9 +9658,6 @@@ kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; } - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - kvm_init_msr_list(); return 0; } @@@ -9705,13 -9677,6 +9705,13 @@@ void kvm_arch_sched_in(struct kvm_vcpu kvm_x86_ops->sched_in(vcpu, cpu); } +void kvm_arch_free_vm(struct kvm *kvm) +{ + kfree(kvm->arch.hyperv.hv_pa_pg); + vfree(kvm); +} + + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { if (type) @@@ -9794,9 -9759,9 +9794,9 @@@ void kvm_arch_sync_events(struct kvm *k int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; - unsigned long hva; + unsigned long hva, uninitialized_var(old_npages); struct kvm_memslots *slots = kvm_memslots(kvm); - struct kvm_memory_slot *slot, old; + struct kvm_memory_slot *slot; /* Called with kvm->slots_lock held. */ if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) @@@ -9804,7 -9769,7 +9804,7 @@@ slot = id_to_memslot(slots, id); if (size) { - if (slot->npages) + if (slot && slot->npages) return -EEXIST; /* @@@ -9816,18 -9781,13 +9816,18 @@@ if (IS_ERR((void *)hva)) return PTR_ERR((void *)hva); } else { - if (!slot->npages) + if (!slot || !slot->npages) return 0; - hva = 0; + /* + * Stuff a non-canonical value to catch use-after-delete. This + * ends up being 0 on 32-bit KVM, but there's no better + * alternative. + */ + hva = (unsigned long)(0xdeadull << 48); + old_npages = slot->npages; } - old = *slot; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { struct kvm_userspace_memory_region m; @@@ -9842,7 -9802,7 +9842,7 @@@ } if (!size) - vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); + vm_munmap(hva, old_npages * PAGE_SIZE); return 0; } @@@ -9881,36 -9841,34 +9881,36 @@@ void kvm_arch_destroy_vm(struct kvm *kv kvm_hv_destroy_vm(kvm); } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { - kvfree(free->arch.rmap[i]); - free->arch.rmap[i] = NULL; - } + kvfree(slot->arch.rmap[i]); + slot->arch.rmap[i] = NULL; + if (i == 0) continue; - if (!dont || free->arch.lpage_info[i - 1] != - dont->arch.lpage_info[i - 1]) { - kvfree(free->arch.lpage_info[i - 1]); - free->arch.lpage_info[i - 1] = NULL; - } + kvfree(slot->arch.lpage_info[i - 1]); + slot->arch.lpage_info[i - 1] = NULL; } - kvm_page_track_free_memslot(free, dont); + kvm_page_track_free_memslot(slot); } -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) +static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, + unsigned long npages) { int i; + /* + * Clear out the previous array pointers for the KVM_MR_MOVE case. The + * old arrays will be freed by __kvm_set_memory_region() if installing + * the new memslot is successful. + */ + memset(&slot->arch, 0, sizeof(slot->arch)); + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { struct kvm_lpage_info *linfo; unsigned long ugfn; @@@ -9941,9 -9899,11 +9941,9 @@@ ugfn = slot->userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this slot + * other, disable large page support for this slot. */ - if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !kvm_largepages_enabled()) { + if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) { unsigned long j; for (j = 0; j < lpages; ++j) @@@ -9990,9 -9950,6 +9990,9 @@@ int kvm_arch_prepare_memory_region(stru const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { + if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) + return kvm_alloc_memslot_metadata(memslot, + mem->memory_size >> PAGE_SHIFT); return 0; } @@@ -10001,7 -9958,7 +10001,7 @@@ static void kvm_mmu_slot_apply_flags(st { /* Still write protect RO slot */ if (new->flags & KVM_MEM_READONLY) { - kvm_mmu_slot_remove_write_access(kvm, new); + kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL); return; } @@@ -10036,23 -9993,10 +10036,23 @@@ * See the comments in fast_page_fault(). */ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { - if (kvm_x86_ops->slot_enable_log_dirty) + if (kvm_x86_ops->slot_enable_log_dirty) { kvm_x86_ops->slot_enable_log_dirty(kvm, new); - else - kvm_mmu_slot_remove_write_access(kvm, new); + } else { + int level = + kvm_dirty_log_manual_protect_and_init_set(kvm) ? + PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL; + + /* + * If we're with initial-all-set, we don't need + * to write protect any small page because + * they're reported as dirty already. However + * we still need to write-protect huge pages + * so that the page split can happen lazily on + * the first write to the huge page. + */ + kvm_mmu_slot_remove_write_access(kvm, new, level); + } } else { if (kvm_x86_ops->slot_disable_log_dirty) kvm_x86_ops->slot_disable_log_dirty(kvm, new); @@@ -10061,7 -10005,7 +10061,7 @@@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, + struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { @@@ -10103,10 -10047,6 +10103,10 @@@ */ if (change != KVM_MR_DELETE) kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new); + + /* Free the arrays associated with the old memslot. */ + if (change == KVM_MR_MOVE) + kvm_arch_free_memslot(kvm, old); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@@ -10251,7 -10191,7 +10251,7 @@@ void kvm_arch_async_page_ready(struct k return; if (!vcpu->arch.mmu->direct_map && - work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) + work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu)) return; kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); @@@ -10574,5 -10514,4 +10574,5 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_fu EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); diff --combined include/linux/kvm_host.h index b19dee4,bcb9b2a..f6a1905 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@@ -360,10 -360,6 +360,10 @@@ static inline unsigned long *kvm_second return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap); } +#ifndef KVM_DIRTY_LOG_MANUAL_CAPS +#define KVM_DIRTY_LOG_MANUAL_CAPS KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE +#endif + struct kvm_s390_adapter_int { u64 ind_addr; u64 summary_addr; @@@ -435,11 -431,11 +435,11 @@@ static inline int kvm_arch_vcpu_memslot */ struct kvm_memslots { u64 generation; - struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM]; /* The mapping table from slot id to the index in memslots[]. */ short id_to_index[KVM_MEM_SLOTS_NUM]; atomic_t lru_slot; int used_slots; + struct kvm_memory_slot memslots[]; }; struct kvm { @@@ -497,7 -493,7 +497,7 @@@ #endif long tlbs_dirty; struct list_head devices; - bool manual_dirty_log_protect; + u64 manual_dirty_log_protect; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; struct srcu_struct srcu; @@@ -531,11 -527,6 +531,11 @@@ #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) +static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) +{ + return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); +} + static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) { return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, @@@ -581,11 -572,10 +581,11 @@@ static inline int kvm_vcpu_get_idx(stru return vcpu->vcpu_idx; } -#define kvm_for_each_memslot(memslot, slots) \ - for (memslot = &slots->memslots[0]; \ - memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\ - memslot++) +#define kvm_for_each_memslot(memslot, slots) \ + for (memslot = &slots->memslots[0]; \ + memslot < slots->memslots + slots->used_slots; memslot++) \ + if (WARN_ON_ONCE(!memslot->npages)) { \ + } else void kvm_vcpu_destroy(struct kvm_vcpu *vcpu); @@@ -645,15 -635,12 +645,15 @@@ static inline struct kvm_memslots *kvm_ return __kvm_memslots(vcpu->kvm, as_id); } -static inline struct kvm_memory_slot * -id_to_memslot(struct kvm_memslots *slots, int id) +static inline +struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id) { int index = slots->id_to_index[id]; struct kvm_memory_slot *slot; + if (index < 0) + return NULL; + slot = &slots->memslots[index]; WARN_ON(slot->id != id); @@@ -682,7 -669,10 +682,7 @@@ int kvm_set_memory_region(struct kvm *k const struct kvm_userspace_memory_region *mem); int __kvm_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem); -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont); -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages); +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, @@@ -690,9 -680,11 +690,9 @@@ enum kvm_mr_change change); void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, + struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change); -bool kvm_largepages_enabled(void); -void kvm_disable_largepages(void); /* flush all memory translations */ void kvm_arch_flush_shadow_all(struct kvm *kvm); /* flush memory translations pointing to 'slot' */ @@@ -712,6 -704,7 +712,6 @@@ void kvm_release_page_clean(struct pag void kvm_release_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); -kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); @@@ -826,20 -819,23 +826,20 @@@ vm_fault_t kvm_arch_vcpu_fault(struct k int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); -int kvm_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log, int *is_dirty); - -int kvm_get_dirty_log_protect(struct kvm *kvm, - struct kvm_dirty_log *log, bool *flush); -int kvm_clear_dirty_log_protect(struct kvm *kvm, - struct kvm_clear_dirty_log *log, bool *flush); - void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); - -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log); -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, - struct kvm_clear_dirty_log *log); +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot); + +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot); +#else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); +int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, + int *is_dirty, struct kvm_memory_slot **memslot); +#endif int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status); @@@ -1022,8 -1018,6 +1022,8 @@@ bool kvm_arch_irqfd_allowed(struct kvm * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. * gfn_to_memslot() itself isn't here as an inline because that would * bloat other code too much. + * + * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! */ static inline struct kvm_memory_slot * search_memslots(struct kvm_memslots *slots, gfn_t gfn) @@@ -1032,9 -1026,6 +1032,9 @@@ int slot = atomic_read(&slots->lru_slot); struct kvm_memory_slot *memslots = slots->memslots; + if (unlikely(!slots->used_slots)) + return NULL; + if (gfn >= memslots[slot].base_gfn && gfn < memslots[slot].base_gfn + memslots[slot].npages) return &memslots[slot]; @@@ -1353,7 -1344,7 +1353,7 @@@ static inline void kvm_vcpu_set_dy_elig #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ struct kvm_vcpu *kvm_get_running_vcpu(void); - struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); + struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS bool kvm_arch_has_irq_bypass(void); diff --combined virt/kvm/arm/arm.c index bfdba1c,4d864f8..376c6a7 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@@ -625,6 -625,14 +625,14 @@@ static void check_vcpu_requests(struct if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu)) kvm_update_stolen_time(vcpu); + + if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) { + /* The distributor enable bits were changed */ + preempt_disable(); + vgic_v4_put(vcpu, false); + vgic_v4_load(vcpu); + preempt_enable(); + } } } @@@ -742,9 -750,7 +750,7 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_ guest_enter_irqoff(); if (has_vhe()) { - kvm_arm_vhe_guest_enter(); ret = kvm_vcpu_run_vhe(vcpu); - kvm_arm_vhe_guest_exit(); } else { ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu); } @@@ -1183,15 -1189,55 +1189,15 @@@ long kvm_arch_vcpu_ioctl(struct file *f return r; } -/** - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot - * @kvm: kvm instance - * @log: slot id and address to which we copy the log - * - * Steps 1-4 below provide general overview of dirty page logging. See - * kvm_get_dirty_log_protect() function description for additional details. - * - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we - * always flush the TLB (step 4) even if previous step failed and the dirty - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API - * does not preclude user space subsequent dirty log read. Flushing TLB ensures - * writes will be marked dirty for next log read. - * - * 1. Take a snapshot of the bit and clear it if needed. - * 2. Write protect the corresponding page. - * 3. Copy the snapshot to the userspace. - * 4. Flush TLB's if needed. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - - r = kvm_get_dirty_log_protect(kvm, log, &flush); - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; } -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) { - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - - r = kvm_clear_dirty_log_protect(kvm, log, &flush); - - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; + kvm_flush_remote_tlbs(kvm); } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,