Merge tag 'kvmarm-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm...
author Paolo Bonzini <pbonzini@redhat.com>
Tue, 31 Mar 2020 14:44:53 +0000 (10:44 -0400)
committer Paolo Bonzini <pbonzini@redhat.com>
Tue, 31 Mar 2020 14:44:53 +0000 (10:44 -0400)
KVM/arm updates for Linux 5.7

- GICv4.1 support
- 32bit host removal

Documentation/admin-guide/kernel-parameters.txt
MAINTAINERS
arch/arm64/kvm/hyp/switch.c
arch/s390/boot/Makefile
arch/s390/include/asm/page.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
include/linux/kvm_host.h
virt/kvm/arm/arm.c

                        dynamic table installation which will install SSDT
                        tables to /sys/firmware/acpi/tables/dynamic.
  
+       acpi_no_watchdog        [HW,ACPI,WDT]
+                       Ignore the ACPI-based watchdog interface (WDAT) and let
+                       a native driver control the watchdog device instead.
        acpi_rsdp=      [ACPI,EFI,KEXEC]
                        Pass the RSDP address to the kernel, mostly used
                        on machines running EFI runtime service to boot the
                        second kernel for kdump.
  
                        before loading.
                        See Documentation/admin-guide/blockdev/ramdisk.rst.
  
 +      prot_virt=      [S390] enable hosting protected virtual machines
 +                      isolated from the hypervisor (if hardware supports
 +                      that).
 +                      Format: <bool>
 +
        psi=            [KNL] Enable or disable pressure stall information
                        tracking.
                        Format: <bool>
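
For illustration only (not part of the patch): since prot_virt= takes a <bool>,
hosting protected guests would typically be requested by booting the s390 host
with the parameter appended to the kernel command line, e.g.

        prot_virt=1
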
diff --combined MAINTAINERS
@@@ -3649,6 -3649,7 +3649,7 @@@ F:      sound/pci/oxygen
  
  C-SKY ARCHITECTURE
  M:    Guo Ren <guoren@kernel.org>
+ L:    linux-csky@vger.kernel.org
  T:    git https://github.com/c-sky/csky-linux.git
  S:    Supported
  F:    arch/csky/
@@@ -3909,7 -3910,7 +3910,7 @@@ S:      Supporte
  F:    Documentation/filesystems/ceph.txt
  F:    fs/ceph/
  
- CERTIFICATE HANDLING:
+ CERTIFICATE HANDLING
  M:    David Howells <dhowells@redhat.com>
  M:    David Woodhouse <dwmw2@infradead.org>
  L:    keyrings@vger.kernel.org
@@@ -3919,7 -3920,7 +3920,7 @@@ F:      certs
  F:    scripts/sign-file.c
  F:    scripts/extract-cert.c
  
- CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
+ CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM
  L:    devel@driverdev.osuosl.org
  S:    Obsolete
  F:    drivers/staging/wusbcore/
@@@ -5932,12 -5933,12 +5933,12 @@@ S:   Maintaine
  F:    drivers/media/dvb-frontends/ec100*
  
  ECRYPT FILE SYSTEM
- M:    Tyler Hicks <tyhicks@canonical.com>
+ M:    Tyler Hicks <code@tyhicks.com>
  L:    ecryptfs@vger.kernel.org
  W:    http://ecryptfs.org
  W:    https://launchpad.net/ecryptfs
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tyhicks/ecryptfs.git
- S:    Supported
+ S:    Odd Fixes
  F:    Documentation/filesystems/ecryptfs.txt
  F:    fs/ecryptfs/
  
@@@ -7047,7 -7048,7 +7048,7 @@@ L:      kvm@vger.kernel.or
  S:    Supported
  F:    drivers/uio/uio_pci_generic.c
  
- GENERIC VDSO LIBRARY:
+ GENERIC VDSO LIBRARY
  M:    Andy Lutomirski <luto@kernel.org>
  M:    Thomas Gleixner <tglx@linutronix.de>
  M:    Vincenzo Frascino <vincenzo.frascino@arm.com>
@@@ -8392,7 -8393,7 +8393,7 @@@ M:      Joonas Lahtinen <joonas.lahtinen@lin
  M:    Rodrigo Vivi <rodrigo.vivi@intel.com>
  L:    intel-gfx@lists.freedesktop.org
  W:    https://01.org/linuxgraphics/
- B:    https://01.org/linuxgraphics/documentation/how-report-bugs
+ B:    https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs
  C:    irc://chat.freenode.net/intel-gfx
  Q:    http://patchwork.freedesktop.org/project/intel-gfx/
  T:    git git://anongit.freedesktop.org/drm-intel
@@@ -9163,7 -9164,7 +9164,7 @@@ F:      virt/kvm/
  F:    tools/kvm/
  F:    tools/testing/selftests/kvm/
  
- KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64)
+ KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
  M:    Marc Zyngier <maz@kernel.org>
  R:    James Morse <james.morse@arm.com>
  R:    Julien Thierry <julien.thierry.kdev@gmail.com>
@@@ -9172,9 -9173,6 +9173,6 @@@ L:      linux-arm-kernel@lists.infradead.or
  L:    kvmarm@lists.cs.columbia.edu
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
  S:    Maintained
- F:    arch/arm/include/uapi/asm/kvm*
- F:    arch/arm/include/asm/kvm*
- F:    arch/arm/kvm/
  F:    arch/arm64/include/uapi/asm/kvm*
  F:    arch/arm64/include/asm/kvm*
  F:    arch/arm64/kvm/
@@@ -9209,7 -9207,6 +9207,7 @@@ L:      kvm@vger.kernel.or
  W:    http://www.ibm.com/developerworks/linux/linux390/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git
  S:    Supported
 +F:    Documentation/virt/kvm/s390*
  F:    arch/s390/include/uapi/asm/kvm*
  F:    arch/s390/include/asm/gmap.h
  F:    arch/s390/include/asm/kvm*
@@@ -9279,7 -9276,7 +9277,7 @@@ F:      include/keys/trusted-type.
  F:    security/keys/trusted.c
  F:    include/keys/trusted.h
  
- KEYS/KEYRINGS:
+ KEYS/KEYRINGS
  M:    David Howells <dhowells@redhat.com>
  M:    Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
  L:    keyrings@vger.kernel.org
@@@ -11115,14 -11112,12 +11113,12 @@@ S:        Maintaine
  F:    drivers/usb/image/microtek.*
  
  MIPS
- M:    Ralf Baechle <ralf@linux-mips.org>
- M:    Paul Burton <paulburton@kernel.org>
+ M:    Thomas Bogendoerfer <tsbogend@alpha.franken.de>
  L:    linux-mips@vger.kernel.org
  W:    http://www.linux-mips.org/
- T:    git git://git.linux-mips.org/pub/scm/ralf/linux.git
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git
  Q:    http://patchwork.linux-mips.org/project/linux-mips/list/
- S:    Supported
+ S:    Maintained
  F:    Documentation/devicetree/bindings/mips/
  F:    Documentation/mips/
  F:    arch/mips/
@@@ -11485,7 -11480,7 +11481,7 @@@ F:   drivers/scsi/mac_scsi.
  F:    drivers/scsi/sun3_scsi.*
  F:    drivers/scsi/sun3_scsi_vme.c
  
- NCSI LIBRARY:
+ NCSI LIBRARY
  M:    Samuel Mendoza-Jonas <sam@mendozajonas.com>
  S:    Maintained
  F:    net/ncsi/
@@@ -12741,7 -12736,7 +12737,7 @@@ M:   Tom Joseph <tjoseph@cadence.com
  L:    linux-pci@vger.kernel.org
  S:    Maintained
  F:    Documentation/devicetree/bindings/pci/cdns,*.txt
- F:    drivers/pci/controller/pcie-cadence*
+ F:    drivers/pci/controller/cadence/
  
  PCI DRIVER FOR FREESCALE LAYERSCAPE
  M:    Minghuan Lian <minghuan.Lian@nxp.com>
@@@ -13513,7 -13508,7 +13509,7 @@@ L:   linuxppc-dev@lists.ozlabs.or
  S:    Maintained
  F:    drivers/block/ps3vram.c
  
- PSAMPLE PACKET SAMPLING SUPPORT:
+ PSAMPLE PACKET SAMPLING SUPPORT
  M:    Yotam Gigi <yotam.gi@gmail.com>
  S:    Maintained
  F:    net/psample
@@@ -14583,10 -14578,10 +14579,10 @@@ F:        drivers/media/pci/saa7146
  F:    include/media/drv-intf/saa7146*
  
  SAFESETID SECURITY MODULE
- M:     Micah Morton <mortonm@chromium.org>
- S:     Supported
- F:     security/safesetid/
- F:     Documentation/admin-guide/LSM/SafeSetID.rst
+ M:    Micah Morton <mortonm@chromium.org>
+ S:    Supported
+ F:    security/safesetid/
+ F:    Documentation/admin-guide/LSM/SafeSetID.rst
  
  SAMSUNG AUDIO (ASoC) DRIVERS
  M:    Krzysztof Kozlowski <krzk@kernel.org>
@@@ -16553,8 -16548,8 +16549,8 @@@ M:   Michael Jamet <michael.jamet@intel.c
  M:    Mika Westerberg <mika.westerberg@linux.intel.com>
  M:    Yehezkel Bernat <YehezkelShB@gmail.com>
  L:    linux-usb@vger.kernel.org
- T:    git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
  S:    Maintained
+ T:    git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
  F:    Documentation/admin-guide/thunderbolt.rst
  F:    drivers/thunderbolt/
  F:    include/linux/thunderbolt.h
@@@ -17081,7 -17076,7 +17077,7 @@@ S:   Maintaine
  F:    Documentation/admin-guide/ufs.rst
  F:    fs/ufs/
  
- UHID USERSPACE HID IO DRIVER:
+ UHID USERSPACE HID IO DRIVER
  M:    David Herrmann <dh.herrmann@googlemail.com>
  L:    linux-input@vger.kernel.org
  S:    Maintained
@@@ -17095,18 -17090,18 +17091,18 @@@ S:        Maintaine
  F:    drivers/usb/common/ulpi.c
  F:    include/linux/ulpi/
  
- ULTRA-WIDEBAND (UWB) SUBSYSTEM:
+ ULTRA-WIDEBAND (UWB) SUBSYSTEM
  L:    devel@driverdev.osuosl.org
  S:    Obsolete
  F:    drivers/staging/uwb/
  
- UNICODE SUBSYSTEM:
+ UNICODE SUBSYSTEM
  M:    Gabriel Krisman Bertazi <krisman@collabora.com>
  L:    linux-fsdevel@vger.kernel.org
  S:    Supported
  F:    fs/unicode/
  
- UNICORE32 ARCHITECTURE:
+ UNICORE32 ARCHITECTURE
  M:    Guan Xuetao <gxt@pku.edu.cn>
  W:    http://mprc.pku.edu.cn/~guanxuetao/linux
  S:    Maintained
@@@ -17393,11 -17388,14 +17389,14 @@@ F:        drivers/usb
  F:    include/linux/usb.h
  F:    include/linux/usb/
  
- USB TYPEC PI3USB30532 MUX DRIVER
- M:    Hans de Goede <hdegoede@redhat.com>
+ USB TYPEC BUS FOR ALTERNATE MODES
+ M:    Heikki Krogerus <heikki.krogerus@linux.intel.com>
  L:    linux-usb@vger.kernel.org
  S:    Maintained
- F:    drivers/usb/typec/mux/pi3usb30532.c
+ F:    Documentation/ABI/testing/sysfs-bus-typec
+ F:    Documentation/driver-api/usb/typec_bus.rst
+ F:    drivers/usb/typec/altmodes/
+ F:    include/linux/usb/typec_altmode.h
  
  USB TYPEC CLASS
  M:    Heikki Krogerus <heikki.krogerus@linux.intel.com>
@@@ -17408,14 -17406,11 +17407,11 @@@ F:        Documentation/driver-api/usb/typec.r
  F:    drivers/usb/typec/
  F:    include/linux/usb/typec.h
  
- USB TYPEC BUS FOR ALTERNATE MODES
- M:    Heikki Krogerus <heikki.krogerus@linux.intel.com>
+ USB TYPEC PI3USB30532 MUX DRIVER
+ M:    Hans de Goede <hdegoede@redhat.com>
  L:    linux-usb@vger.kernel.org
  S:    Maintained
- F:    Documentation/ABI/testing/sysfs-bus-typec
- F:    Documentation/driver-api/usb/typec_bus.rst
- F:    drivers/usb/typec/altmodes/
- F:    include/linux/usb/typec_altmode.h
+ F:    drivers/usb/typec/mux/pi3usb30532.c
  
  USB TYPEC PORT CONTROLLER DRIVERS
  M:    Guenter Roeck <linux@roeck-us.net>
@@@ -17792,7 -17787,7 +17788,7 @@@ F:   include/linux/vbox_utils.
  F:    include/uapi/linux/vbox*.h
  F:    drivers/virt/vboxguest/
  
- VIRTUAL BOX SHARED FOLDER VFS DRIVER:
+ VIRTUAL BOX SHARED FOLDER VFS DRIVER
  M:    Hans de Goede <hdegoede@redhat.com>
  L:    linux-fsdevel@vger.kernel.org
  S:    Maintained
diff --combined arch/arm64/kvm/hyp/switch.c
@@@ -17,6 -17,7 +17,6 @@@
  #include <asm/kprobes.h>
  #include <asm/kvm_asm.h>
  #include <asm/kvm_emulate.h>
 -#include <asm/kvm_host.h>
  #include <asm/kvm_hyp.h>
  #include <asm/kvm_mmu.h>
  #include <asm/fpsimd.h>
@@@ -624,7 -625,7 +624,7 @@@ static void __hyp_text __pmu_switch_to_
  }
  
  /* Switch to the guest for VHE systems running in EL2 */
- int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
+ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
  {
        struct kvm_cpu_context *host_ctxt;
        struct kvm_cpu_context *guest_ctxt;
  
        return exit_code;
  }
- NOKPROBE_SYMBOL(kvm_vcpu_run_vhe);
+ NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe);
+ int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
+ {
+       int ret;
+       local_daif_mask();
+       /*
+        * Having IRQs masked via PMR when entering the guest means the GIC
+        * will not signal the CPU of interrupts of lower priority, and the
+        * only way to get out will be via guest exceptions.
+        * Naturally, we want to avoid this.
+        *
+        * local_daif_mask() already sets GIC_PRIO_PSR_I_SET, we just need a
+        * dsb to ensure the redistributor forwards EL2 IRQs to the CPU.
+        */
+       pmr_sync();
+       ret = __kvm_vcpu_run_vhe(vcpu);
+       /*
+        * local_daif_restore() takes care to properly restore PSTATE.DAIF
+        * and the GIC PMR if the host is using IRQ priorities.
+        */
+       local_daif_restore(DAIF_PROCCTX_NOIRQ);
+       /*
+        * When we exit from the guest we change a number of CPU configuration
+        * parameters, such as traps.  Make sure these changes take effect
+        * before running the host or additional guests.
+        */
+       isb();
+       return ret;
+ }
  
  /* Switch to the guest for legacy non-VHE systems */
  int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
diff --combined arch/s390/boot/Makefile
@@@ -37,7 -37,7 +37,7 @@@ CFLAGS_sclp_early_core.o += -I$(srctree
  obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o
  obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
  obj-y += version.o pgm_check_info.o ctype.o text_dma.o
 -obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST)  += uv.o
 +obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
  obj-$(CONFIG_RELOCATABLE)     += machine_kexec_reloc.o
  obj-$(CONFIG_RANDOMIZE_BASE)  += kaslr.o
  targets       := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
@@@ -70,7 -70,7 +70,7 @@@ $(obj)/compressed/vmlinux: $(obj)/start
  $(obj)/startup.a: $(OBJECTS) FORCE
        $(call if_changed,ar)
  
- install: $(CONFIGURE) $(obj)/bzImage
+ install:
        sh -x  $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
              System.map "$(INSTALL_PATH)"
  
diff --combined arch/s390/include/asm/page.h
@@@ -42,7 -42,7 +42,7 @@@ void __storage_key_init_range(unsigned 
  
  static inline void storage_key_init_range(unsigned long start, unsigned long end)
  {
-       if (PAGE_DEFAULT_KEY)
+       if (PAGE_DEFAULT_KEY != 0)
                __storage_key_init_range(start, end);
  }
  
@@@ -153,11 -153,6 +153,11 @@@ static inline int devmem_is_allowed(uns
  #define HAVE_ARCH_FREE_PAGE
  #define HAVE_ARCH_ALLOC_PAGE
  
 +#if IS_ENABLED(CONFIG_PGSTE)
 +int arch_make_page_accessible(struct page *page);
 +#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
 +#endif
 +
  #endif /* !__ASSEMBLY__ */
  
  #define __PAGE_OFFSET         0x0UL
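
For context on the HAVE_ARCH_MAKE_PAGE_ACCESSIBLE define added above: this is the
usual opt-in pattern where generic code falls back to a no-op when an architecture
does not provide the hook. A minimal sketch of such a fallback (illustrative only;
the generic side is not part of this excerpt):

        #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
        static inline int arch_make_page_accessible(struct page *page)
        {
                return 0;       /* nothing to do without the arch hook */
        }
        #endif
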
diff --combined arch/x86/kvm/svm.c
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
  
+ #ifdef MODULE
  static const struct x86_cpu_id svm_cpu_id[] = {
        X86_FEATURE_MATCH(X86_FEATURE_SVM),
        {}
  };
  MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+ #endif
  
  #define IOPM_ALLOC_ORDER 2
  #define MSRPM_ALLOC_ORDER 1
@@@ -519,31 -521,10 +521,31 @@@ static void recalc_intercepts(struct vc
        h = &svm->nested.hsave->control;
        g = &svm->nested;
  
 -      c->intercept_cr = h->intercept_cr | g->intercept_cr;
 -      c->intercept_dr = h->intercept_dr | g->intercept_dr;
 -      c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 -      c->intercept = h->intercept | g->intercept;
 +      c->intercept_cr = h->intercept_cr;
 +      c->intercept_dr = h->intercept_dr;
 +      c->intercept_exceptions = h->intercept_exceptions;
 +      c->intercept = h->intercept;
 +
 +      if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
 +              /* We only want the cr8 intercept bits of L1 */
 +              c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
 +              c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
 +
 +              /*
 +               * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
 +               * affect any interrupt we may want to inject; therefore,
 +               * interrupt window vmexits are irrelevant to L0.
 +               */
 +              c->intercept &= ~(1ULL << INTERCEPT_VINTR);
 +      }
 +
 +      /* We don't want to see VMMCALLs from a nested guest */
 +      c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);
 +
 +      c->intercept_cr |= g->intercept_cr;
 +      c->intercept_dr |= g->intercept_dr;
 +      c->intercept_exceptions |= g->intercept_exceptions;
 +      c->intercept |= g->intercept;
  }
  
  static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
@@@ -648,11 -629,6 +650,11 @@@ static inline void clr_intercept(struc
        recalc_intercepts(svm);
  }
  
 +static inline bool is_intercept(struct vcpu_svm *svm, int bit)
 +{
 +      return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
 +}
 +
  static inline bool vgif_enabled(struct vcpu_svm *svm)
  {
        return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
@@@ -1232,7 -1208,6 +1234,7 @@@ static int avic_ga_log_notifier(u32 ga_
        u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
  
        pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
 +      trace_kvm_avic_ga_log(vm_id, vcpu_id);
  
        spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
        hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
@@@ -1394,29 -1369,6 +1396,29 @@@ static void svm_hardware_teardown(void
        iopm_base = 0;
  }
  
 +static __init void svm_set_cpu_caps(void)
 +{
 +      kvm_set_cpu_caps();
 +
 +      supported_xss = 0;
 +
 +      /* CPUID 0x80000001 and 0x8000000A (SVM features) */
 +      if (nested) {
 +              kvm_cpu_cap_set(X86_FEATURE_SVM);
 +
 +              if (nrips)
 +                      kvm_cpu_cap_set(X86_FEATURE_NRIPS);
 +
 +              if (npt_enabled)
 +                      kvm_cpu_cap_set(X86_FEATURE_NPT);
 +      }
 +
 +      /* CPUID 0x80000008 */
 +      if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
 +          boot_cpu_has(X86_FEATURE_AMD_SSBD))
 +              kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
 +}
 +
  static __init int svm_hardware_setup(void)
  {
        int cpu;
  
        init_msrpm_offsets();
  
 +      supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
 +
        if (boot_cpu_has(X86_FEATURE_NX))
                kvm_enable_efer_bits(EFER_NX);
  
        if (!boot_cpu_has(X86_FEATURE_NPT))
                npt_enabled = false;
  
 -      if (npt_enabled && !npt) {
 -              printk(KERN_INFO "kvm: Nested Paging disabled\n");
 +      if (npt_enabled && !npt)
                npt_enabled = false;
 -      }
  
 -      if (npt_enabled) {
 -              printk(KERN_INFO "kvm: Nested Paging enabled\n");
 -              kvm_enable_tdp();
 -      } else
 -              kvm_disable_tdp();
 +      kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL);
 +      pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
  
        if (nrips) {
                if (!boot_cpu_has(X86_FEATURE_NRIPS))
                        pr_info("Virtual GIF supported\n");
        }
  
 +      svm_set_cpu_caps();
 +
        return 0;
  
  err:
@@@ -1993,6 -1946,19 +1995,6 @@@ static void __unregister_enc_region_loc
        kfree(region);
  }
  
 -static struct kvm *svm_vm_alloc(void)
 -{
 -      struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm),
 -                                          GFP_KERNEL_ACCOUNT | __GFP_ZERO,
 -                                          PAGE_KERNEL);
 -      return &kvm_svm->kvm;
 -}
 -
 -static void svm_vm_free(struct kvm *kvm)
 -{
 -      vfree(to_kvm_svm(kvm));
 -}
 -
  static void sev_vm_destroy(struct kvm *kvm)
  {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@@ -2220,7 -2186,7 +2222,7 @@@ static void svm_vcpu_reset(struct kvm_v
        }
        init_vmcb(svm);
  
 -      kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
 +      kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
        kvm_rdx_write(vcpu, eax);
  
        if (kvm_vcpu_apicv_active(vcpu) && !init_event)
  static int avic_init_vcpu(struct vcpu_svm *svm)
  {
        int ret;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
  
-       if (!kvm_vcpu_apicv_active(&svm->vcpu))
+       if (!avic || !irqchip_in_kernel(vcpu->kvm))
                return 0;
  
        ret = avic_init_backing_page(&svm->vcpu);
@@@ -2453,38 -2420,14 +2456,38 @@@ static void svm_cache_reg(struct kvm_vc
        }
  }
  
 +static inline void svm_enable_vintr(struct vcpu_svm *svm)
 +{
 +      struct vmcb_control_area *control;
 +
 +      /* The following fields are ignored when AVIC is enabled */
 +      WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
 +
 +      /*
 +       * This is just a dummy VINTR to actually cause a vmexit to happen.
 +       * Actual injection of virtual interrupts happens through EVENTINJ.
 +       */
 +      control = &svm->vmcb->control;
 +      control->int_vector = 0x0;
 +      control->int_ctl &= ~V_INTR_PRIO_MASK;
 +      control->int_ctl |= V_IRQ_MASK |
 +              ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
 +      mark_dirty(svm->vmcb, VMCB_INTR);
 +}
 +
  static void svm_set_vintr(struct vcpu_svm *svm)
  {
        set_intercept(svm, INTERCEPT_VINTR);
 +      if (is_intercept(svm, INTERCEPT_VINTR))
 +              svm_enable_vintr(svm);
  }
  
  static void svm_clear_vintr(struct vcpu_svm *svm)
  {
        clr_intercept(svm, INTERCEPT_VINTR);
 +
 +      svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 +      mark_dirty(svm->vmcb, VMCB_INTR);
  }
  
  static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@@ -3040,6 -2983,15 +3043,6 @@@ static u64 nested_svm_get_tdp_pdptr(str
        return pdpte;
  }
  
 -static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 -                                 unsigned long root)
 -{
 -      struct vcpu_svm *svm = to_svm(vcpu);
 -
 -      svm->vmcb->control.nested_cr3 = __sme_set(root);
 -      mark_dirty(svm->vmcb, VMCB_NPT);
 -}
 -
  static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
                                       struct x86_exception *fault)
  {
@@@ -3075,7 -3027,8 +3078,7 @@@ static void nested_svm_init_mmu_context
  
        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
        kvm_init_shadow_mmu(vcpu);
 -      vcpu->arch.mmu->set_cr3           = nested_svm_set_tdp_cr3;
 -      vcpu->arch.mmu->get_cr3           = nested_svm_get_tdp_cr3;
 +      vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
        vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
        vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
        vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu);
@@@ -3136,36 -3089,43 +3139,36 @@@ static int nested_svm_check_exception(s
        return vmexit;
  }
  
 -/* This function returns true if it is save to enable the irq window */
 -static inline bool nested_svm_intr(struct vcpu_svm *svm)
 +static void nested_svm_intr(struct vcpu_svm *svm)
  {
 -      if (!is_guest_mode(&svm->vcpu))
 -              return true;
 -
 -      if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
 -              return true;
 -
 -      if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
 -              return false;
 -
 -      /*
 -       * if vmexit was already requested (by intercepted exception
 -       * for instance) do not overwrite it with "external interrupt"
 -       * vmexit.
 -       */
 -      if (svm->nested.exit_required)
 -              return false;
 -
        svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
        svm->vmcb->control.exit_info_1 = 0;
        svm->vmcb->control.exit_info_2 = 0;
  
 -      if (svm->nested.intercept & 1ULL) {
 -              /*
 -               * The #vmexit can't be emulated here directly because this
 -               * code path runs with irqs and preemption disabled. A
 -               * #vmexit emulation might sleep. Only signal request for
 -               * the #vmexit here.
 -               */
 -              svm->nested.exit_required = true;
 -              trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
 -              return false;
 +      /* nested_svm_vmexit() gets called later, from handle_exit() */
 +      svm->nested.exit_required = true;
 +      trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
 +}
 +
 +static bool nested_exit_on_intr(struct vcpu_svm *svm)
 +{
 +      return (svm->nested.intercept & 1ULL);
 +}
 +
 +static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 +{
 +      struct vcpu_svm *svm = to_svm(vcpu);
 +      bool block_nested_events =
 +              kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required;
 +
 +      if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) {
 +              if (block_nested_events)
 +                      return -EBUSY;
 +              nested_svm_intr(svm);
 +              return 0;
        }
  
 -      return true;
 +      return 0;
  }
  
  /* This function returns true if it is save to enable the nmi window */
@@@ -3284,6 -3244,9 +3287,6 @@@ static int nested_svm_exit_special(stru
        return NESTED_EXIT_CONTINUE;
  }
  
 -/*
 - * If this function returns true, this #vmexit was already handled
 - */
  static int nested_svm_intercept(struct vcpu_svm *svm)
  {
        u32 exit_code = svm->vmcb->control.exit_code;
@@@ -3558,9 -3521,6 +3561,9 @@@ static bool nested_svm_vmrun_msrpm(stru
  
  static bool nested_vmcb_checks(struct vmcb *vmcb)
  {
 +      if ((vmcb->save.efer & EFER_SVME) == 0)
 +              return false;
 +
        if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
                return false;
  
  static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
                                 struct vmcb *nested_vmcb, struct kvm_host_map *map)
  {
 +      bool evaluate_pending_interrupts =
 +              is_intercept(svm, INTERCEPT_VINTR) ||
 +              is_intercept(svm, INTERCEPT_IRET);
 +
        if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
                svm->vcpu.arch.hflags |= HF_HIF_MASK;
        else
        else
                svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
  
 -      if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
 -              /* We only want the cr8 intercept bits of the guest */
 -              clr_cr_intercept(svm, INTERCEPT_CR8_READ);
 -              clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 -      }
 -
 -      /* We don't want to see VMMCALLs from a nested guest */
 -      clr_intercept(svm, INTERCEPT_VMMCALL);
 -
        svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
        svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
  
  
        svm->nested.vmcb = vmcb_gpa;
  
 +      /*
 +       * If L1 had a pending IRQ/NMI before executing VMRUN,
 +       * which wasn't delivered because it was disallowed (e.g.
 +       * interrupts disabled), L0 needs to evaluate if this pending
 +       * event should cause an exit from L2 to L1 or be delivered
 +       * directly to L2.
 +       *
 +       * Usually this would be handled by the processor noticing an
 +       * IRQ/NMI window request.  However, VMRUN can unblock interrupts
 +       * by implicitly setting GIF, so force L0 to perform pending event
 +       * evaluation by requesting a KVM_REQ_EVENT.
 +       */
        enable_gif(svm);
 +      if (unlikely(evaluate_pending_interrupts))
 +              kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
  
        mark_all_dirty(svm->vmcb);
  }
@@@ -3883,8 -3834,11 +3886,8 @@@ static int clgi_interception(struct vcp
        disable_gif(svm);
  
        /* After a CLGI no interrupts should come */
 -      if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
 +      if (!kvm_vcpu_apicv_active(&svm->vcpu))
                svm_clear_vintr(svm);
 -              svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 -              mark_dirty(svm->vmcb, VMCB_INTR);
 -      }
  
        return ret;
  }
@@@ -5170,6 -5124,19 +5173,6 @@@ static void svm_inject_nmi(struct kvm_v
        ++vcpu->stat.nmi_injections;
  }
  
 -static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 -{
 -      struct vmcb_control_area *control;
 -
 -      /* The following fields are ignored when AVIC is enabled */
 -      control = &svm->vmcb->control;
 -      control->int_vector = irq;
 -      control->int_ctl &= ~V_INTR_PRIO_MASK;
 -      control->int_ctl |= V_IRQ_MASK |
 -              ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
 -      mark_dirty(svm->vmcb, VMCB_INTR);
 -}
 -
  static void svm_set_irq(struct kvm_vcpu *vcpu)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
@@@ -5558,15 -5525,18 +5561,15 @@@ static int svm_interrupt_allowed(struc
  {
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb = svm->vmcb;
 -      int ret;
  
        if (!gif_set(svm) ||
             (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
                return 0;
  
 -      ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
 -
 -      if (is_guest_mode(vcpu))
 -              return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
 -
 -      return ret;
 +      if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK))
 +              return !!(svm->vcpu.arch.hflags & HF_HIF_MASK);
 +      else
 +              return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
  }
  
  static void enable_irq_window(struct kvm_vcpu *vcpu)
         * enabled, the STGI interception will not occur. Enable the irq
         * window under the assumption that the hardware will set the GIF.
         */
 -      if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
 +      if (vgif_enabled(svm) || gif_set(svm)) {
                /*
                 * IRQ window is not needed when AVIC is enabled,
                 * unless we have pending ExtINT since it cannot be injected
                 */
                svm_toggle_avic_for_irq_window(vcpu, false);
                svm_set_vintr(svm);
 -              svm_inject_irq(svm, 0x0);
        }
  }
  
@@@ -5975,30 -5946,24 +5978,30 @@@ static void svm_vcpu_run(struct kvm_vcp
  }
  STACK_FRAME_NON_STANDARD(svm_vcpu_run);
  
 -static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 +static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
 +      bool update_guest_cr3 = true;
 +      unsigned long cr3;
  
 -      svm->vmcb->save.cr3 = __sme_set(root);
 -      mark_dirty(svm->vmcb, VMCB_CR);
 -}
 -
 -static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 -{
 -      struct vcpu_svm *svm = to_svm(vcpu);
 +      cr3 = __sme_set(root);
 +      if (npt_enabled) {
 +              svm->vmcb->control.nested_cr3 = cr3;
 +              mark_dirty(svm->vmcb, VMCB_NPT);
  
 -      svm->vmcb->control.nested_cr3 = __sme_set(root);
 -      mark_dirty(svm->vmcb, VMCB_NPT);
 +              /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
 +              if (is_guest_mode(vcpu))
 +                      update_guest_cr3 = false;
 +              else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
 +                      cr3 = vcpu->arch.cr3;
 +              else /* CR3 is already up-to-date.  */
 +                      update_guest_cr3 = false;
 +      }
  
 -      /* Also sync guest cr3 here in case we live migrate */
 -      svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
 -      mark_dirty(svm->vmcb, VMCB_CR);
 +      if (update_guest_cr3) {
 +              svm->vmcb->save.cr3 = cr3;
 +              mark_dirty(svm->vmcb, VMCB_CR);
 +      }
  }
  
  static int is_disabled(void)
@@@ -6060,19 -6025,12 +6063,19 @@@ static void svm_cpuid_update(struct kvm
                                    boot_cpu_has(X86_FEATURE_XSAVES);
  
        /* Update nrips enabled cache */
 -      svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
 +      svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
 +                           guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
  
        if (!kvm_vcpu_apicv_active(vcpu))
                return;
  
 -      guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
 +      /*
 +       * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
 +       * is exposed to the guest, disable AVIC.
 +       */
 +      if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
 +              kvm_request_apicv_update(vcpu->kvm, false,
 +                                       APICV_INHIBIT_REASON_X2APIC);
  
        /*
         * Currently, AVIC does not work with nested virtualization.
                                         APICV_INHIBIT_REASON_NESTED);
  }
  
 -#define F feature_bit
 -
 -static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 -{
 -      switch (func) {
 -      case 0x1:
 -              if (avic)
 -                      entry->ecx &= ~F(X2APIC);
 -              break;
 -      case 0x80000001:
 -              if (nested)
 -                      entry->ecx |= (1 << 2); /* Set SVM bit */
 -              break;
 -      case 0x80000008:
 -              if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
 -                   boot_cpu_has(X86_FEATURE_AMD_SSBD))
 -                      entry->ebx |= F(VIRT_SSBD);
 -              break;
 -      case 0x8000000A:
 -              entry->eax = 1; /* SVM revision 1 */
 -              entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
 -                                 ASID emulation to nested SVM */
 -              entry->ecx = 0; /* Reserved */
 -              entry->edx = 0; /* Per default do not support any
 -                                 additional features */
 -
 -              /* Support next_rip if host supports it */
 -              if (boot_cpu_has(X86_FEATURE_NRIPS))
 -                      entry->edx |= F(NRIPS);
 -
 -              /* Support NPT for the guest if enabled */
 -              if (npt_enabled)
 -                      entry->edx |= F(NPT);
 -
 -      }
 -}
 -
 -static int svm_get_lpage_level(void)
 -{
 -      return PT_PDPE_LEVEL;
 -}
 -
 -static bool svm_rdtscp_supported(void)
 -{
 -      return boot_cpu_has(X86_FEATURE_RDTSCP);
 -}
 -
 -static bool svm_invpcid_supported(void)
 -{
 -      return false;
 -}
 -
 -static bool svm_mpx_supported(void)
 -{
 -      return false;
 -}
 -
 -static bool svm_xsaves_supported(void)
 -{
 -      return boot_cpu_has(X86_FEATURE_XSAVES);
 -}
 -
 -static bool svm_umip_emulated(void)
 -{
 -      return false;
 -}
 -
 -static bool svm_pt_supported(void)
 -{
 -      return false;
 -}
 -
  static bool svm_has_wbinvd_exit(void)
  {
        return true;
  }
  
 -static bool svm_pku_supported(void)
 -{
 -      return false;
 -}
 -
  #define PRE_EX(exit)  { .exit_code = (exit), \
                        .stage = X86_ICPT_PRE_EXCEPT, }
  #define POST_EX(exit) { .exit_code = (exit), \
@@@ -6154,8 -6189,7 +6157,8 @@@ static const struct __x86_intercept 
  
  static int svm_check_intercept(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
 -                             enum x86_intercept_stage stage)
 +                             enum x86_intercept_stage stage,
 +                             struct x86_exception *exception)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
        int vmexit, ret = X86EMUL_CONTINUE;
@@@ -7339,8 -7373,7 +7342,8 @@@ static bool svm_check_apicv_inhibit_rea
                          BIT(APICV_INHIBIT_REASON_HYPERV) |
                          BIT(APICV_INHIBIT_REASON_NESTED) |
                          BIT(APICV_INHIBIT_REASON_IRQWIN) |
 -                        BIT(APICV_INHIBIT_REASON_PIT_REINJ);
 +                        BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
 +                        BIT(APICV_INHIBIT_REASON_X2APIC);
  
        return supported & BIT(bit);
  }
@@@ -7365,7 -7398,8 +7368,7 @@@ static struct kvm_x86_ops svm_x86_ops _
        .vcpu_free = svm_free_vcpu,
        .vcpu_reset = svm_vcpu_reset,
  
 -      .vm_alloc = svm_vm_alloc,
 -      .vm_free = svm_vm_free,
 +      .vm_size = sizeof(struct kvm_svm),
        .vm_init = svm_vm_init,
        .vm_destroy = svm_vm_destroy,
  
        .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
        .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
        .set_cr0 = svm_set_cr0,
 -      .set_cr3 = svm_set_cr3,
        .set_cr4 = svm_set_cr4,
        .set_efer = svm_set_efer,
        .get_idt = svm_get_idt,
  
        .get_exit_info = svm_get_exit_info,
  
 -      .get_lpage_level = svm_get_lpage_level,
 -
        .cpuid_update = svm_cpuid_update,
  
 -      .rdtscp_supported = svm_rdtscp_supported,
 -      .invpcid_supported = svm_invpcid_supported,
 -      .mpx_supported = svm_mpx_supported,
 -      .xsaves_supported = svm_xsaves_supported,
 -      .umip_emulated = svm_umip_emulated,
 -      .pt_supported = svm_pt_supported,
 -      .pku_supported = svm_pku_supported,
 -
 -      .set_supported_cpuid = svm_set_supported_cpuid,
 -
        .has_wbinvd_exit = svm_has_wbinvd_exit,
  
        .read_l1_tsc_offset = svm_read_l1_tsc_offset,
        .write_l1_tsc_offset = svm_write_l1_tsc_offset,
  
 -      .set_tdp_cr3 = set_tdp_cr3,
 +      .load_mmu_pgd = svm_load_mmu_pgd,
  
        .check_intercept = svm_check_intercept,
        .handle_exit_irqoff = svm_handle_exit_irqoff,
        .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
  
        .apic_init_signal_blocked = svm_apic_init_signal_blocked,
 +
 +      .check_nested_events = svm_check_nested_events,
  };
  
  static int __init svm_init(void)
diff --combined arch/x86/kvm/vmx/vmx.c
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
  
+ #ifdef MODULE
  static const struct x86_cpu_id vmx_cpu_id[] = {
        X86_FEATURE_MATCH(X86_FEATURE_VMX),
        {}
  };
  MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+ #endif
  
  bool __read_mostly enable_vpid = 1;
  module_param_named(vpid, enable_vpid, bool, 0444);
@@@ -433,6 -435,7 +435,6 @@@ static const struct kvm_vmx_segment_fie
        VMX_SEGMENT_FIELD(LDTR),
  };
  
 -u64 host_efer;
  static unsigned long host_idt_base;
  
  /*
@@@ -653,16 -656,53 +655,16 @@@ static int vmx_set_guest_msr(struct vcp
        return ret;
  }
  
 -void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
 -{
 -      vmcs_clear(loaded_vmcs->vmcs);
 -      if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
 -              vmcs_clear(loaded_vmcs->shadow_vmcs);
 -      loaded_vmcs->cpu = -1;
 -      loaded_vmcs->launched = 0;
 -}
 -
  #ifdef CONFIG_KEXEC_CORE
 -/*
 - * This bitmap is used to indicate whether the vmclear
 - * operation is enabled on all cpus. All disabled by
 - * default.
 - */
 -static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
 -
 -static inline void crash_enable_local_vmclear(int cpu)
 -{
 -      cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
 -}
 -
 -static inline void crash_disable_local_vmclear(int cpu)
 -{
 -      cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
 -}
 -
 -static inline int crash_local_vmclear_enabled(int cpu)
 -{
 -      return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
 -}
 -
  static void crash_vmclear_local_loaded_vmcss(void)
  {
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v;
  
 -      if (!crash_local_vmclear_enabled(cpu))
 -              return;
 -
        list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
                            loaded_vmcss_on_cpu_link)
                vmcs_clear(v->vmcs);
  }
 -#else
 -static inline void crash_enable_local_vmclear(int cpu) { }
 -static inline void crash_disable_local_vmclear(int cpu) { }
  #endif /* CONFIG_KEXEC_CORE */
  
  static void __loaded_vmcs_clear(void *arg)
                return; /* vcpu migration can race with cpu offline */
        if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;
 -      crash_disable_local_vmclear(cpu);
 +
 +      vmcs_clear(loaded_vmcs->vmcs);
 +      if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
 +              vmcs_clear(loaded_vmcs->shadow_vmcs);
 +
        list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
  
        /*
 -       * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
 -       * is before setting loaded_vmcs->vcpu to -1 which is done in
 -       * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
 -       * then adds the vmcs into percpu list before it is deleted.
 +       * Ensure all writes to loaded_vmcs, including deleting it from its
 +       * current percpu list, complete before setting loaded_vmcs->vcpu to
 +       * -1, otherwise a different cpu can see vcpu == -1 first and add
 +       * loaded_vmcs to its percpu list before it's deleted from this cpu's
 +       * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
         */
        smp_wmb();
  
 -      loaded_vmcs_init(loaded_vmcs);
 -      crash_enable_local_vmclear(cpu);
 +      loaded_vmcs->cpu = -1;
 +      loaded_vmcs->launched = 0;
  }
  
  void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@@ -775,7 -810,7 +777,7 @@@ void update_exception_bitmap(struct kvm
        if (to_vmx(vcpu)->rmode.vm86_active)
                eb = ~0;
        if (enable_ept)
 -              eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 +              eb &= ~(1u << PF_VECTOR);
  
        /* When we are running a nested L2 guest and L1 specified for it a
         * certain exception bitmap, we must trap the same exceptions and pass
@@@ -1026,7 -1061,7 +1028,7 @@@ static unsigned long segment_base(u16 s
  
  static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
  {
 -      return (pt_mode == PT_MODE_HOST_GUEST) &&
 +      return vmx_pt_mode_is_host_guest() &&
               !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
  }
  
@@@ -1060,7 -1095,7 +1062,7 @@@ static inline void pt_save_msr(struct p
  
  static void pt_guest_enter(struct vcpu_vmx *vmx)
  {
 -      if (pt_mode == PT_MODE_SYSTEM)
 +      if (vmx_pt_mode_is_system())
                return;
  
        /*
  
  static void pt_guest_exit(struct vcpu_vmx *vmx)
  {
 -      if (pt_mode == PT_MODE_SYSTEM)
 +      if (vmx_pt_mode_is_system())
                return;
  
        if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
@@@ -1310,17 -1345,18 +1312,17 @@@ void vmx_vcpu_load_vmcs(struct kvm_vcp
        if (!already_loaded) {
                loaded_vmcs_clear(vmx->loaded_vmcs);
                local_irq_disable();
 -              crash_disable_local_vmclear(cpu);
  
                /*
 -               * Read loaded_vmcs->cpu should be before fetching
 -               * loaded_vmcs->loaded_vmcss_on_cpu_link.
 -               * See the comments in __loaded_vmcs_clear().
 +               * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
 +               * this cpu's percpu list, otherwise it may not yet be deleted
 +               * from its previous cpu's percpu list.  Pairs with the
 +               * smp_wmb() in __loaded_vmcs_clear().
                 */
                smp_rmb();
  
                list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
 -              crash_enable_local_vmclear(cpu);
                local_irq_enable();
        }
  
@@@ -1653,6 -1689,16 +1655,6 @@@ static void vmx_queue_exception(struct 
        vmx_clear_hlt(vcpu);
  }
  
 -static bool vmx_rdtscp_supported(void)
 -{
 -      return cpu_has_vmx_rdtscp();
 -}
 -
 -static bool vmx_invpcid_supported(void)
 -{
 -      return cpu_has_vmx_invpcid();
 -}
 -
  /*
   * Swap MSR entry in host/guest MSR entry array.
   */
@@@ -1860,24 -1906,24 +1862,24 @@@ static int vmx_get_msr(struct kvm_vcpu 
                                                        &msr_info->data);
                break;
        case MSR_IA32_RTIT_CTL:
 -              if (pt_mode != PT_MODE_HOST_GUEST)
 +              if (!vmx_pt_mode_is_host_guest())
                        return 1;
                msr_info->data = vmx->pt_desc.guest.ctl;
                break;
        case MSR_IA32_RTIT_STATUS:
 -              if (pt_mode != PT_MODE_HOST_GUEST)
 +              if (!vmx_pt_mode_is_host_guest())
                        return 1;
                msr_info->data = vmx->pt_desc.guest.status;
                break;
        case MSR_IA32_RTIT_CR3_MATCH:
 -              if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +              if (!vmx_pt_mode_is_host_guest() ||
                        !intel_pt_validate_cap(vmx->pt_desc.caps,
                                                PT_CAP_cr3_filtering))
                        return 1;
                msr_info->data = vmx->pt_desc.guest.cr3_match;
                break;
        case MSR_IA32_RTIT_OUTPUT_BASE:
 -              if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +              if (!vmx_pt_mode_is_host_guest() ||
                        (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_topa_output) &&
                         !intel_pt_validate_cap(vmx->pt_desc.caps,
                msr_info->data = vmx->pt_desc.guest.output_base;
                break;
        case MSR_IA32_RTIT_OUTPUT_MASK:
 -              if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +              if (!vmx_pt_mode_is_host_guest() ||
                        (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_topa_output) &&
                         !intel_pt_validate_cap(vmx->pt_desc.caps,
                break;
        case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
                index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
 -              if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +              if (!vmx_pt_mode_is_host_guest() ||
                        (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_num_address_ranges)))
                        return 1;
@@@ -2102,7 -2148,7 +2104,7 @@@ static int vmx_set_msr(struct kvm_vcpu 
                        return 1;
                return vmx_set_vmx_msr(vcpu, msr_index, data);
        case MSR_IA32_RTIT_CTL:
 -              if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +              if (!vmx_pt_mode_is_host_guest() ||
                        vmx_rtit_ctl_check(vcpu, data) ||
                        vmx->nested.vmxon)
                        return 1;
@@@ -2218,33 -2264,18 +2220,33 @@@ static __init int vmx_disabled_by_bios(
               !boot_cpu_has(X86_FEATURE_VMX);
  }
  
 -static void kvm_cpu_vmxon(u64 addr)
 +static int kvm_cpu_vmxon(u64 vmxon_pointer)
  {
 +      u64 msr;
 +
        cr4_set_bits(X86_CR4_VMXE);
        intel_pt_handle_vmx(1);
  
 -      asm volatile ("vmxon %0" : : "m"(addr));
 +      asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
 +                        _ASM_EXTABLE(1b, %l[fault])
 +                        : : [vmxon_pointer] "m"(vmxon_pointer)
 +                        : : fault);
 +      return 0;
 +
 +fault:
 +      WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
 +                rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
 +      intel_pt_handle_vmx(0);
 +      cr4_clear_bits(X86_CR4_VMXE);
 +
 +      return -EFAULT;
  }
  
  static int hardware_enable(void)
  {
        int cpu = raw_smp_processor_id();
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 +      int r;
  
        if (cr4_read_shadow() & X86_CR4_VMXE)
                return -EBUSY;
        INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
        spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
  
 -      /*
 -       * Now we can enable the vmclear operation in kdump
 -       * since the loaded_vmcss_on_cpu list on this cpu
 -       * has been initialized.
 -       *
 -       * Though the cpu is not in VMX operation now, there
 -       * is no problem to enable the vmclear operation
 -       * for the loaded_vmcss_on_cpu list is empty!
 -       */
 -      crash_enable_local_vmclear(cpu);
 +      r = kvm_cpu_vmxon(phys_addr);
 +      if (r)
 +              return r;
  
 -      kvm_cpu_vmxon(phys_addr);
        if (enable_ept)
                ept_sync_global();
  
@@@ -2564,12 -2603,9 +2566,12 @@@ int alloc_loaded_vmcs(struct loaded_vmc
        if (!loaded_vmcs->vmcs)
                return -ENOMEM;
  
 +      vmcs_clear(loaded_vmcs->vmcs);
 +
        loaded_vmcs->shadow_vmcs = NULL;
        loaded_vmcs->hv_timer_soft_disabled = false;
 -      loaded_vmcs_init(loaded_vmcs);
 +      loaded_vmcs->cpu = -1;
 +      loaded_vmcs->launched = 0;
  
        if (cpu_has_vmx_msr_bitmap()) {
                loaded_vmcs->msr_bitmap = (unsigned long *)
@@@ -2951,8 -2987,9 +2953,8 @@@ void vmx_set_cr0(struct kvm_vcpu *vcpu
  
  static int get_ept_level(struct kvm_vcpu *vcpu)
  {
 -      /* Nested EPT currently only supports 4-level walks. */
        if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
 -              return 4;
 +              return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
        if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
                return 5;
        return 4;
@@@ -2972,7 -3009,7 +2974,7 @@@ u64 construct_eptp(struct kvm_vcpu *vcp
        return eptp;
  }
  
 -void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
  {
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
@@@ -3989,7 -4026,7 +3991,7 @@@ static void vmx_compute_secondary_exec_
  
        u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
  
 -      if (pt_mode == PT_MODE_SYSTEM)
 +      if (vmx_pt_mode_is_system())
                exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
        if (!cpu_need_virtualize_apic_accesses(vcpu))
                exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
                }
        }
  
 -      if (vmx_rdtscp_supported()) {
 +      if (cpu_has_vmx_rdtscp()) {
                bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
                if (!rdtscp_enabled)
                        exec_control &= ~SECONDARY_EXEC_RDTSCP;
                }
        }
  
 -      if (vmx_invpcid_supported()) {
 +      if (cpu_has_vmx_invpcid()) {
                /* Exposing INVPCID only when PCID is exposed */
                bool invpcid_enabled =
                        guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
@@@ -4230,7 -4267,7 +4232,7 @@@ static void init_vmcs(struct vcpu_vmx *
        if (cpu_has_vmx_encls_vmexit())
                vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
  
 -      if (pt_mode == PT_MODE_HOST_GUEST) {
 +      if (vmx_pt_mode_is_host_guest()) {
                memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
                /* Bit[6~0] are forced to 1, writes are ignored. */
                vmx->pt_desc.guest.output_mask = 0x7F;
@@@ -4458,13 -4495,8 +4460,13 @@@ static int vmx_nmi_allowed(struct kvm_v
  
  static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
  {
 -      return (!to_vmx(vcpu)->nested.nested_run_pending &&
 -              vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 +      if (to_vmx(vcpu)->nested.nested_run_pending)
 +              return false;
 +
 +      if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
 +              return true;
 +
 +      return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
  }
@@@ -4520,6 -4552,7 +4522,6 @@@ static bool rmode_exception(struct kvm_
        case GP_VECTOR:
        case MF_VECTOR:
                return true;
 -      break;
        }
        return false;
  }
@@@ -5296,6 -5329,7 +5298,6 @@@ static void vmx_enable_tdp(void
                VMX_EPT_RWX_MASK, 0ull);
  
        ept_set_mmio_spte_mask();
 -      kvm_enable_tdp();
  }
  
  /*
@@@ -5828,23 -5862,8 +5830,23 @@@ static int vmx_handle_exit(struct kvm_v
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
  
 -      if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
 -              return nested_vmx_reflect_vmexit(vcpu, exit_reason);
 +      if (is_guest_mode(vcpu)) {
 +              /*
 +               * The host physical addresses of some pages of guest memory
 +               * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
 +               * Page). The CPU may write to these pages via their host
 +               * physical address while L2 is running, bypassing any
 +               * address-translation-based dirty tracking (e.g. EPT write
 +               * protection).
 +               *
 +               * Mark them dirty on every exit from L2 to prevent them from
 +               * getting out of sync with dirty tracking.
 +               */
 +              nested_mark_vmcs12_pages_dirty(vcpu);
 +
 +              if (nested_vmx_exit_reflected(vcpu, exit_reason))
 +                      return nested_vmx_reflect_vmexit(vcpu, exit_reason);
 +      }
  
        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
                dump_vmcs();
@@@ -6204,13 -6223,15 +6206,13 @@@ static void handle_exception_nmi_irqoff
        vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
  
        /* if exit due to PF check for async PF */
 -      if (is_page_fault(vmx->exit_intr_info))
 +      if (is_page_fault(vmx->exit_intr_info)) {
                vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
 -
        /* Handle machine checks before interrupts are enabled */
 -      if (is_machine_check(vmx->exit_intr_info))
 +      } else if (is_machine_check(vmx->exit_intr_info)) {
                kvm_machine_check();
 -
        /* We need to handle NMIs before interrupts are enabled */
 -      if (is_nmi(vmx->exit_intr_info)) {
 +      } else if (is_nmi(vmx->exit_intr_info)) {
                kvm_before_interrupt(&vmx->vcpu);
                asm("int $2");
                kvm_after_interrupt(&vmx->vcpu);
@@@ -6296,6 -6317,11 +6298,6 @@@ static bool vmx_has_emulated_msr(int in
        }
  }
  
 -static bool vmx_pt_supported(void)
 -{
 -      return pt_mode == PT_MODE_HOST_GUEST;
 -}
 -
  static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
  {
        u32 exit_intr_info;
@@@ -6541,8 -6567,7 +6543,8 @@@ static void vmx_vcpu_run(struct kvm_vcp
  
        pt_guest_enter(vmx);
  
 -      atomic_switch_perf_msrs(vmx);
 +      if (vcpu_to_pmu(vcpu)->version)
 +              atomic_switch_perf_msrs(vmx);
        atomic_switch_umwait_control_msr(vmx);
  
        if (enable_preemption_timer)
        vmx_complete_interrupts(vmx);
  }
  
 -static struct kvm *vmx_vm_alloc(void)
 -{
 -      struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
 -                                          GFP_KERNEL_ACCOUNT | __GFP_ZERO,
 -                                          PAGE_KERNEL);
 -      return &kvm_vmx->kvm;
 -}
 -
 -static void vmx_vm_free(struct kvm *kvm)
 -{
 -      kfree(kvm->arch.hyperv.hv_pa_pg);
 -      vfree(to_kvm_vmx(kvm));
 -}
 -
  static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@@ -6861,24 -6900,17 +6863,24 @@@ static u64 vmx_get_mt_mask(struct kvm_v
        u8 cache;
        u64 ipat = 0;
  
 -      /* For VT-d and EPT combination
 -       * 1. MMIO: always map as UC
 -       * 2. EPT with VT-d:
 -       *   a. VT-d without snooping control feature: can't guarantee the
 -       *      result, try to trust guest.
 -       *   b. VT-d with snooping control feature: snooping control feature of
 -       *      VT-d engine can guarantee the cache correctness. Just set it
 -       *      to WB to keep consistent with host. So the same as item 3.
 -       * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
 -       *    consistent with host MTRR
 +      /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
 +       * memory aliases with conflicting memory types and sometimes MCEs.
 +       * We have to be careful as to what are honored and when.
 +       *
 +       * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
 +       * UC.  The effective memory type is UC or WC depending on guest PAT.
 +       * This was historically the source of MCEs and we want to be
 +       * conservative.
 +       *
 +       * When there is no need to deal with noncoherent DMA (e.g., no VT-d
 +       * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
 +       * EPT memory type is set to WB.  The effective memory type is forced
 +       * WB.
 +       *
 +       * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
 +       * EPT memory type is used to emulate guest CD/MTRR.
         */
 +
        if (is_mmio) {
                cache = MTRR_TYPE_UNCACHABLE;
                goto exit;
@@@ -6905,6 -6937,15 +6907,6 @@@ exit
        return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
  }
  
 -static int vmx_get_lpage_level(void)
 -{
 -      if (enable_ept && !cpu_has_vmx_ept_1g_page())
 -              return PT_DIRECTORY_LEVEL;
 -      else
 -              /* For shadow and EPT supported 1GB page */
 -              return PT_PDPE_LEVEL;
 -}
 -
  static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
  {
        /*
@@@ -7095,37 -7136,10 +7097,37 @@@ static void vmx_cpuid_update(struct kvm
        }
  }
  
 -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 +static __init void vmx_set_cpu_caps(void)
  {
 -      if (func == 1 && nested)
 -              entry->ecx |= feature_bit(VMX);
 +      kvm_set_cpu_caps();
 +
 +      /* CPUID 0x1 */
 +      if (nested)
 +              kvm_cpu_cap_set(X86_FEATURE_VMX);
 +
 +      /* CPUID 0x7 */
 +      if (kvm_mpx_supported())
 +              kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
 +      if (cpu_has_vmx_invpcid())
 +              kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
 +      if (vmx_pt_mode_is_host_guest())
 +              kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
 +
 +      /* PKU is not yet implemented for shadow paging. */
 +      if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE))
 +              kvm_cpu_cap_check_and_set(X86_FEATURE_PKU);
 +
 +      if (vmx_umip_emulated())
 +              kvm_cpu_cap_set(X86_FEATURE_UMIP);
 +
 +      /* CPUID 0xD.1 */
 +      supported_xss = 0;
 +      if (!vmx_xsaves_supported())
 +              kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
 +
 +      /* CPUID 0x80000001 */
 +      if (!cpu_has_vmx_rdtscp())
 +              kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
  }
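
The kvm_cpu_cap_check_and_set() calls above only advertise a feature when the host CPU itself has it; a minimal sketch of that pattern (hypothetical name, assuming it mirrors boot_cpu_has()):

static __always_inline void sketch_cap_check_and_set(unsigned int x86_feature)
{
	/* Expose the capability to guests only if the host CPU supports it. */
	if (boot_cpu_has(x86_feature))
		kvm_cpu_cap_set(x86_feature);
}
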
  
  static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
@@@ -7163,15 -7177,16 +7165,16 @@@ static int vmx_check_intercept_io(struc
        else
                intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
  
+       /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
        return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
  }
  
  static int vmx_check_intercept(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
 -                             enum x86_intercept_stage stage)
 +                             enum x86_intercept_stage stage,
 +                             struct x86_exception *exception)
  {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
  
        switch (info->intercept) {
        /*
         */
        case x86_intercept_rdtscp:
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
 -                      ctxt->exception.vector = UD_VECTOR;
 -                      ctxt->exception.error_code_valid = false;
 +                      exception->vector = UD_VECTOR;
 +                      exception->error_code_valid = false;
                        return X86EMUL_PROPAGATE_FAULT;
                }
                break;
        case x86_intercept_outs:
                return vmx_check_intercept_io(vcpu, info);
  
+       case x86_intercept_lgdt:
+       case x86_intercept_lidt:
+       case x86_intercept_lldt:
+       case x86_intercept_ltr:
+       case x86_intercept_sgdt:
+       case x86_intercept_sidt:
+       case x86_intercept_sldt:
+       case x86_intercept_str:
+               if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+                       return X86EMUL_CONTINUE;
+               /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
+               break;
        /* TODO: check more intercepts... */
        default:
                break;
@@@ -7278,8 -7307,7 +7295,8 @@@ static void vmx_sched_in(struct kvm_vcp
  static void vmx_slot_enable_log_dirty(struct kvm *kvm,
                                     struct kvm_memory_slot *slot)
  {
 -      kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
 +      if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
 +              kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
        kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
  }
  
@@@ -7633,7 -7661,9 +7650,7 @@@ static __init int hardware_setup(void
  {
        unsigned long host_bndcfgs;
        struct desc_ptr dt;
 -      int r, i;
 -
 -      rdmsrl_safe(MSR_EFER, &host_efer);
 +      int r, i, ept_lpage_level;
  
        store_idt(&dt);
        host_idt_base = dt.address;
                WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
        }
  
 +      if (!cpu_has_vmx_mpx())
 +              supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
 +                                  XFEATURE_MASK_BNDCSR);
 +
        if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
            !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
                enable_vpid = 0;
        if (!cpu_has_vmx_tpr_shadow())
                kvm_x86_ops->update_cr8_intercept = NULL;
  
 -      if (enable_ept && !cpu_has_vmx_ept_2m_page())
 -              kvm_disable_largepages();
 -
  #if IS_ENABLED(CONFIG_HYPERV)
        if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
            && enable_ept) {
  
        if (enable_ept)
                vmx_enable_tdp();
 +
 +      if (!enable_ept)
 +              ept_lpage_level = 0;
 +      else if (cpu_has_vmx_ept_1g_page())
 +              ept_lpage_level = PT_PDPE_LEVEL;
 +      else if (cpu_has_vmx_ept_2m_page())
 +              ept_lpage_level = PT_DIRECTORY_LEVEL;
        else
 -              kvm_disable_tdp();
 +              ept_lpage_level = PT_PAGE_TABLE_LEVEL;
 +      kvm_configure_mmu(enable_ept, ept_lpage_level);
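
For readers of the ladder above, the level constants map to mapping sizes; a small sketch of that relationship (assumed values: level 1 is 4KiB, level 2 is 2MiB, level 3 is 1GiB):

static unsigned long sketch_lpage_bytes(int level)
{
	/* 4KiB at level 1, 2MiB at level 2, 1GiB at level 3. */
	return 1UL << (12 + 9 * (level - 1));
}
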
  
        /*
         * Only enable PML when hardware supports PML feature, and both EPT
                        return r;
        }
  
 +      vmx_set_cpu_caps();
 +
        r = alloc_kvm_area();
        if (r)
                nested_vmx_hardware_unsetup();
@@@ -7829,8 -7848,9 +7846,8 @@@ static struct kvm_x86_ops vmx_x86_ops _
        .cpu_has_accelerated_tpr = report_flexpriority,
        .has_emulated_msr = vmx_has_emulated_msr,
  
 +      .vm_size = sizeof(struct kvm_vmx),
        .vm_init = vmx_vm_init,
 -      .vm_alloc = vmx_vm_alloc,
 -      .vm_free = vmx_vm_free,
  
        .vcpu_create = vmx_create_vcpu,
        .vcpu_free = vmx_free_vcpu,
        .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
        .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
        .set_cr0 = vmx_set_cr0,
 -      .set_cr3 = vmx_set_cr3,
        .set_cr4 = vmx_set_cr4,
        .set_efer = vmx_set_efer,
        .get_idt = vmx_get_idt,
  
        .get_exit_info = vmx_get_exit_info,
  
 -      .get_lpage_level = vmx_get_lpage_level,
 -
        .cpuid_update = vmx_cpuid_update,
  
 -      .rdtscp_supported = vmx_rdtscp_supported,
 -      .invpcid_supported = vmx_invpcid_supported,
 -
 -      .set_supported_cpuid = vmx_set_supported_cpuid,
 -
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
  
        .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
        .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
  
 -      .set_tdp_cr3 = vmx_set_cr3,
 +      .load_mmu_pgd = vmx_load_mmu_pgd,
  
        .check_intercept = vmx_check_intercept,
        .handle_exit_irqoff = vmx_handle_exit_irqoff,
 -      .mpx_supported = vmx_mpx_supported,
 -      .xsaves_supported = vmx_xsaves_supported,
 -      .umip_emulated = vmx_umip_emulated,
 -      .pt_supported = vmx_pt_supported,
 -      .pku_supported = vmx_pku_supported,
  
        .request_immediate_exit = vmx_request_immediate_exit,
  
diff --combined arch/x86/kvm/x86.c
@@@ -22,7 -22,6 +22,7 @@@
  #include "i8254.h"
  #include "tss.h"
  #include "kvm_cache_regs.h"
 +#include "kvm_emulate.h"
  #include "x86.h"
  #include "cpuid.h"
  #include "pmu.h"
@@@ -82,7 -81,7 +82,7 @@@ u64 __read_mostly kvm_mce_cap_supporte
  EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
  
  #define emul_to_vcpu(ctxt) \
 -      container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
 +      ((struct kvm_vcpu *)(ctxt)->vcpu)
  
  /* EFER defaults:
   * - enable syscall per default because its emulated by KVM
@@@ -181,17 -180,7 +181,17 @@@ struct kvm_shared_msrs 
  static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
  static struct kvm_shared_msrs __percpu *shared_msrs;
  
 +#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
 +                              | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
 +                              | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
 +                              | XFEATURE_MASK_PKRU)
 +
 +u64 __read_mostly host_efer;
 +EXPORT_SYMBOL_GPL(host_efer);
 +
  static u64 __read_mostly host_xss;
 +u64 __read_mostly supported_xss;
 +EXPORT_SYMBOL_GPL(supported_xss);
  
  struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
  };
  
  u64 __read_mostly host_xcr0;
 +u64 __read_mostly supported_xcr0;
 +EXPORT_SYMBOL_GPL(supported_xcr0);
  
  struct kmem_cache *x86_fpu_cache;
  EXPORT_SYMBOL_GPL(x86_fpu_cache);
  
 +static struct kmem_cache *x86_emulator_cache;
 +
 +static struct kmem_cache *kvm_alloc_emulator_cache(void)
 +{
 +      unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
 +      unsigned int size = sizeof(struct x86_emulate_ctxt);
 +
 +      return kmem_cache_create_usercopy("x86_emulator", size,
 +                                        __alignof__(struct x86_emulate_ctxt),
 +                                        SLAB_ACCOUNT, useroffset,
 +                                        size - useroffset, NULL);
 +}
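
The usercopy-aware slab above whitelists only the bytes from the emulator context's 'src' field onward for copies to and from user space; a rough sketch of the bounds check hardened usercopy applies against such a whitelist (conceptual, names made up):

static bool sketch_usercopy_allowed(size_t useroffset, size_t usersize,
				    size_t copy_off, size_t copy_len)
{
	/* The copy must stay inside the whitelisted [useroffset, useroffset + usersize) window. */
	return copy_off >= useroffset &&
	       copy_len <= usersize &&
	       copy_off - useroffset <= usersize - copy_len;
}
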
 +
  static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
  
  static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
@@@ -376,7 -350,6 +376,7 @@@ int kvm_set_apic_base(struct kvm_vcpu *
        }
  
        kvm_lapic_set_base(vcpu, msr_info->data);
 +      kvm_recalculate_apic_map(vcpu->kvm);
        return 0;
  }
  EXPORT_SYMBOL_GPL(kvm_set_apic_base);
@@@ -930,10 -903,10 +930,10 @@@ static u64 kvm_host_cr4_reserved_bits(s
  {
        u64 reserved_bits = __cr4_reserved_bits(cpu_has, c);
  
 -      if (cpuid_ecx(0x7) & feature_bit(LA57))
 +      if (kvm_cpu_cap_has(X86_FEATURE_LA57))
                reserved_bits &= ~X86_CR4_LA57;
  
 -      if (kvm_x86_ops->umip_emulated())
 +      if (kvm_cpu_cap_has(X86_FEATURE_UMIP))
                reserved_bits &= ~X86_CR4_UMIP;
  
        return reserved_bits;
@@@ -1585,12 -1558,8 +1585,12 @@@ static int handle_fastpath_set_x2apic_i
                ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
                ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
  
 +              /* No send delay, so always clear the Delivery Status (busy) bit. */
 +              data &= ~(1 << 12);
 +              kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
                kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
 -              return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
 +              kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
 +              trace_kvm_apic_write(APIC_ICR, (u32)data);
 +              return 0;
        }
  
        return 1;
  enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
  {
        u32 msr = kvm_rcx_read(vcpu);
 -      u64 data = kvm_read_edx_eax(vcpu);
 +      u64 data;
        int ret = 0;
  
        switch (msr) {
        case APIC_BASE_MSR + (APIC_ICR >> 4):
 +              data = kvm_read_edx_eax(vcpu);
                ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
                break;
        default:
@@@ -2555,7 -2523,7 +2555,7 @@@ static void kvmclock_sync_fn(struct wor
  static bool can_set_mci_status(struct kvm_vcpu *vcpu)
  {
        /* McStatusWrEn enabled? */
 -      if (guest_cpuid_is_amd(vcpu))
 +      if (guest_cpuid_is_amd_or_hygon(vcpu))
                return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
  
        return false;
@@@ -2830,11 -2798,12 +2830,11 @@@ int kvm_set_msr_common(struct kvm_vcpu 
                    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
                        return 1;
                /*
 -               * We do support PT if kvm_x86_ops->pt_supported(), but we do
 -               * not support IA32_XSS[bit 8]. Guests will have to use
 -               * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT
 -               * MSRs.
 +               * KVM supports exposing PT to the guest, but does not support
 +               * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
 +               * XSAVES/XRSTORS to save/restore PT MSRs.
                 */
 -              if (data != 0)
 +              if (data & ~supported_xss)
                        return 1;
                vcpu->arch.ia32_xss = data;
                break;
@@@ -3108,6 -3077,7 +3108,6 @@@ int kvm_get_msr_common(struct kvm_vcpu 
                break;
        case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
                return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
 -              break;
        case MSR_IA32_TSCDEADLINE:
                msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
                break;
                return kvm_hv_get_msr_common(vcpu,
                                             msr_info->index, &msr_info->data,
                                             msr_info->host_initiated);
 -              break;
        case MSR_IA32_BBL_CR_CTL3:
                /* This legacy MSR exists but isn't fully documented in current
                 * silicon.  It is however accessed by winxp in very narrow
@@@ -3493,7 -3464,7 +3493,7 @@@ long kvm_arch_dev_ioctl(struct file *fi
                r = 0;
                break;
        }
 -      case KVM_X86_GET_MCE_CAP_SUPPORTED: {
 +      case KVM_X86_GET_MCE_CAP_SUPPORTED:
                r = -EFAULT;
                if (copy_to_user(argp, &kvm_mce_cap_supported,
                                 sizeof(kvm_mce_cap_supported)))
        case KVM_GET_MSRS:
                r = msr_io(NULL, argp, do_get_msr_feature, 1);
                break;
 -      }
        default:
                r = -EINVAL;
 +              break;
        }
  out:
        return r;
@@@ -4130,7 -4101,8 +4130,7 @@@ static int kvm_vcpu_ioctl_x86_set_xsave
                 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
                 * with old userspace.
                 */
 -              if (xstate_bv & ~kvm_supported_xcr0() ||
 -                      mxcsr & ~mxcsr_feature_mask)
 +              if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
                        return -EINVAL;
                load_xsave(vcpu, (u8 *)guest_xsave->region);
        } else {
@@@ -4789,13 -4761,77 +4789,13 @@@ static int kvm_vm_ioctl_reinject(struc
        return 0;
  }
  
 -/**
 - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 - * @kvm: kvm instance
 - * @log: slot id and address to which we copy the log
 - *
 - * Steps 1-4 below provide general overview of dirty page logging. See
 - * kvm_get_dirty_log_protect() function description for additional details.
 - *
 - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
 - * always flush the TLB (step 4) even if previous step failed  and the dirty
 - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
 - * does not preclude user space subsequent dirty log read. Flushing TLB ensures
 - * writes will be marked dirty for next log read.
 - *
 - *   1. Take a snapshot of the bit and clear it if needed.
 - *   2. Write protect the corresponding page.
 - *   3. Copy the snapshot to the userspace.
 - *   4. Flush TLB's if needed.
 - */
 -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
  {
 -      bool flush = false;
 -      int r;
 -
 -      mutex_lock(&kvm->slots_lock);
 -
        /*
         * Flush potentially hardware-cached dirty pages to dirty_bitmap.
         */
        if (kvm_x86_ops->flush_log_dirty)
                kvm_x86_ops->flush_log_dirty(kvm);
 -
 -      r = kvm_get_dirty_log_protect(kvm, log, &flush);
 -
 -      /*
 -       * All the TLBs can be flushed out of mmu lock, see the comments in
 -       * kvm_mmu_slot_remove_write_access().
 -       */
 -      lockdep_assert_held(&kvm->slots_lock);
 -      if (flush)
 -              kvm_flush_remote_tlbs(kvm);
 -
 -      mutex_unlock(&kvm->slots_lock);
 -      return r;
 -}
 -
 -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
 -{
 -      bool flush = false;
 -      int r;
 -
 -      mutex_lock(&kvm->slots_lock);
 -
 -      /*
 -       * Flush potentially hardware-cached dirty pages to dirty_bitmap.
 -       */
 -      if (kvm_x86_ops->flush_log_dirty)
 -              kvm_x86_ops->flush_log_dirty(kvm);
 -
 -      r = kvm_clear_dirty_log_protect(kvm, log, &flush);
 -
 -      /*
 -       * All the TLBs can be flushed out of mmu lock, see the comments in
 -       * kvm_mmu_slot_remove_write_access().
 -       */
 -      lockdep_assert_held(&kvm->slots_lock);
 -      if (flush)
 -              kvm_flush_remote_tlbs(kvm);
 -
 -      mutex_unlock(&kvm->slots_lock);
 -      return r;
  }
  
  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
@@@ -5224,28 -5260,28 +5224,28 @@@ static void kvm_init_msr_list(void
                                continue;
                        break;
                case MSR_TSC_AUX:
 -                      if (!kvm_x86_ops->rdtscp_supported())
 +                      if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
                                continue;
                        break;
                case MSR_IA32_RTIT_CTL:
                case MSR_IA32_RTIT_STATUS:
 -                      if (!kvm_x86_ops->pt_supported())
 +                      if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
                                continue;
                        break;
                case MSR_IA32_RTIT_CR3_MATCH:
 -                      if (!kvm_x86_ops->pt_supported() ||
 +                      if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
                            !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
                                continue;
                        break;
                case MSR_IA32_RTIT_OUTPUT_BASE:
                case MSR_IA32_RTIT_OUTPUT_MASK:
 -                      if (!kvm_x86_ops->pt_supported() ||
 +                      if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
                                (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
                                 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
                                continue;
                        break;
                case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
 -                      if (!kvm_x86_ops->pt_supported() ||
 +                      if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
                                msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
                                intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
                                continue;
@@@ -5702,7 -5738,7 +5702,7 @@@ static int emulator_read_write_onepage(
        int handled, ret;
        bool write = ops->write;
        struct kvm_mmio_fragment *frag;
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
  
        /*
         * If the exit was due to a NPF we may already have a GPA.
         * operation using rep will only have the initial GPA from the NPF
         * occurred.
         */
 -      if (vcpu->arch.gpa_available &&
 -          emulator_can_use_gpa(ctxt) &&
 -          (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
 -              gpa = vcpu->arch.gpa_val;
 +      if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
 +          (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
 +              gpa = ctxt->gpa_val;
                ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
        } else {
                ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
@@@ -5933,9 -5970,11 +5933,9 @@@ static int emulator_pio_in_out(struct k
        return 0;
  }
  
 -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 -                                  int size, unsigned short port, void *val,
 -                                  unsigned int count)
 +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
 +                         unsigned short port, void *val, unsigned int count)
  {
 -      struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        int ret;
  
        if (vcpu->arch.pio.count)
@@@ -5955,30 -5994,17 +5955,30 @@@ data_avail
        return 0;
  }
  
 -static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
 -                                   int size, unsigned short port,
 -                                   const void *val, unsigned int count)
 +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 +                                  int size, unsigned short port, void *val,
 +                                  unsigned int count)
  {
 -      struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 +      return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
 +}
  
 +static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
 +                          unsigned short port, const void *val,
 +                          unsigned int count)
 +{
        memcpy(vcpu->arch.pio_data, val, size * count);
        trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
        return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
  }
  
 +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
 +                                   int size, unsigned short port,
 +                                   const void *val, unsigned int count)
 +{
 +      return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
 +}
 +
  static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
  {
        return kvm_x86_ops->get_segment_base(vcpu, seg);
@@@ -6241,15 -6267,13 +6241,15 @@@ static int emulator_intercept(struct x8
                              struct x86_instruction_info *info,
                              enum x86_intercept_stage stage)
  {
 -      return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
 +      return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage,
 +                                          &ctxt->exception);
  }
  
  static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
 -                      u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
 +                            u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
 +                            bool exact_only)
  {
 -      return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
 +      return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
  }
  
  static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
@@@ -6376,7 -6400,7 +6376,7 @@@ static void toggle_interruptibility(str
  
  static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
  {
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        if (ctxt->exception.vector == PF_VECTOR)
                return kvm_propagate_fault(vcpu, &ctxt->exception);
  
        return false;
  }
  
 +static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
 +{
 +      struct x86_emulate_ctxt *ctxt;
 +
 +      ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
 +      if (!ctxt) {
 +              pr_err("kvm: failed to allocate vcpu's emulator\n");
 +              return NULL;
 +      }
 +
 +      ctxt->vcpu = vcpu;
 +      ctxt->ops = &emulate_ops;
 +      vcpu->arch.emulate_ctxt = ctxt;
 +
 +      return ctxt;
 +}
 +
  static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
  {
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int cs_db, cs_l;
  
        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
  
 +      ctxt->gpa_available = false;
        ctxt->eflags = kvm_get_rflags(vcpu);
        ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
  
  
  void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
  {
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int ret;
  
        init_emulate_ctxt(vcpu);
@@@ -6488,11 -6494,10 +6488,11 @@@ static bool reexecute_instruction(struc
        gpa_t gpa = cr2_or_gpa;
        kvm_pfn_t pfn;
  
 -      if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
 +      if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
                return false;
  
 -      if (WARN_ON_ONCE(is_guest_mode(vcpu)))
 +      if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
 +          WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
                return false;
  
        if (!vcpu->arch.mmu->direct_map) {
@@@ -6580,11 -6585,10 +6580,11 @@@ static bool retry_instruction(struct x8
         */
        vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
  
 -      if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
 +      if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
                return false;
  
 -      if (WARN_ON_ONCE(is_guest_mode(vcpu)))
 +      if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
 +          WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
                return false;
  
        if (x86_page_table_writing_insn(ctxt))
@@@ -6747,7 -6751,7 +6747,7 @@@ int x86_emulate_instruction(struct kvm_
                            int emulation_type, void *insn, int insn_len)
  {
        int r;
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        bool writeback = true;
        bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
  
        }
  
  restart:
 -      /* Save the faulting GPA (cr2) in the address field */
 -      ctxt->exception.address = cr2_or_gpa;
 +      if (emulation_type & EMULTYPE_PF) {
 +              /* Save the faulting GPA (cr2) in the address field */
 +              ctxt->exception.address = cr2_or_gpa;
 +
 +              /* With shadow page tables, cr2 contains a GVA or nGPA. */
 +              if (vcpu->arch.mmu->direct_map) {
 +                      ctxt->gpa_available = true;
 +                      ctxt->gpa_val = cr2_or_gpa;
 +              }
 +      } else {
 +              /* Sanitize the address out of an abundance of paranoia. */
 +              ctxt->exception.address = 0;
 +      }
  
        r = x86_emulate_insn(ctxt);
  
@@@ -6950,8 -6943,8 +6950,8 @@@ static int kvm_fast_pio_out(struct kvm_
                            unsigned short port)
  {
        unsigned long val = kvm_rax_read(vcpu);
 -      int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
 -                                          size, port, &val, 1);
 +      int ret = emulator_pio_out(vcpu, size, port, &val, 1);
 +
        if (ret)
                return ret;
  
@@@ -6987,10 -6980,11 +6987,10 @@@ static int complete_fast_pio_in(struct 
        val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
  
        /*
 -       * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
 +       * Since vcpu->arch.pio.count == 1, let emulator_pio_in perform
         * the copy and tracing
         */
 -      emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
 -                               vcpu->arch.pio.port, &val, 1);
 +      emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
        kvm_rax_write(vcpu, val);
  
        return kvm_skip_emulated_instruction(vcpu);
@@@ -7005,7 -6999,8 +7005,7 @@@ static int kvm_fast_pio_in(struct kvm_v
        /* For size less than 4 we merge, else we zero extend */
        val = (size < 4) ? kvm_rax_read(vcpu) : 0;
  
 -      ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
 -                                     &val, 1);
 +      ret = emulator_pio_in(vcpu, size, port, &val, 1);
        if (ret) {
                kvm_rax_write(vcpu, val);
                return ret;
@@@ -7195,15 -7190,15 +7195,15 @@@ static void kvm_timer_init(void
  
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
  #ifdef CONFIG_CPU_FREQ
-               struct cpufreq_policy policy;
+               struct cpufreq_policy *policy;
                int cpu;
  
-               memset(&policy, 0, sizeof(policy));
                cpu = get_cpu();
-               cpufreq_get_policy(&policy, cpu);
-               if (policy.cpuinfo.max_freq)
-                       max_tsc_khz = policy.cpuinfo.max_freq;
+               policy = cpufreq_cpu_get(cpu);
+               if (policy && policy->cpuinfo.max_freq)
+                       max_tsc_khz = policy->cpuinfo.max_freq;
                put_cpu();
+               cpufreq_cpu_put(policy);
  #endif
                cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
                                          CPUFREQ_TRANSITION_NOTIFIER);
@@@ -7313,12 -7308,12 +7313,12 @@@ int kvm_arch_init(void *opaque
        }
  
        if (!ops->cpu_has_kvm_support()) {
-               printk(KERN_ERR "kvm: no hardware support\n");
+               pr_err_ratelimited("kvm: no hardware support\n");
                r = -EOPNOTSUPP;
                goto out;
        }
        if (ops->disabled_by_bios()) {
-               printk(KERN_ERR "kvm: disabled by bios\n");
+               pr_err_ratelimited("kvm: disabled by bios\n");
                r = -EOPNOTSUPP;
                goto out;
        }
                goto out;
        }
  
 +      x86_emulator_cache = kvm_alloc_emulator_cache();
 +      if (!x86_emulator_cache) {
 +              pr_err("kvm: failed to allocate cache for x86 emulator\n");
 +              goto out_free_x86_fpu_cache;
 +      }
 +
        shared_msrs = alloc_percpu(struct kvm_shared_msrs);
        if (!shared_msrs) {
                printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
 -              goto out_free_x86_fpu_cache;
 +              goto out_free_x86_emulator_cache;
        }
  
        r = kvm_mmu_module_init();
  
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
  
 -      if (boot_cpu_has(X86_FEATURE_XSAVE))
 +      if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 +              supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
 +      }
  
        kvm_lapic_init();
        if (pi_inject_timer == -1)
  
  out_free_percpu:
        free_percpu(shared_msrs);
 +out_free_x86_emulator_cache:
 +      kmem_cache_destroy(x86_emulator_cache);
  out_free_x86_fpu_cache:
        kmem_cache_destroy(x86_fpu_cache);
  out:
@@@ -7646,7 -7631,7 +7646,7 @@@ static void update_cr8_intercept(struc
        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
  }
  
 -static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 +static int inject_pending_event(struct kvm_vcpu *vcpu)
  {
        int r;
  
         * from L2 to L1.
         */
        if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
 -              r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
 +              r = kvm_x86_ops->check_nested_events(vcpu);
                if (r != 0)
                        return r;
        }
                 * KVM_REQ_EVENT only on certain events and not unconditionally?
                 */
                if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
 -                      r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
 +                      r = kvm_x86_ops->check_nested_events(vcpu);
                        if (r != 0)
                                return r;
                }
@@@ -8054,26 -8039,19 +8054,26 @@@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv
   */
  void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
  {
 +      unsigned long old, new, expected;
 +
        if (!kvm_x86_ops->check_apicv_inhibit_reasons ||
            !kvm_x86_ops->check_apicv_inhibit_reasons(bit))
                return;
  
 -      if (activate) {
 -              if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
 -                  !kvm_apicv_activated(kvm))
 -                      return;
 -      } else {
 -              if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
 -                  kvm_apicv_activated(kvm))
 -                      return;
 -      }
 +      old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
 +      do {
 +              expected = new = old;
 +              if (activate)
 +                      __clear_bit(bit, &new);
 +              else
 +                      __set_bit(bit, &new);
 +              if (new == old)
 +                      break;
 +              old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
 +      } while (old != expected);
 +
 +      if (!!old == !!new)
 +              return;
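
The '!!old == !!new' early return above means the expensive APICv toggle only happens when the inhibit mask changes between zero and non-zero; a tiny sketch of that check with hypothetical names:

static bool sketch_apicv_state_flipped(unsigned long old_mask, unsigned long new_mask)
{
	bool was_active = (old_mask == 0);	/* no inhibit reasons -> APICv active */
	bool now_active = (new_mask == 0);

	return was_active != now_active;	/* only a flip requires reprogramming */
}
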
  
        trace_kvm_apicv_update_request(activate, bit);
        if (kvm_x86_ops->pre_update_apicv_exec_ctrl)
@@@ -8198,8 -8176,8 +8198,8 @@@ static int vcpu_enter_guest(struct kvm_
                }
                if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
                        kvm_mmu_sync_roots(vcpu);
 -              if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
 -                      kvm_mmu_load_cr3(vcpu);
 +              if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
 +                      kvm_mmu_load_pgd(vcpu);
                if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
                        kvm_vcpu_flush_tlb(vcpu, true);
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
                        goto out;
                }
  
 -              if (inject_pending_event(vcpu, req_int_win) != 0)
 +              if (inject_pending_event(vcpu) != 0)
                        req_immediate_exit = true;
                else {
                        /* Enable SMI/NMI/IRQ window open exits if needed.
        if (vcpu->arch.apic_attention)
                kvm_lapic_sync_from_vapic(vcpu);
  
 -      vcpu->arch.gpa_available = false;
        r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath);
        return r;
  
@@@ -8505,6 -8484,7 +8505,6 @@@ static inline int vcpu_block(struct kv
                break;
        default:
                return -EINTR;
 -              break;
        }
        return 1;
  }
  static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
  {
        if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
 -              kvm_x86_ops->check_nested_events(vcpu, false);
 +              kvm_x86_ops->check_nested_events(vcpu);
  
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
@@@ -8773,7 -8753,7 +8773,7 @@@ static void __get_regs(struct kvm_vcpu 
                 * that usually, but some badly designed PV devices (vmware
                 * backdoor interface) need this to work
                 */
 -              emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
 +              emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
        }
        regs->rax = kvm_rax_read(vcpu);
@@@ -8959,7 -8939,7 +8959,7 @@@ out
  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
                    int reason, bool has_error_code, u32 error_code)
  {
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int ret;
  
        init_emulate_ctxt(vcpu);
@@@ -9291,6 -9271,7 +9291,6 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
        struct page *page;
        int r;
  
 -      vcpu->arch.emulate_ctxt.ops = &emulate_ops;
        if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        else
                                GFP_KERNEL_ACCOUNT))
                goto fail_free_mce_banks;
  
 +      if (!alloc_emulate_ctxt(vcpu))
 +              goto free_wbinvd_dirty_mask;
 +
        vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
                                                GFP_KERNEL_ACCOUNT);
        if (!vcpu->arch.user_fpu) {
                pr_err("kvm: failed to allocate userspace's fpu\n");
 -              goto free_wbinvd_dirty_mask;
 +              goto free_emulate_ctxt;
        }
  
        vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
@@@ -9377,8 -9355,6 +9377,8 @@@ free_guest_fpu
        kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
  free_user_fpu:
        kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
 +free_emulate_ctxt:
 +      kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
  free_wbinvd_dirty_mask:
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
  fail_free_mce_banks:
@@@ -9413,9 -9389,11 +9413,9 @@@ void kvm_arch_vcpu_postcreate(struct kv
  
        mutex_unlock(&vcpu->mutex);
  
 -      if (!kvmclock_periodic_sync)
 -              return;
 -
 -      schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 -                                      KVMCLOCK_SYNC_PERIOD);
 +      if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
 +              schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 +                                              KVMCLOCK_SYNC_PERIOD);
  }
  
  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
  
        kvm_x86_ops->vcpu_free(vcpu);
  
 +      kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
        kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
        kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
@@@ -9630,18 -9607,10 +9630,18 @@@ int kvm_arch_hardware_setup(void
  {
        int r;
  
 +      rdmsrl_safe(MSR_EFER, &host_efer);
 +
 +      if (boot_cpu_has(X86_FEATURE_XSAVES))
 +              rdmsrl(MSR_IA32_XSS, host_xss);
 +
        r = kvm_x86_ops->hardware_setup();
        if (r != 0)
                return r;
  
 +      if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
 +              supported_xss = 0;
 +
        cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
  
        if (kvm_has_tsc_control) {
                kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
        }
  
 -      if (boot_cpu_has(X86_FEATURE_XSAVES))
 -              rdmsrl(MSR_IA32_XSS, host_xss);
 -
        kvm_init_msr_list();
        return 0;
  }
@@@ -9705,13 -9677,6 +9705,13 @@@ void kvm_arch_sched_in(struct kvm_vcpu 
        kvm_x86_ops->sched_in(vcpu, cpu);
  }
  
 +void kvm_arch_free_vm(struct kvm *kvm)
 +{
 +      kfree(kvm->arch.hyperv.hv_pa_pg);
 +      vfree(kvm);
 +}
 +
  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
  {
        if (type)
@@@ -9794,9 -9759,9 +9794,9 @@@ void kvm_arch_sync_events(struct kvm *k
  int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
  {
        int i, r;
 -      unsigned long hva;
 +      unsigned long hva, uninitialized_var(old_npages);
        struct kvm_memslots *slots = kvm_memslots(kvm);
 -      struct kvm_memory_slot *slot, old;
 +      struct kvm_memory_slot *slot;
  
        /* Called with kvm->slots_lock held.  */
        if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
  
        slot = id_to_memslot(slots, id);
        if (size) {
 -              if (slot->npages)
 +              if (slot && slot->npages)
                        return -EEXIST;
  
                /*
                if (IS_ERR((void *)hva))
                        return PTR_ERR((void *)hva);
        } else {
 -              if (!slot->npages)
 +              if (!slot || !slot->npages)
                        return 0;
  
 -              hva = 0;
 +              /*
 +               * Stuff a non-canonical value to catch use-after-delete.  This
 +               * ends up being 0 on 32-bit KVM, but there's no better
 +               * alternative.
 +               */
 +              hva = (unsigned long)(0xdeadull << 48);
 +              old_npages = slot->npages;
        }
  
 -      old = *slot;
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                struct kvm_userspace_memory_region m;
  
        }
  
        if (!size)
 -              vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
 +              vm_munmap(hva, old_npages * PAGE_SIZE);
  
        return 0;
  }
@@@ -9881,36 -9841,34 +9881,36 @@@ void kvm_arch_destroy_vm(struct kvm *kv
        kvm_hv_destroy_vm(kvm);
  }
  
 -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 -                         struct kvm_memory_slot *dont)
 +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
  {
        int i;
  
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 -              if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
 -                      kvfree(free->arch.rmap[i]);
 -                      free->arch.rmap[i] = NULL;
 -              }
 +              kvfree(slot->arch.rmap[i]);
 +              slot->arch.rmap[i] = NULL;
 +
                if (i == 0)
                        continue;
  
 -              if (!dont || free->arch.lpage_info[i - 1] !=
 -                           dont->arch.lpage_info[i - 1]) {
 -                      kvfree(free->arch.lpage_info[i - 1]);
 -                      free->arch.lpage_info[i - 1] = NULL;
 -              }
 +              kvfree(slot->arch.lpage_info[i - 1]);
 +              slot->arch.lpage_info[i - 1] = NULL;
        }
  
 -      kvm_page_track_free_memslot(free, dont);
 +      kvm_page_track_free_memslot(slot);
  }
  
 -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 -                          unsigned long npages)
 +static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
 +                                    unsigned long npages)
  {
        int i;
  
 +      /*
 +       * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
 +       * old arrays will be freed by __kvm_set_memory_region() if installing
 +       * the new memslot is successful.
 +       */
 +      memset(&slot->arch, 0, sizeof(slot->arch));
 +
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                struct kvm_lpage_info *linfo;
                unsigned long ugfn;
                ugfn = slot->userspace_addr >> PAGE_SHIFT;
                /*
                 * If the gfn and userspace address are not aligned wrt each
 -               * other, or if explicitly asked to, disable large page
 -               * support for this slot
 +               * other, disable large page support for this slot.
                 */
 -              if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
 -                  !kvm_largepages_enabled()) {
 +              if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
                        unsigned long j;
  
                        for (j = 0; j < lpages; ++j)
@@@ -9990,9 -9950,6 +9990,9 @@@ int kvm_arch_prepare_memory_region(stru
                                const struct kvm_userspace_memory_region *mem,
                                enum kvm_mr_change change)
  {
 +      if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
 +              return kvm_alloc_memslot_metadata(memslot,
 +                                                mem->memory_size >> PAGE_SHIFT);
        return 0;
  }
  
@@@ -10001,7 -9958,7 +10001,7 @@@ static void kvm_mmu_slot_apply_flags(st
  {
        /* Still write protect RO slot */
        if (new->flags & KVM_MEM_READONLY) {
 -              kvm_mmu_slot_remove_write_access(kvm, new);
 +              kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL);
                return;
        }
  
         * See the comments in fast_page_fault().
         */
        if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
 -              if (kvm_x86_ops->slot_enable_log_dirty)
 +              if (kvm_x86_ops->slot_enable_log_dirty) {
                        kvm_x86_ops->slot_enable_log_dirty(kvm, new);
 -              else
 -                      kvm_mmu_slot_remove_write_access(kvm, new);
 +              } else {
 +                      int level =
 +                              kvm_dirty_log_manual_protect_and_init_set(kvm) ?
 +                              PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL;
 +
 +                      /*
 +                       * With the initially-all-set mode, we don't need
 +                       * to write-protect any small page because they
 +                       * are all reported as dirty already.  However,
 +                       * we still need to write-protect huge pages so
 +                       * that the split can happen lazily on the first
 +                       * write to the huge page.
 +                       */
 +                      kvm_mmu_slot_remove_write_access(kvm, new, level);
 +              }
        } else {
                if (kvm_x86_ops->slot_disable_log_dirty)
                        kvm_x86_ops->slot_disable_log_dirty(kvm, new);
  
  void kvm_arch_commit_memory_region(struct kvm *kvm,
                                const struct kvm_userspace_memory_region *mem,
 -                              const struct kvm_memory_slot *old,
 +                              struct kvm_memory_slot *old,
                                const struct kvm_memory_slot *new,
                                enum kvm_mr_change change)
  {
         */
        if (change != KVM_MR_DELETE)
                kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
 +
 +      /* Free the arrays associated with the old memslot. */
 +      if (change == KVM_MR_MOVE)
 +              kvm_arch_free_memslot(kvm, old);
  }
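
For context on the initially-all-set behaviour described above, userspace opts in per VM through KVM_ENABLE_CAP; a hedged sketch of that call, assuming the 5.7-era uapi names and an already-created VM descriptor (error handling kept minimal):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_initially_set_dirty_log(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
		.args[0] = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
			   KVM_DIRTY_LOG_INITIALLY_SET,
	};

	/* Dirty bitmaps then start as all ones; small pages stay writable until
	 * userspace clears bits with KVM_CLEAR_DIRTY_LOG, while huge pages are
	 * still write-protected up front so they can be split lazily. */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0) {
		perror("KVM_ENABLE_CAP");
		return -1;
	}
	return 0;
}
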
  
  void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@@ -10251,7 -10191,7 +10251,7 @@@ void kvm_arch_async_page_ready(struct k
                return;
  
        if (!vcpu->arch.mmu->direct_map &&
 -            work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
 +            work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
                return;
  
        kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
@@@ -10574,5 -10514,4 +10574,5 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_fu
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
 +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
diff --combined include/linux/kvm_host.h
@@@ -360,10 -360,6 +360,10 @@@ static inline unsigned long *kvm_second
        return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap);
  }
  
 +#ifndef KVM_DIRTY_LOG_MANUAL_CAPS
 +#define KVM_DIRTY_LOG_MANUAL_CAPS KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE
 +#endif
 +
  struct kvm_s390_adapter_int {
        u64 ind_addr;
        u64 summary_addr;
@@@ -435,11 -431,11 +435,11 @@@ static inline int kvm_arch_vcpu_memslot
   */
  struct kvm_memslots {
        u64 generation;
 -      struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM];
        /* The mapping table from slot id to the index in memslots[]. */
        short id_to_index[KVM_MEM_SLOTS_NUM];
        atomic_t lru_slot;
        int used_slots;
 +      struct kvm_memory_slot memslots[];
  };
  
  struct kvm {
  #endif
        long tlbs_dirty;
        struct list_head devices;
 -      bool manual_dirty_log_protect;
 +      u64 manual_dirty_log_protect;
        struct dentry *debugfs_dentry;
        struct kvm_stat_data **debugfs_stat_data;
        struct srcu_struct srcu;
  #define vcpu_err(vcpu, fmt, ...)                                      \
        kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
  
 +static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm)
 +{
 +      return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET);
 +}
 +
  static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx)
  {
        return srcu_dereference_check(kvm->buses[idx], &kvm->srcu,
@@@ -581,11 -572,10 +581,11 @@@ static inline int kvm_vcpu_get_idx(stru
        return vcpu->vcpu_idx;
  }
  
 -#define kvm_for_each_memslot(memslot, slots)  \
 -      for (memslot = &slots->memslots[0];     \
 -            memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\
 -              memslot++)
 +#define kvm_for_each_memslot(memslot, slots)                          \
 +      for (memslot = &slots->memslots[0];                             \
 +           memslot < slots->memslots + slots->used_slots; memslot++)  \
 +              if (WARN_ON_ONCE(!memslot->npages)) {                   \
 +              } else
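
The dangling 'else' in the rewritten iterator above keeps a single-statement body binding correctly while skipping (and warning about) an unexpectedly empty slot; a usage sketch and its rough expansion, assuming a struct kvm_memslots *slots in scope and a hypothetical mark_slot_seen() helper:

struct kvm_memory_slot *memslot;

kvm_for_each_memslot(memslot, slots)
	mark_slot_seen(memslot);

/* ... expands roughly to: */
for (memslot = &slots->memslots[0];
     memslot < slots->memslots + slots->used_slots; memslot++)
	if (WARN_ON_ONCE(!memslot->npages)) {
		/* empty slot inside the used range: warn once and skip it */
	} else
		mark_slot_seen(memslot);
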
  
  void kvm_vcpu_destroy(struct kvm_vcpu *vcpu);
  
@@@ -645,15 -635,12 +645,15 @@@ static inline struct kvm_memslots *kvm_
        return __kvm_memslots(vcpu->kvm, as_id);
  }
  
 -static inline struct kvm_memory_slot *
 -id_to_memslot(struct kvm_memslots *slots, int id)
 +static inline
 +struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id)
  {
        int index = slots->id_to_index[id];
        struct kvm_memory_slot *slot;
  
 +      if (index < 0)
 +              return NULL;
 +
        slot = &slots->memslots[index];
  
        WARN_ON(slot->id != id);
@@@ -682,7 -669,10 +682,7 @@@ int kvm_set_memory_region(struct kvm *k
                          const struct kvm_userspace_memory_region *mem);
  int __kvm_set_memory_region(struct kvm *kvm,
                            const struct kvm_userspace_memory_region *mem);
 -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 -                         struct kvm_memory_slot *dont);
 -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 -                          unsigned long npages);
 +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
  void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
                                enum kvm_mr_change change);
  void kvm_arch_commit_memory_region(struct kvm *kvm,
                                const struct kvm_userspace_memory_region *mem,
 -                              const struct kvm_memory_slot *old,
 +                              struct kvm_memory_slot *old,
                                const struct kvm_memory_slot *new,
                                enum kvm_mr_change change);
 -bool kvm_largepages_enabled(void);
 -void kvm_disable_largepages(void);
  /* flush all memory translations */
  void kvm_arch_flush_shadow_all(struct kvm *kvm);
  /* flush memory translations pointing to 'slot' */
@@@ -712,6 -704,7 +712,6 @@@ void kvm_release_page_clean(struct pag
  void kvm_release_page_dirty(struct page *page);
  void kvm_set_page_accessed(struct page *page);
  
 -kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
  kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
  kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
                      bool *writable);
@@@ -826,20 -819,23 +826,20 @@@ vm_fault_t kvm_arch_vcpu_fault(struct k
  
  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext);
  
 -int kvm_get_dirty_log(struct kvm *kvm,
 -                      struct kvm_dirty_log *log, int *is_dirty);
 -
 -int kvm_get_dirty_log_protect(struct kvm *kvm,
 -                            struct kvm_dirty_log *log, bool *flush);
 -int kvm_clear_dirty_log_protect(struct kvm *kvm,
 -                              struct kvm_clear_dirty_log *log, bool *flush);
 -
  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                                        struct kvm_memory_slot *slot,
                                        gfn_t gfn_offset,
                                        unsigned long mask);
 -
 -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 -                              struct kvm_dirty_log *log);
 -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
 -                                struct kvm_clear_dirty_log *log);
 +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);
 +
 +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
 +                                      struct kvm_memory_slot *memslot);
 +#else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);
 +int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
 +                    int *is_dirty, struct kvm_memory_slot **memslot);
 +#endif
  
  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
                        bool line_status);
@@@ -1022,8 -1018,6 +1022,8 @@@ bool kvm_arch_irqfd_allowed(struct kvm 
   * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
   * gfn_to_memslot() itself isn't here as an inline because that would
   * bloat other code too much.
 + *
 + * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!
   */
  static inline struct kvm_memory_slot *
  search_memslots(struct kvm_memslots *slots, gfn_t gfn)
        int slot = atomic_read(&slots->lru_slot);
        struct kvm_memory_slot *memslots = slots->memslots;
  
 +      if (unlikely(!slots->used_slots))
 +              return NULL;
 +
        if (gfn >= memslots[slot].base_gfn &&
            gfn < memslots[slot].base_gfn + memslots[slot].npages)
                return &memslots[slot];
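
Given the guarantee documented above that slots are sorted by descending base GFN, the lookup that follows the LRU fast path can binary-search that ordering; a hedged sketch of such a search (the in-tree helper differs in detail):

static inline struct kvm_memory_slot *
sketch_search_sorted_memslots(struct kvm_memslots *slots, gfn_t gfn)
{
	struct kvm_memory_slot *memslots = slots->memslots;
	int start = 0, end = slots->used_slots;

	while (start < end) {
		int mid = start + (end - start) / 2;

		if (gfn >= memslots[mid].base_gfn)
			end = mid;		/* containing slot is at mid or earlier */
		else
			start = mid + 1;
	}

	if (start < slots->used_slots &&
	    gfn >= memslots[start].base_gfn &&
	    gfn < memslots[start].base_gfn + memslots[start].npages)
		return &memslots[start];

	return NULL;
}
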
@@@ -1353,7 -1344,7 +1353,7 @@@ static inline void kvm_vcpu_set_dy_elig
  #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
  
  struct kvm_vcpu *kvm_get_running_vcpu(void);
- struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
+ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
  
  #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
  bool kvm_arch_has_irq_bypass(void);
diff --combined virt/kvm/arm/arm.c
@@@ -625,6 -625,14 +625,14 @@@ static void check_vcpu_requests(struct 
  
                if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
                        kvm_update_stolen_time(vcpu);
+               if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
+                       /* The distributor enable bits were changed */
+                       preempt_disable();
+                       vgic_v4_put(vcpu, false);
+                       vgic_v4_load(vcpu);
+                       preempt_enable();
+               }
        }
  }
  
@@@ -742,9 -750,7 +750,7 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
                guest_enter_irqoff();
  
                if (has_vhe()) {
-                       kvm_arm_vhe_guest_enter();
                        ret = kvm_vcpu_run_vhe(vcpu);
-                       kvm_arm_vhe_guest_exit();
                } else {
                        ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu);
                }
@@@ -1183,15 -1189,55 +1189,15 @@@ long kvm_arch_vcpu_ioctl(struct file *f
        return r;
  }
  
 -/**
 - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 - * @kvm: kvm instance
 - * @log: slot id and address to which we copy the log
 - *
 - * Steps 1-4 below provide general overview of dirty page logging. See
 - * kvm_get_dirty_log_protect() function description for additional details.
 - *
 - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
 - * always flush the TLB (step 4) even if previous step failed  and the dirty
 - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
 - * does not preclude user space subsequent dirty log read. Flushing TLB ensures
 - * writes will be marked dirty for next log read.
 - *
 - *   1. Take a snapshot of the bit and clear it if needed.
 - *   2. Write protect the corresponding page.
 - *   3. Copy the snapshot to the userspace.
 - *   4. Flush TLB's if needed.
 - */
 -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
  {
 -      bool flush = false;
 -      int r;
 -
 -      mutex_lock(&kvm->slots_lock);
 -
 -      r = kvm_get_dirty_log_protect(kvm, log, &flush);
  
 -      if (flush)
 -              kvm_flush_remote_tlbs(kvm);
 -
 -      mutex_unlock(&kvm->slots_lock);
 -      return r;
  }
  
 -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
 +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
 +                                      struct kvm_memory_slot *memslot)
  {
 -      bool flush = false;
 -      int r;
 -
 -      mutex_lock(&kvm->slots_lock);
 -
 -      r = kvm_clear_dirty_log_protect(kvm, log, &flush);
 -
 -      if (flush)
 -              kvm_flush_remote_tlbs(kvm);
 -
 -      mutex_unlock(&kvm->slots_lock);
 -      return r;
 +      kvm_flush_remote_tlbs(kvm);
  }
  
  static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,