Merge tag 'kvmarm-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm...
author Paolo Bonzini <pbonzini@redhat.com>
Tue, 31 Mar 2020 14:44:53 +0000 (10:44 -0400)
committer Paolo Bonzini <pbonzini@redhat.com>
Tue, 31 Mar 2020 14:44:53 +0000 (10:44 -0400)
KVM/arm updates for Linux 5.7

- GICv4.1 support
- 32bit host removal

Documentation/admin-guide/kernel-parameters.txt
MAINTAINERS
arch/arm64/kvm/hyp/switch.c
arch/s390/boot/Makefile
arch/s390/include/asm/page.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
include/linux/kvm_host.h
virt/kvm/arm/arm.c

                        dynamic table installation which will install SSDT
                        tables to /sys/firmware/acpi/tables/dynamic.
  
+       acpi_no_watchdog        [HW,ACPI,WDT]
+                       Ignore the ACPI-based watchdog interface (WDAT) and let
+                       a native driver control the watchdog device instead.
        acpi_rsdp=      [ACPI,EFI,KEXEC]
                        Pass the RSDP address to the kernel, mostly used
                        on machines running EFI runtime service to boot the
                        second kernel for kdump.
  
                        before loading.
                        See Documentation/admin-guide/blockdev/ramdisk.rst.
  
 +      prot_virt=      [S390] enable hosting protected virtual machines
 +                      isolated from the hypervisor (if hardware supports
 +                      that).
 +                      Format: <bool>
 +
        psi=            [KNL] Enable or disable pressure stall information
                        tracking.
                        Format: <bool>
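
For illustration only (not part of the patch): since prot_virt= takes a <bool>,
hosting protected guests would typically be requested by booting the s390 host
with the parameter appended to the kernel command line, e.g.

        prot_virt=1
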
diff --combined MAINTAINERS
@@@ -3649,6 -3649,7 +3649,7 @@@ F:      sound/pci/oxygen
  
  C-SKY ARCHITECTURE
  M:    Guo Ren <guoren@kernel.org>
+ L:    linux-csky@vger.kernel.org
  T:    git https://github.com/c-sky/csky-linux.git
  S:    Supported
  F:    arch/csky/
@@@ -3909,7 -3910,7 +3910,7 @@@ S:      Supporte
  F:    Documentation/filesystems/ceph.txt
  F:    fs/ceph/
  
- CERTIFICATE HANDLING:
+ CERTIFICATE HANDLING
  M:    David Howells <dhowells@redhat.com>
  M:    David Woodhouse <dwmw2@infradead.org>
  L:    keyrings@vger.kernel.org
@@@ -3919,7 -3920,7 +3920,7 @@@ F:      certs
  F:    scripts/sign-file.c
  F:    scripts/extract-cert.c
  
- CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
+ CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM
  L:    devel@driverdev.osuosl.org
  S:    Obsolete
  F:    drivers/staging/wusbcore/
@@@ -5932,12 -5933,12 +5933,12 @@@ S:   Maintaine
  F:    drivers/media/dvb-frontends/ec100*
  
  ECRYPT FILE SYSTEM
- M:    Tyler Hicks <tyhicks@canonical.com>
+ M:    Tyler Hicks <code@tyhicks.com>
  L:    ecryptfs@vger.kernel.org
  W:    http://ecryptfs.org
  W:    https://launchpad.net/ecryptfs
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tyhicks/ecryptfs.git
- S:    Supported
+ S:    Odd Fixes
  F:    Documentation/filesystems/ecryptfs.txt
  F:    fs/ecryptfs/
  
@@@ -7047,7 -7048,7 +7048,7 @@@ L:      kvm@vger.kernel.or
  S:    Supported
  F:    drivers/uio/uio_pci_generic.c
  
- GENERIC VDSO LIBRARY:
+ GENERIC VDSO LIBRARY
  M:    Andy Lutomirski <luto@kernel.org>
  M:    Thomas Gleixner <tglx@linutronix.de>
  M:    Vincenzo Frascino <vincenzo.frascino@arm.com>
@@@ -8392,7 -8393,7 +8393,7 @@@ M:      Joonas Lahtinen <joonas.lahtinen@lin
  M:    Rodrigo Vivi <rodrigo.vivi@intel.com>
  L:    intel-gfx@lists.freedesktop.org
  W:    https://01.org/linuxgraphics/
- B:    https://01.org/linuxgraphics/documentation/how-report-bugs
+ B:    https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs
  C:    irc://chat.freenode.net/intel-gfx
  Q:    http://patchwork.freedesktop.org/project/intel-gfx/
  T:    git git://anongit.freedesktop.org/drm-intel
@@@ -9163,7 -9164,7 +9164,7 @@@ F:      virt/kvm/
  F:    tools/kvm/
  F:    tools/testing/selftests/kvm/
  
- KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64)
+ KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
  M:    Marc Zyngier <maz@kernel.org>
  R:    James Morse <james.morse@arm.com>
  R:    Julien Thierry <julien.thierry.kdev@gmail.com>
@@@ -9172,9 -9173,6 +9173,6 @@@ L:      linux-arm-kernel@lists.infradead.or
  L:    kvmarm@lists.cs.columbia.edu
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
  S:    Maintained
- F:    arch/arm/include/uapi/asm/kvm*
- F:    arch/arm/include/asm/kvm*
- F:    arch/arm/kvm/
  F:    arch/arm64/include/uapi/asm/kvm*
  F:    arch/arm64/include/asm/kvm*
  F:    arch/arm64/kvm/
@@@ -9209,7 -9207,6 +9207,7 @@@ L:      kvm@vger.kernel.or
  W:    http://www.ibm.com/developerworks/linux/linux390/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git
  S:    Supported
 +F:    Documentation/virt/kvm/s390*
  F:    arch/s390/include/uapi/asm/kvm*
  F:    arch/s390/include/asm/gmap.h
  F:    arch/s390/include/asm/kvm*
@@@ -9279,7 -9276,7 +9277,7 @@@ F:      include/keys/trusted-type.
  F:    security/keys/trusted.c
  F:    include/keys/trusted.h
  
- KEYS/KEYRINGS:
+ KEYS/KEYRINGS
  M:    David Howells <dhowells@redhat.com>
  M:    Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
  L:    keyrings@vger.kernel.org
@@@ -11115,14 -11112,12 +11113,12 @@@ S:        Maintaine
  F:    drivers/usb/image/microtek.*
  
  MIPS
- M:    Ralf Baechle <ralf@linux-mips.org>
- M:    Paul Burton <paulburton@kernel.org>
+ M:    Thomas Bogendoerfer <tsbogend@alpha.franken.de>
  L:    linux-mips@vger.kernel.org
  W:    http://www.linux-mips.org/
- T:    git git://git.linux-mips.org/pub/scm/ralf/linux.git
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git
  Q:    http://patchwork.linux-mips.org/project/linux-mips/list/
- S:    Supported
+ S:    Maintained
  F:    Documentation/devicetree/bindings/mips/
  F:    Documentation/mips/
  F:    arch/mips/
@@@ -11485,7 -11480,7 +11481,7 @@@ F:   drivers/scsi/mac_scsi.
  F:    drivers/scsi/sun3_scsi.*
  F:    drivers/scsi/sun3_scsi_vme.c
  
- NCSI LIBRARY:
+ NCSI LIBRARY
  M:    Samuel Mendoza-Jonas <sam@mendozajonas.com>
  S:    Maintained
  F:    net/ncsi/
@@@ -12741,7 -12736,7 +12737,7 @@@ M:   Tom Joseph <tjoseph@cadence.com
  L:    linux-pci@vger.kernel.org
  S:    Maintained
  F:    Documentation/devicetree/bindings/pci/cdns,*.txt
- F:    drivers/pci/controller/pcie-cadence*
+ F:    drivers/pci/controller/cadence/
  
  PCI DRIVER FOR FREESCALE LAYERSCAPE
  M:    Minghuan Lian <minghuan.Lian@nxp.com>
@@@ -13513,7 -13508,7 +13509,7 @@@ L:   linuxppc-dev@lists.ozlabs.or
  S:    Maintained
  F:    drivers/block/ps3vram.c
  
- PSAMPLE PACKET SAMPLING SUPPORT:
+ PSAMPLE PACKET SAMPLING SUPPORT
  M:    Yotam Gigi <yotam.gi@gmail.com>
  S:    Maintained
  F:    net/psample
@@@ -14583,10 -14578,10 +14579,10 @@@ F:        drivers/media/pci/saa7146
  F:    include/media/drv-intf/saa7146*
  
  SAFESETID SECURITY MODULE
- M:     Micah Morton <mortonm@chromium.org>
- S:     Supported
- F:     security/safesetid/
- F:     Documentation/admin-guide/LSM/SafeSetID.rst
+ M:    Micah Morton <mortonm@chromium.org>
+ S:    Supported
+ F:    security/safesetid/
+ F:    Documentation/admin-guide/LSM/SafeSetID.rst
  
  SAMSUNG AUDIO (ASoC) DRIVERS
  M:    Krzysztof Kozlowski <krzk@kernel.org>
@@@ -16553,8 -16548,8 +16549,8 @@@ M:   Michael Jamet <michael.jamet@intel.c
  M:    Mika Westerberg <mika.westerberg@linux.intel.com>
  M:    Yehezkel Bernat <YehezkelShB@gmail.com>
  L:    linux-usb@vger.kernel.org
- T:    git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
  S:    Maintained
+ T:    git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
  F:    Documentation/admin-guide/thunderbolt.rst
  F:    drivers/thunderbolt/
  F:    include/linux/thunderbolt.h
@@@ -17081,7 -17076,7 +17077,7 @@@ S:   Maintaine
  F:    Documentation/admin-guide/ufs.rst
  F:    fs/ufs/
  
- UHID USERSPACE HID IO DRIVER:
+ UHID USERSPACE HID IO DRIVER
  M:    David Herrmann <dh.herrmann@googlemail.com>
  L:    linux-input@vger.kernel.org
  S:    Maintained
@@@ -17095,18 -17090,18 +17091,18 @@@ S:        Maintaine
  F:    drivers/usb/common/ulpi.c
  F:    include/linux/ulpi/
  
- ULTRA-WIDEBAND (UWB) SUBSYSTEM:
+ ULTRA-WIDEBAND (UWB) SUBSYSTEM
  L:    devel@driverdev.osuosl.org
  S:    Obsolete
  F:    drivers/staging/uwb/
  
- UNICODE SUBSYSTEM:
+ UNICODE SUBSYSTEM
  M:    Gabriel Krisman Bertazi <krisman@collabora.com>
  L:    linux-fsdevel@vger.kernel.org
  S:    Supported
  F:    fs/unicode/
  
- UNICORE32 ARCHITECTURE:
+ UNICORE32 ARCHITECTURE
  M:    Guan Xuetao <gxt@pku.edu.cn>
  W:    http://mprc.pku.edu.cn/~guanxuetao/linux
  S:    Maintained
@@@ -17393,11 -17388,14 +17389,14 @@@ F:        drivers/usb
  F:    include/linux/usb.h
  F:    include/linux/usb/
  
- USB TYPEC PI3USB30532 MUX DRIVER
- M:    Hans de Goede <hdegoede@redhat.com>
+ USB TYPEC BUS FOR ALTERNATE MODES
+ M:    Heikki Krogerus <heikki.krogerus@linux.intel.com>
  L:    linux-usb@vger.kernel.org
  S:    Maintained
- F:    drivers/usb/typec/mux/pi3usb30532.c
+ F:    Documentation/ABI/testing/sysfs-bus-typec
+ F:    Documentation/driver-api/usb/typec_bus.rst
+ F:    drivers/usb/typec/altmodes/
+ F:    include/linux/usb/typec_altmode.h
  
  USB TYPEC CLASS
  M:    Heikki Krogerus <heikki.krogerus@linux.intel.com>
@@@ -17408,14 -17406,11 +17407,11 @@@ F:        Documentation/driver-api/usb/typec.r
  F:    drivers/usb/typec/
  F:    include/linux/usb/typec.h
  
- USB TYPEC BUS FOR ALTERNATE MODES
- M:    Heikki Krogerus <heikki.krogerus@linux.intel.com>
+ USB TYPEC PI3USB30532 MUX DRIVER
+ M:    Hans de Goede <hdegoede@redhat.com>
  L:    linux-usb@vger.kernel.org
  S:    Maintained
- F:    Documentation/ABI/testing/sysfs-bus-typec
- F:    Documentation/driver-api/usb/typec_bus.rst
- F:    drivers/usb/typec/altmodes/
- F:    include/linux/usb/typec_altmode.h
+ F:    drivers/usb/typec/mux/pi3usb30532.c
  
  USB TYPEC PORT CONTROLLER DRIVERS
  M:    Guenter Roeck <linux@roeck-us.net>
@@@ -17792,7 -17787,7 +17788,7 @@@ F:   include/linux/vbox_utils.
  F:    include/uapi/linux/vbox*.h
  F:    drivers/virt/vboxguest/
  
- VIRTUAL BOX SHARED FOLDER VFS DRIVER:
+ VIRTUAL BOX SHARED FOLDER VFS DRIVER
  M:    Hans de Goede <hdegoede@redhat.com>
  L:    linux-fsdevel@vger.kernel.org
  S:    Maintained
diff --combined arch/arm64/kvm/hyp/switch.c
@@@ -17,6 -17,7 +17,6 @@@
  #include <asm/kprobes.h>
  #include <asm/kvm_asm.h>
  #include <asm/kvm_emulate.h>
 -#include <asm/kvm_host.h>
  #include <asm/kvm_hyp.h>
  #include <asm/kvm_mmu.h>
  #include <asm/fpsimd.h>
@@@ -624,7 -625,7 +624,7 @@@ static void __hyp_text __pmu_switch_to_
  }
  
  /* Switch to the guest for VHE systems running in EL2 */
- int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
+ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
  {
        struct kvm_cpu_context *host_ctxt;
        struct kvm_cpu_context *guest_ctxt;
  
        return exit_code;
  }
- NOKPROBE_SYMBOL(kvm_vcpu_run_vhe);
+ NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe);
+ int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
+ {
+       int ret;
+       local_daif_mask();
+       /*
+        * Having IRQs masked via PMR when entering the guest means the GIC
+        * will not signal the CPU of interrupts of lower priority, and the
+        * only way to get out will be via guest exceptions.
+        * Naturally, we want to avoid this.
+        *
+        * local_daif_mask() already sets GIC_PRIO_PSR_I_SET, we just need a
+        * dsb to ensure the redistributor forwards EL2 IRQs to the CPU.
+        */
+       pmr_sync();
+       ret = __kvm_vcpu_run_vhe(vcpu);
+       /*
+        * local_daif_restore() takes care to properly restore PSTATE.DAIF
+        * and the GIC PMR if the host is using IRQ priorities.
+        */
+       local_daif_restore(DAIF_PROCCTX_NOIRQ);
+       /*
+        * When we exit from the guest we change a number of CPU configuration
+        * parameters, such as traps.  Make sure these changes take effect
+        * before running the host or additional guests.
+        */
+       isb();
+       return ret;
+ }
  
  /* Switch to the guest for legacy non-VHE systems */
  int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
diff --combined arch/s390/boot/Makefile
@@@ -37,7 -37,7 +37,7 @@@ CFLAGS_sclp_early_core.o += -I$(srctree
  obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o
  obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
  obj-y += version.o pgm_check_info.o ctype.o text_dma.o
 -obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST)  += uv.o
 +obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
  obj-$(CONFIG_RELOCATABLE)     += machine_kexec_reloc.o
  obj-$(CONFIG_RANDOMIZE_BASE)  += kaslr.o
  targets       := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
@@@ -70,7 -70,7 +70,7 @@@ $(obj)/compressed/vmlinux: $(obj)/start
  $(obj)/startup.a: $(OBJECTS) FORCE
        $(call if_changed,ar)
  
- install: $(CONFIGURE) $(obj)/bzImage
+ install:
        sh -x  $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
              System.map "$(INSTALL_PATH)"
  
diff --combined arch/s390/include/asm/page.h
@@@ -42,7 -42,7 +42,7 @@@ void __storage_key_init_range(unsigned 
  
  static inline void storage_key_init_range(unsigned long start, unsigned long end)
  {
-       if (PAGE_DEFAULT_KEY)
+       if (PAGE_DEFAULT_KEY != 0)
                __storage_key_init_range(start, end);
  }
  
@@@ -153,11 -153,6 +153,11 @@@ static inline int devmem_is_allowed(uns
  #define HAVE_ARCH_FREE_PAGE
  #define HAVE_ARCH_ALLOC_PAGE
  
 +#if IS_ENABLED(CONFIG_PGSTE)
 +int arch_make_page_accessible(struct page *page);
 +#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
 +#endif
 +
  #endif /* !__ASSEMBLY__ */
  
  #define __PAGE_OFFSET         0x0UL
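
For context on the HAVE_ARCH_MAKE_PAGE_ACCESSIBLE define added above: this is the
usual opt-in pattern where generic code falls back to a no-op when an architecture
does not provide the hook. A minimal sketch of such a fallback (illustrative only;
the generic side is not part of this excerpt):

        #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
        static inline int arch_make_page_accessible(struct page *page)
        {
                return 0;       /* nothing to do without the arch hook */
        }
        #endif
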
diff --combined arch/x86/kvm/svm.c
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
  
+ #ifdef MODULE
  static const struct x86_cpu_id svm_cpu_id[] = {
        X86_FEATURE_MATCH(X86_FEATURE_SVM),
        {}
  };
  MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+ #endif
  
  #define IOPM_ALLOC_ORDER 2
  #define MSRPM_ALLOC_ORDER 1
@@@ -519,31 -521,10 +521,31 @@@ static void recalc_intercepts(struct vc
        h = &svm->nested.hsave->control;
        g = &svm->nested;
  
 -      c->intercept_cr = h->intercept_cr | g->intercept_cr;
 -      c->intercept_dr = h->intercept_dr | g->intercept_dr;
 -      c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 -      c->intercept = h->intercept | g->intercept;
 +      c->intercept_cr = h->intercept_cr;
 +      c->intercept_dr = h->intercept_dr;
 +      c->intercept_exceptions = h->intercept_exceptions;
 +      c->intercept = h->intercept;
 +
 +      if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
 +              /* We only want the cr8 intercept bits of L1 */
 +              c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
 +              c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
 +
 +              /*
 +               * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
 +               * affect any interrupt we may want to inject; therefore,
 +               * interrupt window vmexits are irrelevant to L0.
 +               */
 +              c->intercept &= ~(1ULL << INTERCEPT_VINTR);
 +      }
 +
 +      /* We don't want to see VMMCALLs from a nested guest */
 +      c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);
 +
 +      c->intercept_cr |= g->intercept_cr;
 +      c->intercept_dr |= g->intercept_dr;
 +      c->intercept_exceptions |= g->intercept_exceptions;
 +      c->intercept |= g->intercept;
  }
  
  static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
@@@ -648,11 -629,6 +650,11 @@@ static inline void clr_intercept(struc
        recalc_intercepts(svm);
  }
  
 +static inline bool is_intercept(struct vcpu_svm *svm, int bit)
 +{
 +      return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
 +}
 +
  static inline bool vgif_enabled(struct vcpu_svm *svm)
  {
        return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
@@@ -1232,7 -1208,6 +1234,7 @@@ static int avic_ga_log_notifier(u32 ga_
        u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
  
        pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
 +      trace_kvm_avic_ga_log(vm_id, vcpu_id);
  
        spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
        hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
@@@ -1394,29 -1369,6 +1396,29 @@@ static void svm_hardware_teardown(void
        iopm_base = 0;
  }
  
 +static __init void svm_set_cpu_caps(void)
 +{
 +      kvm_set_cpu_caps();
 +
 +      supported_xss = 0;
 +
 +      /* CPUID 0x80000001 and 0x8000000A (SVM features) */
 +      if (nested) {
 +              kvm_cpu_cap_set(X86_FEATURE_SVM);
 +
 +              if (nrips)
 +                      kvm_cpu_cap_set(X86_FEATURE_NRIPS);
 +
 +              if (npt_enabled)
 +                      kvm_cpu_cap_set(X86_FEATURE_NPT);
 +      }
 +
 +      /* CPUID 0x80000008 */
 +      if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
 +          boot_cpu_has(X86_FEATURE_AMD_SSBD))
 +              kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
 +}
 +
  static __init int svm_hardware_setup(void)
  {
        int cpu;
  
        init_msrpm_offsets();
  
 +      supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
 +
        if (boot_cpu_has(X86_FEATURE_NX))
                kvm_enable_efer_bits(EFER_NX);
  
        if (!boot_cpu_has(X86_FEATURE_NPT))
                npt_enabled = false;
  
 -      if (npt_enabled && !npt) {
 -              printk(KERN_INFO "kvm: Nested Paging disabled\n");
 +      if (npt_enabled && !npt)
                npt_enabled = false;
 -      }
  
 -      if (npt_enabled) {
 -              printk(KERN_INFO "kvm: Nested Paging enabled\n");
 -              kvm_enable_tdp();
 -      } else
 -              kvm_disable_tdp();
 +      kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL);
 +      pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
  
        if (nrips) {
                if (!boot_cpu_has(X86_FEATURE_NRIPS))
                        pr_info("Virtual GIF supported\n");
        }
  
 +      svm_set_cpu_caps();
 +
        return 0;
  
  err:
@@@ -1993,6 -1946,19 +1995,6 @@@ static void __unregister_enc_region_loc
        kfree(region);
  }
  
 -static struct kvm *svm_vm_alloc(void)
 -{
 -      struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm),
 -                                          GFP_KERNEL_ACCOUNT | __GFP_ZERO,
 -                                          PAGE_KERNEL);
 -      return &kvm_svm->kvm;
 -}
 -
 -static void svm_vm_free(struct kvm *kvm)
 -{
 -      vfree(to_kvm_svm(kvm));
 -}
 -
  static void sev_vm_destroy(struct kvm *kvm)
  {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@@ -2220,7 -2186,7 +2222,7 @@@ static void svm_vcpu_reset(struct kvm_v
        }
        init_vmcb(svm);
  
 -      kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
 +      kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
        kvm_rdx_write(vcpu, eax);
  
        if (kvm_vcpu_apicv_active(vcpu) && !init_event)
  static int avic_init_vcpu(struct vcpu_svm *svm)
  {
        int ret;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
  
-       if (!kvm_vcpu_apicv_active(&svm->vcpu))
+       if (!avic || !irqchip_in_kernel(vcpu->kvm))
                return 0;
  
        ret = avic_init_backing_page(&svm->vcpu);
@@@ -2453,38 -2420,14 +2456,38 @@@ static void svm_cache_reg(struct kvm_vc
        }
  }
  
 +static inline void svm_enable_vintr(struct vcpu_svm *svm)
 +{
 +      struct vmcb_control_area *control;
 +
 +      /* The following fields are ignored when AVIC is enabled */
 +      WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
 +
 +      /*
 +       * This is just a dummy VINTR to actually cause a vmexit to happen.
 +       * Actual injection of virtual interrupts happens through EVENTINJ.
 +       */
 +      control = &svm->vmcb->control;
 +      control->int_vector = 0x0;
 +      control->int_ctl &= ~V_INTR_PRIO_MASK;
 +      control->int_ctl |= V_IRQ_MASK |
 +              ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
 +      mark_dirty(svm->vmcb, VMCB_INTR);
 +}
 +
  static void svm_set_vintr(struct vcpu_svm *svm)
  {
        set_intercept(svm, INTERCEPT_VINTR);
 +      if (is_intercept(svm, INTERCEPT_VINTR))
 +              svm_enable_vintr(svm);
  }
  
  static void svm_clear_vintr(struct vcpu_svm *svm)
  {
        clr_intercept(svm, INTERCEPT_VINTR);
 +
 +      svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 +      mark_dirty(svm->vmcb, VMCB_INTR);
  }
  
  static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@@ -3040,6 -2983,15 +3043,6 @@@ static u64 nested_svm_get_tdp_pdptr(str
        return pdpte;
  }
  
 -static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 -                                 unsigned long root)
 -{
 -      struct vcpu_svm *svm = to_svm(vcpu);
 -
 -      svm->vmcb->control.nested_cr3 = __sme_set(root);
 -      mark_dirty(svm->vmcb, VMCB_NPT);
 -}
 -
  static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
                                       struct x86_exception *fault)
  {
@@@ -3075,7 -3027,8 +3078,7 @@@ static void nested_svm_init_mmu_context
  
        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
        kvm_init_shadow_mmu(vcpu);
 -      vcpu->arch.mmu->set_cr3           = nested_svm_set_tdp_cr3;
 -      vcpu->arch.mmu->get_cr3           = nested_svm_get_tdp_cr3;
 +      vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
        vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
        vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
        vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu);
@@@ -3136,36 -3089,43 +3139,36 @@@ static int nested_svm_check_exception(s
        return vmexit;
  }
  
 -/* This function returns true if it is save to enable the irq window */
 -static inline bool nested_svm_intr(struct vcpu_svm *svm)
 +static void nested_svm_intr(struct vcpu_svm *svm)
  {
 -      if (!is_guest_mode(&svm->vcpu))
 -              return true;
 -
 -      if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
 -              return true;
 -
 -      if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
 -              return false;
 -
 -      /*
 -       * if vmexit was already requested (by intercepted exception
 -       * for instance) do not overwrite it with "external interrupt"
 -       * vmexit.
 -       */
 -      if (svm->nested.exit_required)
 -              return false;
 -
        svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
        svm->vmcb->control.exit_info_1 = 0;
        svm->vmcb->control.exit_info_2 = 0;
  
 -      if (svm->nested.intercept & 1ULL) {
 -              /*
 -               * The #vmexit can't be emulated here directly because this
 -               * code path runs with irqs and preemption disabled. A
 -               * #vmexit emulation might sleep. Only signal request for
 -               * the #vmexit here.
 -               */
 -              svm->nested.exit_required = true;
 -              trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
 -              return false;
 +      /* nested_svm_vmexit() gets called later, from handle_exit() */
 +      svm->nested.exit_required = true;
 +      trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
 +}
 +
 +static bool nested_exit_on_intr(struct vcpu_svm *svm)
 +{
 +      return (svm->nested.intercept & 1ULL);
 +}
 +
 +static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 +{
 +      struct vcpu_svm *svm = to_svm(vcpu);
 +      bool block_nested_events =
 +              kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required;
 +
 +      if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) {
 +              if (block_nested_events)
 +                      return -EBUSY;
 +              nested_svm_intr(svm);
 +              return 0;
        }
  
 -      return true;
 +      return 0;
  }
  
  /* This function returns true if it is save to enable the nmi window */
@@@ -3284,6 -3244,9 +3287,6 @@@ static int nested_svm_exit_special(stru
        return NESTED_EXIT_CONTINUE;
  }
  
 -/*
 - * If this function returns true, this #vmexit was already handled
 - */
  static int nested_svm_intercept(struct vcpu_svm *svm)
  {
        u32 exit_code = svm->vmcb->control.exit_code;
@@@ -3558,9 -3521,6 +3561,9 @@@ static bool nested_svm_vmrun_msrpm(stru
  
  static bool nested_vmcb_checks(struct vmcb *vmcb)
  {
 +      if ((vmcb->save.efer & EFER_SVME) == 0)
 +              return false;
 +
        if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
                return false;
  
  static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
                                 struct vmcb *nested_vmcb, struct kvm_host_map *map)
  {
 +      bool evaluate_pending_interrupts =
 +              is_intercept(svm, INTERCEPT_VINTR) ||
 +              is_intercept(svm, INTERCEPT_IRET);
 +
        if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
                svm->vcpu.arch.hflags |= HF_HIF_MASK;
        else
        else
                svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
  
 -      if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
 -              /* We only want the cr8 intercept bits of the guest */
 -              clr_cr_intercept(svm, INTERCEPT_CR8_READ);
 -              clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 -      }
 -
 -      /* We don't want to see VMMCALLs from a nested guest */
 -      clr_intercept(svm, INTERCEPT_VMMCALL);
 -
        svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
        svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
  
  
        svm->nested.vmcb = vmcb_gpa;
  
 +      /*
 +       * If L1 had a pending IRQ/NMI before executing VMRUN,
 +       * which wasn't delivered because it was disallowed (e.g.
 +       * interrupts disabled), L0 needs to evaluate if this pending
 +       * event should cause an exit from L2 to L1 or be delivered
 +       * directly to L2.
 +       *
 +       * Usually this would be handled by the processor noticing an
 +       * IRQ/NMI window request.  However, VMRUN can unblock interrupts
 +       * by implicitly setting GIF, so force L0 to perform pending event
 +       * evaluation by requesting a KVM_REQ_EVENT.
 +       */
        enable_gif(svm);
 +      if (unlikely(evaluate_pending_interrupts))
 +              kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
  
        mark_all_dirty(svm->vmcb);
  }
@@@ -3883,8 -3834,11 +3886,8 @@@ static int clgi_interception(struct vcp
        disable_gif(svm);
  
        /* After a CLGI no interrupts should come */
 -      if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
 +      if (!kvm_vcpu_apicv_active(&svm->vcpu))
                svm_clear_vintr(svm);
 -              svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 -              mark_dirty(svm->vmcb, VMCB_INTR);
 -      }
  
        return ret;
  }
@@@ -5170,6 -5124,19 +5173,6 @@@ static void svm_inject_nmi(struct kvm_v
        ++vcpu->stat.nmi_injections;
  }
  
 -static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 -{
 -      struct vmcb_control_area *control;
 -
 -      /* The following fields are ignored when AVIC is enabled */
 -      control = &svm->vmcb->control;
 -      control->int_vector = irq;
 -      control->int_ctl &= ~V_INTR_PRIO_MASK;
 -      control->int_ctl |= V_IRQ_MASK |
 -              ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
 -      mark_dirty(svm->vmcb, VMCB_INTR);
 -}
 -
  static void svm_set_irq(struct kvm_vcpu *vcpu)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
@@@ -5558,15 -5525,18 +5561,15 @@@ static int svm_interrupt_allowed(struc
  {
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb = svm->vmcb;
 -      int ret;
  
        if (!gif_set(svm) ||
             (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
                return 0;
  
 -      ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
 -
 -      if (is_guest_mode(vcpu))
 -              return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
 -
 -      return ret;
 +      if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK))
 +              return !!(svm->vcpu.arch.hflags & HF_HIF_MASK);
 +      else
 +              return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
  }
  
  static void enable_irq_window(struct kvm_vcpu *vcpu)
         * enabled, the STGI interception will not occur. Enable the irq
         * window under the assumption that the hardware will set the GIF.
         */
 -      if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
 +      if (vgif_enabled(svm) || gif_set(svm)) {
                /*
                 * IRQ window is not needed when AVIC is enabled,
                 * unless we have pending ExtINT since it cannot be injected
                 */
                svm_toggle_avic_for_irq_window(vcpu, false);
                svm_set_vintr(svm);
 -              svm_inject_irq(svm, 0x0);
        }
  }
  
@@@ -5975,30 -5946,24 +5978,30 @@@ static void svm_vcpu_run(struct kvm_vcp
  }
  STACK_FRAME_NON_STANDARD(svm_vcpu_run);
  
 -static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 +static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
 +      bool update_guest_cr3 = true;
 +      unsigned long cr3;
  
 -      svm->vmcb->save.cr3 = __sme_set(root);
 -      mark_dirty(svm->vmcb, VMCB_CR);
 -}
 -
 -static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 -{
 -      struct vcpu_svm *svm = to_svm(vcpu);
 +      cr3 = __sme_set(root);
 +      if (npt_enabled) {
 +              svm->vmcb->control.nested_cr3 = cr3;
 +              mark_dirty(svm->vmcb, VMCB_NPT);
  
 -      svm->vmcb->control.nested_cr3 = __sme_set(root);
 -      mark_dirty(svm->vmcb, VMCB_NPT);
 +              /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
 +              if (is_guest_mode(vcpu))
 +                      update_guest_cr3 = false;
 +              else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
 +                      cr3 = vcpu->arch.cr3;
 +              else /* CR3 is already up-to-date.  */
 +                      update_guest_cr3 = false;
 +      }
  
 -      /* Also sync guest cr3 here in case we live migrate */
 -      svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
 -      mark_dirty(svm->vmcb, VMCB_CR);
 +      if (update_guest_cr3) {
 +              svm->vmcb->save.cr3 = cr3;
 +              mark_dirty(svm->vmcb, VMCB_CR);
 +      }
  }
  
  static int is_disabled(void)
@@@ -6060,19 -6025,12 +6063,19 @@@ static void svm_cpuid_update(struct kvm
                                    boot_cpu_has(X86_FEATURE_XSAVES);
  
        /* Update nrips enabled cache */
 -      svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
 +      svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
 +                           guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
  
        if (!kvm_vcpu_apicv_active(vcpu))
                return;
  
 -      guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
 +      /*
 +       * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
 +       * is exposed to the guest, disable AVIC.
 +       */
 +      if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
 +              kvm_request_apicv_update(vcpu->kvm, false,
 +                                       APICV_INHIBIT_REASON_X2APIC);
  
        /*
         * Currently, AVIC does not work with nested virtualization.
                                         APICV_INHIBIT_REASON_NESTED);
  }
  
 -#define F feature_bit
 -
 -static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 -{
 -      switch (func) {
 -      case 0x1:
 -              if (avic)
 -                      entry->ecx &= ~F(X2APIC);
 -              break;
 -      case 0x80000001:
 -              if (nested)
 -                      entry->ecx |= (1 << 2); /* Set SVM bit */
 -              break;
 -      case 0x80000008:
 -              if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
 -                   boot_cpu_has(X86_FEATURE_AMD_SSBD))
 -                      entry->ebx |= F(VIRT_SSBD);
 -              break;
 -      case 0x8000000A:
 -              entry->eax = 1; /* SVM revision 1 */
 -              entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
 -                                 ASID emulation to nested SVM */
 -              entry->ecx = 0; /* Reserved */
 -              entry->edx = 0; /* Per default do not support any
 -                                 additional features */
 -
 -              /* Support next_rip if host supports it */
 -              if (boot_cpu_has(X86_FEATURE_NRIPS))
 -                      entry->edx |= F(NRIPS);
 -
 -              /* Support NPT for the guest if enabled */
 -              if (npt_enabled)
 -                      entry->edx |= F(NPT);
 -
 -      }
 -}
 -
 -static int svm_get_lpage_level(void)
 -{
 -      return PT_PDPE_LEVEL;
 -}
 -
 -static bool svm_rdtscp_supported(void)
 -{
 -      return boot_cpu_has(X86_FEATURE_RDTSCP);
 -}
 -
 -static bool svm_invpcid_supported(void)
 -{
 -      return false;
 -}
 -
 -static bool svm_mpx_supported(void)
 -{
 -      return false;
 -}
 -
 -static bool svm_xsaves_supported(void)
 -{
 -      return boot_cpu_has(X86_FEATURE_XSAVES);
 -}
 -
 -static bool svm_umip_emulated(void)
 -{
 -      return false;
 -}
 -
 -static bool svm_pt_supported(void)
 -{
 -      return false;
 -}
 -
  static bool svm_has_wbinvd_exit(void)
  {
        return true;
  }
  
 -static bool svm_pku_supported(void)
 -{
 -      return false;
 -}
 -
  #define PRE_EX(exit)  { .exit_code = (exit), \
                        .stage = X86_ICPT_PRE_EXCEPT, }
  #define POST_EX(exit) { .exit_code = (exit), \
@@@ -6154,8 -6189,7 +6157,8 @@@ static const struct __x86_intercept 
  
  static int svm_check_intercept(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
 -                             enum x86_intercept_stage stage)
 +                             enum x86_intercept_stage stage,
 +                             struct x86_exception *exception)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
        int vmexit, ret = X86EMUL_CONTINUE;
@@@ -7339,8 -7373,7 +7342,8 @@@ static bool svm_check_apicv_inhibit_rea
                          BIT(APICV_INHIBIT_REASON_HYPERV) |
                          BIT(APICV_INHIBIT_REASON_NESTED) |
                          BIT(APICV_INHIBIT_REASON_IRQWIN) |
 -                        BIT(APICV_INHIBIT_REASON_PIT_REINJ);
 +                        BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
 +                        BIT(APICV_INHIBIT_REASON_X2APIC);
  
        return supported & BIT(bit);
  }
@@@ -7365,7 -7398,8 +7368,7 @@@ static struct kvm_x86_ops svm_x86_ops _
        .vcpu_free = svm_free_vcpu,
        .vcpu_reset = svm_vcpu_reset,
  
 -      .vm_alloc = svm_vm_alloc,
 -      .vm_free = svm_vm_free,
 +      .vm_size = sizeof(struct kvm_svm),
        .vm_init = svm_vm_init,
        .vm_destroy = svm_vm_destroy,
  
        .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
        .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
        .set_cr0 = svm_set_cr0,
 -      .set_cr3 = svm_set_cr3,
        .set_cr4 = svm_set_cr4,
        .set_efer = svm_set_efer,
        .get_idt = svm_get_idt,
  
        .get_exit_info = svm_get_exit_info,
  
 -      .get_lpage_level = svm_get_lpage_level,
 -
        .cpuid_update = svm_cpuid_update,
  
 -      .rdtscp_supported = svm_rdtscp_supported,
 -      .invpcid_supported = svm_invpcid_supported,
 -      .mpx_supported = svm_mpx_supported,
 -      .xsaves_supported = svm_xsaves_supported,
 -      .umip_emulated = svm_umip_emulated,
 -      .pt_supported = svm_pt_supported,
 -      .pku_supported = svm_pku_supported,
 -
 -      .set_supported_cpuid = svm_set_supported_cpuid,
 -
        .has_wbinvd_exit = svm_has_wbinvd_exit,
  
        .read_l1_tsc_offset = svm_read_l1_tsc_offset,
        .write_l1_tsc_offset = svm_write_l1_tsc_offset,
  
 -      .set_tdp_cr3 = set_tdp_cr3,
 +      .load_mmu_pgd = svm_load_mmu_pgd,
  
        .check_intercept = svm_check_intercept,
        .handle_exit_irqoff = svm_handle_exit_irqoff,
        .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
  
        .apic_init_signal_blocked = svm_apic_init_signal_blocked,
 +
 +      .check_nested_events = svm_check_nested_events,
  };
  
  static int __init svm_init(void)
diff --combined arch/x86/kvm/vmx/vmx.c
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
  
+ #ifdef MODULE
  static const struct x86_cpu_id vmx_cpu_id[] = {
        X86_FEATURE_MATCH(X86_FEATURE_VMX),
        {}
  };
  MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+ #endif
  
  bool __read_mostly enable_vpid = 1;
  module_param_named(vpid, enable_vpid, bool, 0444);
@@@ -433,6 -435,7 +435,6 @@@ static const struct kvm_vmx_segment_fie
        VMX_SEGMENT_FIELD(LDTR),
  };
  
 -u64 host_efer;
  static unsigned long host_idt_base;
  
  /*
@@@ -653,16 -656,53 +655,16 @@@ static int vmx_set_guest_msr(struct vcp
        return ret;
  }
  
 -void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
 -{
 -      vmcs_clear(loaded_vmcs->vmcs);
 -      if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
 -              vmcs_clear(loaded_vmcs->shadow_vmcs);
 -      loaded_vmcs->cpu = -1;
 -      loaded_vmcs->launched = 0;
 -}
 -
  #ifdef CONFIG_KEXEC_CORE
 -/*
 - * This bitmap is used to indicate whether the vmclear
 - * operation is enabled on all cpus. All disabled by
 - * default.
 - */
 -static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
 -
 -static inline void crash_enable_local_vmclear(int cpu)
 -{
 -      cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
 -}
 -
 -static inline void crash_disable_local_vmclear(int cpu)
 -{
 -      cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
 -}
 -
 -static inline int crash_local_vmclear_enabled(int cpu)
 -{
 -      return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
 -}
 -
  static void crash_vmclear_local_loaded_vmcss(void)
  {
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v;
  
 -      if (!crash_local_vmclear_enabled(cpu))
 -              return;
 -
        list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
                            loaded_vmcss_on_cpu_link)
                vmcs_clear(v->vmcs);
  }
 -#else
 -static inline void crash_enable_local_vmclear(int cpu) { }
 -static inline void crash_disable_local_vmclear(int cpu) { }
  #endif /* CONFIG_KEXEC_CORE */
  
  static void __loaded_vmcs_clear(void *arg)
                return; /* vcpu migration can race with cpu offline */
        if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;
 -      crash_disable_local_vmclear(cpu);
 +
 +      vmcs_clear(loaded_vmcs->vmcs);
 +      if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
 +              vmcs_clear(loaded_vmcs->shadow_vmcs);
 +
        list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
  
        /*
 -       * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
 -       * is before setting loaded_vmcs->vcpu to -1 which is done in
 -       * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
 -       * then adds the vmcs into percpu list before it is deleted.
 +       * Ensure all writes to loaded_vmcs, including deleting it from its
 +       * current percpu list, complete before setting loaded_vmcs->vcpu to
 +       * -1, otherwise a different cpu can see vcpu == -1 first and add
 +       * loaded_vmcs to its percpu list before it's deleted from this cpu's
 +       * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
         */
        smp_wmb();
  
 -      loaded_vmcs_init(loaded_vmcs);
 -      crash_enable_local_vmclear(cpu);
 +      loaded_vmcs->cpu = -1;
 +      loaded_vmcs->launched = 0;
  }
  
  void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@@ -775,7 -810,7 +777,7 @@@ void update_exception_bitmap(struct kvm
        if (to_vmx(vcpu)->rmode.vm86_active)
                eb = ~0;
        if (enable_ept)
 -              eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 +              eb &= ~(1u << PF_VECTOR);
  
        /* When we are running a nested L2 guest and L1 specified for it a
         * certain exception bitmap, we must trap the same exceptions and pass
@@@ -1026,7 -1061,7 +1028,7 @@@ static unsigned long segment_base(u16 s
  
  static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
  {
 -      return (pt_mode == PT_MODE_HOST_GUEST) &&
 +      return vmx_pt_mode_is_host_guest() &&
               !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
  }
  
@@@ -1060,7 -1095,7 +1062,7 @@@ static inline void pt_save_msr(struct p
  
  static void pt_guest_enter(struct vcpu_vmx *vmx)
  {
 -      if (pt_mode == PT_MODE_SYSTEM)
 +      if (vmx_pt_mode_is_system())
                return;
  
        /*
  
  static void pt_guest_exit(struct vcpu_vmx *vmx)
  {
 -      if (pt_mode == PT_MODE_SYSTEM)
 +      if (vmx_pt_mode_is_system())
                return;
  
        if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
@@@ -1310,17 -1345,18 +1312,17 @@@ void vmx_vcpu_load_vmcs(struct kvm_vcp
        if (!already_loaded) {
                loaded_vmcs_clear(vmx->loaded_vmcs);
                local_irq_disable();
 -              crash_disable_local_vmclear(cpu);
  
                /*
 -               * Read loaded_vmcs->cpu should be before fetching
 -               * loaded_vmcs->loaded_vmcss_on_cpu_link.
 -               * See the comments in __loaded_vmcs_clear().
 +               * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
 +               * this cpu's percpu list, otherwise it may not yet be deleted
 +               * from its previous cpu's percpu list.  Pairs with the
 +               * smp_wmb() in __loaded_vmcs_clear().
                 */
                smp_rmb();
  
                list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
 -              crash_enable_local_vmclear(cpu);
                local_irq_enable();
        }
  
@@@ -1653,6 -1689,16 +1655,6 @@@ static void vmx_queue_exception(struct 
        vmx_clear_hlt(vcpu);
  }
  
 -static bool vmx_rdtscp_supported(void)
 -{
 -      return cpu_has_vmx_rdtscp();
 -}
 -
 -static bool vmx_invpcid_supported(void)
 -{
 -      return cpu_has_vmx_invpcid();
 -}
 -
  /*
   * Swap MSR entry in host/guest MSR entry array.
   */
@@@ -1860,24 -1906,24 +1862,24 @@@ static int vmx_get_msr(struct kvm_vcpu 
                                                        &msr_info->data);
                break;
        case MSR_IA32_RTIT_CTL:
 -              if (pt_mode != PT_MODE_HOST_GUEST)
 +              if (!vmx_pt_mode_is_host_guest())
                        return 1;
                msr_info->data = vmx->pt_desc.guest.ctl;
                break;
        case MSR_IA32_RTIT_STATUS:
 -              if (pt_mode != PT_MODE_HOST_GUEST)
 +              if (!vmx_pt_mode_is_host_guest())
                        return 1;
                msr_info->data = vmx->pt_desc.guest.status;
                break;
        case MSR_IA32_RTIT_CR3_MATCH:
 -              if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +              if (!vmx_pt_mode_is_host_guest() ||
                        !intel_pt_validate_cap(vmx->pt_desc.caps,
                                                PT_CAP_cr3_filtering))
                        return 1;
                msr_info->data = vmx->pt_desc.guest.cr3_match;
                break;
        case MSR_IA32_RTIT_OUTPUT_BASE:
 -              if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +              if (!vmx_pt_mode_is_host_guest() ||
                        (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_topa_output) &&
                         !intel_pt_validate_cap(vmx->pt_desc.caps,
                msr_info->data = vmx->pt_desc.guest.output_base;
                break;
        case MSR_IA32_RTIT_OUTPUT_MASK:
 -              if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +              if (!vmx_pt_mode_is_host_guest() ||
                        (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_topa_output) &&
                         !intel_pt_validate_cap(vmx->pt_desc.caps,
                break;
        case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
                index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
 -              if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +              if (!vmx_pt_mode_is_host_guest() ||
                        (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_num_address_ranges)))
                        return 1;
@@@ -2102,7 -2148,7 +2104,7 @@@ static int vmx_set_msr(struct kvm_vcpu 
                        return 1;
                return vmx_set_vmx_msr(vcpu, msr_index, data);
        case MSR_IA32_RTIT_CTL:
 -              if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +              if (!vmx_pt_mode_is_host_guest() ||
                        vmx_rtit_ctl_check(vcpu, data) ||
                        vmx->nested.vmxon)
                        return 1;
@@@ -2218,33 -2264,18 +2220,33 @@@ static __init int vmx_disabled_by_bios(
               !boot_cpu_has(X86_FEATURE_VMX);
  }
  
 -static void kvm_cpu_vmxon(u64 addr)
 +static int kvm_cpu_vmxon(u64 vmxon_pointer)
  {
 +      u64 msr;
 +
        cr4_set_bits(X86_CR4_VMXE);
        intel_pt_handle_vmx(1);
  
 -      asm volatile ("vmxon %0" : : "m"(addr));
 +      asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
 +                        _ASM_EXTABLE(1b, %l[fault])
 +                        : : [vmxon_pointer] "m"(vmxon_pointer)
 +                        : : fault);
 +      return 0;
 +
 +fault:
 +      WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
 +                rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
 +      intel_pt_handle_vmx(0);
 +      cr4_clear_bits(X86_CR4_VMXE);
 +
 +      return -EFAULT;
  }
  
  static int hardware_enable(void)
  {
        int cpu = raw_smp_processor_id();
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 +      int r;
  
        if (cr4_read_shadow() & X86_CR4_VMXE)
                return -EBUSY;
        INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
        spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
  
 -      /*
 -       * Now we can enable the vmclear operation in kdump
 -       * since the loaded_vmcss_on_cpu list on this cpu
 -       * has been initialized.
 -       *
 -       * Though the cpu is not in VMX operation now, there
 -       * is no problem to enable the vmclear operation
 -       * for the loaded_vmcss_on_cpu list is empty!
 -       */
 -      crash_enable_local_vmclear(cpu);
 +      r = kvm_cpu_vmxon(phys_addr);
 +      if (r)
 +              return r;
  
 -      kvm_cpu_vmxon(phys_addr);
        if (enable_ept)
                ept_sync_global();
  
@@@ -2564,12 -2603,9 +2566,12 @@@ int alloc_loaded_vmcs(struct loaded_vmc
        if (!loaded_vmcs->vmcs)
                return -ENOMEM;
  
 +      vmcs_clear(loaded_vmcs->vmcs);
 +
        loaded_vmcs->shadow_vmcs = NULL;
        loaded_vmcs->hv_timer_soft_disabled = false;
 -      loaded_vmcs_init(loaded_vmcs);
 +      loaded_vmcs->cpu = -1;
 +      loaded_vmcs->launched = 0;
  
        if (cpu_has_vmx_msr_bitmap()) {
                loaded_vmcs->msr_bitmap = (unsigned long *)
@@@ -2951,8 -2987,9 +2953,8 @@@ void vmx_set_cr0(struct kvm_vcpu *vcpu
  
  static int get_ept_level(struct kvm_vcpu *vcpu)
  {
 -      /* Nested EPT currently only supports 4-level walks. */
        if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
 -              return 4;
 +              return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
        if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
                return 5;
        return 4;
@@@ -2972,7 -3009,7 +2974,7 @@@ u64 construct_eptp(struct kvm_vcpu *vcp
        return eptp;
  }
  
 -void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
  {
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
@@@ -3989,7 -4026,7 +3991,7 @@@ static void vmx_compute_secondary_exec_
  
        u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
  
 -      if (pt_mode == PT_MODE_SYSTEM)
 +      if (vmx_pt_mode_is_system())
                exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
        if (!cpu_need_virtualize_apic_accesses(vcpu))
                exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
                }
        }
  
 -      if (vmx_rdtscp_supported()) {
 +      if (cpu_has_vmx_rdtscp()) {
                bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
                if (!rdtscp_enabled)
                        exec_control &= ~SECONDARY_EXEC_RDTSCP;
                }
        }
  
 -      if (vmx_invpcid_supported()) {
 +      if (cpu_has_vmx_invpcid()) {
                /* Exposing INVPCID only when PCID is exposed */
                bool invpcid_enabled =
                        guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
@@@ -4230,7 -4267,7 +4232,7 @@@ static void init_vmcs(struct vcpu_vmx *
        if (cpu_has_vmx_encls_vmexit())
                vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
  
 -      if (pt_mode == PT_MODE_HOST_GUEST) {
 +      if (vmx_pt_mode_is_host_guest()) {
                memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
                /* Bit[6~0] are forced to 1, writes are ignored. */
                vmx->pt_desc.guest.output_mask = 0x7F;
@@@ -4458,13 -4495,8 +4460,13 @@@ static int vmx_nmi_allowed(struct kvm_v
  
  static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
  {
 -      return (!to_vmx(vcpu)->nested.nested_run_pending &&
 -              vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 +      if (to_vmx(vcpu)->nested.nested_run_pending)
 +              return false;
 +
 +      if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
 +              return true;
 +
 +      return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
  }
@@@ -4520,6 -4552,7 +4522,6 @@@ static bool rmode_exception(struct kvm_
        case GP_VECTOR:
        case MF_VECTOR:
                return true;
 -      break;
        }
        return false;
  }
@@@ -5296,6 -5329,7 +5298,6 @@@ static void vmx_enable_tdp(void
                VMX_EPT_RWX_MASK, 0ull);
  
        ept_set_mmio_spte_mask();
 -      kvm_enable_tdp();
  }
  
  /*
@@@ -5828,23 -5862,8 +5830,23 @@@ static int vmx_handle_exit(struct kvm_v
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
  
 -      if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
 -              return nested_vmx_reflect_vmexit(vcpu, exit_reason);
 +      if (is_guest_mode(vcpu)) {
 +              /*
 +               * The host physical addresses of some pages of guest memory
 +               * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
 +               * Page). The CPU may write to these pages via their host
 +               * physical address while L2 is running, bypassing any
 +               * address-translation-based dirty tracking (e.g. EPT write
 +               * protection).
 +               *
 +               * Mark them dirty on every exit from L2 to prevent them from
 +               * getting out of sync with dirty tracking.
 +               */
 +              nested_mark_vmcs12_pages_dirty(vcpu);
 +
 +              if (nested_vmx_exit_reflected(vcpu, exit_reason))
 +                      return nested_vmx_reflect_vmexit(vcpu, exit_reason);
 +      }
  
        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
                dump_vmcs();
@@@ -6204,13 -6223,15 +6206,13 @@@ static void handle_exception_nmi_irqoff
        vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
  
        /* if exit due to PF check for async PF */
 -      if (is_page_fault(vmx->exit_intr_info))
 +      if (is_page_fault(vmx->exit_intr_info)) {
                vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
 -
        /* Handle machine checks before interrupts are enabled */
 -      if (is_machine_check(vmx->exit_intr_info))
 +      } else if (is_machine_check(vmx->exit_intr_info)) {
                kvm_machine_check();
 -
        /* We need to handle NMIs before interrupts are enabled */
 -      if (is_nmi(vmx->exit_intr_info)) {
 +      } else if (is_nmi(vmx->exit_intr_info)) {
                kvm_before_interrupt(&vmx->vcpu);
                asm("int $2");
                kvm_after_interrupt(&vmx->vcpu);
@@@ -6296,6 -6317,11 +6298,6 @@@ static bool vmx_has_emulated_msr(int in
        }
  }
  
 -static bool vmx_pt_supported(void)
 -{
 -      return pt_mode == PT_MODE_HOST_GUEST;
 -}
 -
  static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
  {
        u32 exit_intr_info;
@@@ -6541,8 -6567,7 +6543,8 @@@ static void vmx_vcpu_run(struct kvm_vcp
  
        pt_guest_enter(vmx);
  
 -      atomic_switch_perf_msrs(vmx);
 +      if (vcpu_to_pmu(vcpu)->version)
 +              atomic_switch_perf_msrs(vmx);
        atomic_switch_umwait_control_msr(vmx);
  
        if (enable_preemption_timer)
        vmx_complete_interrupts(vmx);
  }
  
 -static struct kvm *vmx_vm_alloc(void)
 -{
 -      struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
 -                                          GFP_KERNEL_ACCOUNT | __GFP_ZERO,
 -                                          PAGE_KERNEL);
 -      return &kvm_vmx->kvm;
 -}
 -
 -static void vmx_vm_free(struct kvm *kvm)
 -{
 -      kfree(kvm->arch.hyperv.hv_pa_pg);
 -      vfree(to_kvm_vmx(kvm));
 -}
 -
  static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@@ -6861,24 -6900,17 +6863,24 @@@ static u64 vmx_get_mt_mask(struct kvm_v
        u8 cache;
        u64 ipat = 0;
  
 -      /* For VT-d and EPT combination
 -       * 1. MMIO: always map as UC
 -       * 2. EPT with VT-d:
 -       *   a. VT-d without snooping control feature: can't guarantee the
 -       *      result, try to trust guest.
 -       *   b. VT-d with snooping control feature: snooping control feature of
 -       *      VT-d engine can guarantee the cache correctness. Just set it
 -       *      to WB to keep consistent with host. So the same as item 3.
 -       * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
 -       *    consistent with host MTRR
 +      /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
 +       * memory aliases with conflicting memory types and sometimes MCEs.
 +       * We have to be careful as to what are honored and when.
 +       *
 +       * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
 +       * UC.  The effective memory type is UC or WC depending on guest PAT.
 +       * This was historically the source of MCEs and we want to be
 +       * conservative.
 +       *
 +       * When there is no need to deal with noncoherent DMA (e.g., no VT-d
 +       * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
 +       * EPT memory type is set to WB.  The effective memory type is forced
 +       * WB.
 +       *
 +       * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
 +       * EPT memory type is used to emulate guest CD/MTRR.
         */
 +
        if (is_mmio) {
                cache = MTRR_TYPE_UNCACHABLE;
                goto exit;
@@@ -6905,6 -6937,15 +6907,6 @@@ exit
        return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
  }
  
 -static int vmx_get_lpage_level(void)
 -{
 -      if (enable_ept && !cpu_has_vmx_ept_1g_page())
 -              return PT_DIRECTORY_LEVEL;
 -      else
 -              /* For shadow and EPT supported 1GB page */
 -              return PT_PDPE_LEVEL;
 -}
 -
  static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
  {
        /*
@@@ -7095,37 -7136,10 +7097,37 @@@ static void vmx_cpuid_update(struct kvm
        }
  }
  
 -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 +static __init void vmx_set_cpu_caps(void)
  {
 -      if (func == 1 && nested)
 -              entry->ecx |= feature_bit(VMX);
 +      kvm_set_cpu_caps();
 +
 +      /* CPUID 0x1 */
 +      if (nested)
 +              kvm_cpu_cap_set(X86_FEATURE_VMX);
 +
 +      /* CPUID 0x7 */
 +      if (kvm_mpx_supported())
 +              kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
 +      if (cpu_has_vmx_invpcid())
 +              kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
 +      if (vmx_pt_mode_is_host_guest())
 +              kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
 +
 +      /* PKU is not yet implemented for shadow paging. */
 +      if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE))
 +              kvm_cpu_cap_check_and_set(X86_FEATURE_PKU);
 +
 +      if (vmx_umip_emulated())
 +              kvm_cpu_cap_set(X86_FEATURE_UMIP);
 +
 +      /* CPUID 0xD.1 */
 +      supported_xss = 0;
 +      if (!vmx_xsaves_supported())
 +              kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
 +
 +      /* CPUID 0x80000001 */
 +      if (!cpu_has_vmx_rdtscp())
 +              kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
  }
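
The kvm_cpu_cap_check_and_set() calls above only advertise a feature when the host CPU itself has it; a minimal sketch of that pattern (hypothetical name, assuming it mirrors boot_cpu_has()):

static __always_inline void sketch_cap_check_and_set(unsigned int x86_feature)
{
	/* Expose the capability to guests only if the host CPU supports it. */
	if (boot_cpu_has(x86_feature))
		kvm_cpu_cap_set(x86_feature);
}
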
  
  static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
@@@ -7163,15 -7177,16 +7165,16 @@@ static int vmx_check_intercept_io(struc
        else
                intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
  
+       /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
        return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
  }
  
  static int vmx_check_intercept(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
 -                             enum x86_intercept_stage stage)
 +                             enum x86_intercept_stage stage,
 +                             struct x86_exception *exception)
  {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
  
        switch (info->intercept) {
        /*
         */
        case x86_intercept_rdtscp:
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
 -                      ctxt->exception.vector = UD_VECTOR;
 -                      ctxt->exception.error_code_valid = false;
 +                      exception->vector = UD_VECTOR;
 +                      exception->error_code_valid = false;
                        return X86EMUL_PROPAGATE_FAULT;
                }
                break;
        case x86_intercept_outs:
                return vmx_check_intercept_io(vcpu, info);
  
+       case x86_intercept_lgdt:
+       case x86_intercept_lidt:
+       case x86_intercept_lldt:
+       case x86_intercept_ltr:
+       case x86_intercept_sgdt:
+       case x86_intercept_sidt:
+       case x86_intercept_sldt:
+       case x86_intercept_str:
+               if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+                       return X86EMUL_CONTINUE;
+               /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
+               break;
        /* TODO: check more intercepts... */
        default:
                break;
@@@ -7278,8 -7307,7 +7295,8 @@@ static void vmx_sched_in(struct kvm_vcp
  static void vmx_slot_enable_log_dirty(struct kvm *kvm,
                                     struct kvm_memory_slot *slot)
  {
 -      kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
 +      if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
 +              kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
        kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
  }
  
@@@ -7633,7 -7661,9 +7650,7 @@@ static __init int hardware_setup(void
  {
        unsigned long host_bndcfgs;
        struct desc_ptr dt;
 -      int r, i;
 -
 -      rdmsrl_safe(MSR_EFER, &host_efer);
 +      int r, i, ept_lpage_level;
  
        store_idt(&dt);
        host_idt_base = dt.address;
                WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
        }
  
 +      if (!cpu_has_vmx_mpx())
 +              supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
 +                                  XFEATURE_MASK_BNDCSR);
 +
        if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
            !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
                enable_vpid = 0;
        if (!cpu_has_vmx_tpr_shadow())
                kvm_x86_ops->update_cr8_intercept = NULL;
  
 -      if (enable_ept && !cpu_has_vmx_ept_2m_page())
 -              kvm_disable_largepages();
 -
  #if IS_ENABLED(CONFIG_HYPERV)
        if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
            && enable_ept) {
  
        if (enable_ept)
                vmx_enable_tdp();
 +
 +      if (!enable_ept)
 +              ept_lpage_level = 0;
 +      else if (cpu_has_vmx_ept_1g_page())
 +              ept_lpage_level = PT_PDPE_LEVEL;
 +      else if (cpu_has_vmx_ept_2m_page())
 +              ept_lpage_level = PT_DIRECTORY_LEVEL;
        else
 -              kvm_disable_tdp();
 +              ept_lpage_level = PT_PAGE_TABLE_LEVEL;
 +      kvm_configure_mmu(enable_ept, ept_lpage_level);
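
For readers of the ladder above, the level constants map to mapping sizes; a small sketch of that relationship (assumed values: level 1 is 4KiB, level 2 is 2MiB, level 3 is 1GiB):

static unsigned long sketch_lpage_bytes(int level)
{
	/* 4KiB at level 1, 2MiB at level 2, 1GiB at level 3. */
	return 1UL << (12 + 9 * (level - 1));
}
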
  
        /*
         * Only enable PML when hardware supports PML feature, and both EPT
                        return r;
        }
  
 +      vmx_set_cpu_caps();
 +
        r = alloc_kvm_area();
        if (r)
                nested_vmx_hardware_unsetup();
@@@ -7829,8 -7848,9 +7846,8 @@@ static struct kvm_x86_ops vmx_x86_ops _
        .cpu_has_accelerated_tpr = report_flexpriority,
        .has_emulated_msr = vmx_has_emulated_msr,
  
 +      .vm_size = sizeof(struct kvm_vmx),
        .vm_init = vmx_vm_init,
 -      .vm_alloc = vmx_vm_alloc,
 -      .vm_free = vmx_vm_free,
  
        .vcpu_create = vmx_create_vcpu,
        .vcpu_free = vmx_free_vcpu,
        .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
        .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
        .set_cr0 = vmx_set_cr0,
 -      .set_cr3 = vmx_set_cr3,
        .set_cr4 = vmx_set_cr4,
        .set_efer = vmx_set_efer,
        .get_idt = vmx_get_idt,
  
        .get_exit_info = vmx_get_exit_info,
  
 -      .get_lpage_level = vmx_get_lpage_level,
 -
        .cpuid_update = vmx_cpuid_update,
  
 -      .rdtscp_supported = vmx_rdtscp_supported,
 -      .invpcid_supported = vmx_invpcid_supported,
 -
 -      .set_supported_cpuid = vmx_set_supported_cpuid,
 -
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
  
        .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
        .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
  
 -      .set_tdp_cr3 = vmx_set_cr3,
 +      .load_mmu_pgd = vmx_load_mmu_pgd,
  
        .check_intercept = vmx_check_intercept,
        .handle_exit_irqoff = vmx_handle_exit_irqoff,
 -      .mpx_supported = vmx_mpx_supported,
 -      .xsaves_supported = vmx_xsaves_supported,
 -      .umip_emulated = vmx_umip_emulated,
 -      .pt_supported = vmx_pt_supported,
 -      .pku_supported = vmx_pku_supported,
  
        .request_immediate_exit = vmx_request_immediate_exit,
  
diff --combined arch/x86/kvm/x86.c
@@@ -22,7 -22,6 +22,7 @@@
  #include "i8254.h"
  #include "tss.h"
  #include "kvm_cache_regs.h"
 +#include "kvm_emulate.h"
  #include "x86.h"
  #include "cpuid.h"
  #include "pmu.h"
@@@ -82,7 -81,7 +82,7 @@@ u64 __read_mostly kvm_mce_cap_supporte
  EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
  
  #define emul_to_vcpu(ctxt) \
 -      container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
 +      ((struct kvm_vcpu *)(ctxt)->vcpu)
  
  /* EFER defaults:
   * - enable syscall per default because its emulated by KVM
@@@ -181,17 -180,7 +181,17 @@@ struct kvm_shared_msrs 
  static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
  static struct kvm_shared_msrs __percpu *shared_msrs;
  
 +#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
 +                              | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
 +                              | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
 +                              | XFEATURE_MASK_PKRU)
 +
 +u64 __read_mostly host_efer;
 +EXPORT_SYMBOL_GPL(host_efer);
 +
  static u64 __read_mostly host_xss;
 +u64 __read_mostly supported_xss;
 +EXPORT_SYMBOL_GPL(supported_xss);
  
  struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
  };
  
  u64 __read_mostly host_xcr0;
 +u64 __read_mostly supported_xcr0;
 +EXPORT_SYMBOL_GPL(supported_xcr0);
  
  struct kmem_cache *x86_fpu_cache;
  EXPORT_SYMBOL_GPL(x86_fpu_cache);
  
 +static struct kmem_cache *x86_emulator_cache;
 +
 +static struct kmem_cache *kvm_alloc_emulator_cache(void)
 +{
 +      unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
 +      unsigned int size = sizeof(struct x86_emulate_ctxt);
 +
 +      return kmem_cache_create_usercopy("x86_emulator", size,
 +                                        __alignof__(struct x86_emulate_ctxt),
 +                                        SLAB_ACCOUNT, useroffset,
 +                                        size - useroffset, NULL);
 +}
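
The usercopy-aware slab above whitelists only the bytes from the emulator context's 'src' field onward for copies to and from user space; a rough sketch of the bounds check hardened usercopy applies against such a whitelist (conceptual, names made up):

static bool sketch_usercopy_allowed(size_t useroffset, size_t usersize,
				    size_t copy_off, size_t copy_len)
{
	/* The copy must stay inside the whitelisted [useroffset, useroffset + usersize) window. */
	return copy_off >= useroffset &&
	       copy_len <= usersize &&
	       copy_off - useroffset <= usersize - copy_len;
}
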
 +
  static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
  
  static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
@@@ -376,7 -350,6 +376,7 @@@ int kvm_set_apic_base(struct kvm_vcpu *
        }
  
        kvm_lapic_set_base(vcpu, msr_info->data);
 +      kvm_recalculate_apic_map(vcpu->kvm);
        return 0;
  }
  EXPORT_SYMBOL_GPL(kvm_set_apic_base);
@@@ -930,10 -903,10 +930,10 @@@ static u64 kvm_host_cr4_reserved_bits(s
  {
        u64 reserved_bits = __cr4_reserved_bits(cpu_has, c);
  
 -      if (cpuid_ecx(0x7) & feature_bit(LA57))
 +      if (kvm_cpu_cap_has(X86_FEATURE_LA57))
                reserved_bits &= ~X86_CR4_LA57;
  
 -      if (kvm_x86_ops->umip_emulated())
 +      if (kvm_cpu_cap_has(X86_FEATURE_UMIP))
                reserved_bits &= ~X86_CR4_UMIP;
  
        return reserved_bits;
@@@ -1585,12 -1558,8 +1585,12 @@@ static int handle_fastpath_set_x2apic_i
                ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
                ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
  
 +              /* No send delay, so always clear the Delivery Status (busy) bit. */
 +              data &= ~(1 << 12);
 +              kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
                kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
 -              return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
 +              kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
 +              trace_kvm_apic_write(APIC_ICR, (u32)data);
 +              return 0;
        }
  
        return 1;
  enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
  {
        u32 msr = kvm_rcx_read(vcpu);
 -      u64 data = kvm_read_edx_eax(vcpu);
 +      u64 data;
        int ret = 0;
  
        switch (msr) {
        case APIC_BASE_MSR + (APIC_ICR >> 4):
 +              data = kvm_read_edx_eax(vcpu);
                ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
                break;
        default:
@@@ -2555,7 -2523,7 +2555,7 @@@ static void kvmclock_sync_fn(struct wor
  static bool can_set_mci_status(struct kvm_vcpu *vcpu)
  {
        /* McStatusWrEn enabled? */
 -      if (guest_cpuid_is_amd(vcpu))
 +      if (guest_cpuid_is_amd_or_hygon(vcpu))
                return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
  
        return false;
@@@ -2830,11 -2798,12 +2830,11 @@@ int kvm_set_msr_common(struct kvm_vcpu 
                    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
                        return 1;
                /*
 -               * We do support PT if kvm_x86_ops->pt_supported(), but we do
 -               * not support IA32_XSS[bit 8]. Guests will have to use
 -               * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT
 -               * MSRs.
 +               * KVM supports exposing PT to the guest, but does not support
 +               * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
 +               * XSAVES/XRSTORS to save/restore PT MSRs.
                 */
 -              if (data != 0)
 +              if (data & ~supported_xss)
                        return 1;
                vcpu->arch.ia32_xss = data;
                break;
@@@ -3108,6 -3077,7 +3108,6 @@@ int kvm_get_msr_common(struct kvm_vcpu 
                break;
        case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
                return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
 -              break;
        case MSR_IA32_TSCDEADLINE:
                msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
                break;
                return kvm_hv_get_msr_common(vcpu,
                                             msr_info->index, &msr_info->data,
                                             msr_info->host_initiated);
 -              break;
        case MSR_IA32_BBL_CR_CTL3:
                /* This legacy MSR exists but isn't fully documented in current
                 * silicon.  It is however accessed by winxp in very narrow
@@@ -3493,7 -3464,7 +3493,7 @@@ long kvm_arch_dev_ioctl(struct file *fi
                r = 0;
                break;
        }
 -      case KVM_X86_GET_MCE_CAP_SUPPORTED: {
 +      case KVM_X86_GET_MCE_CAP_SUPPORTED:
                r = -EFAULT;
                if (copy_to_user(argp, &kvm_mce_cap_supported,
                                 sizeof(kvm_mce_cap_supported)))
        case KVM_GET_MSRS:
                r = msr_io(NULL, argp, do_get_msr_feature, 1);
                break;
 -      }
        default:
                r = -EINVAL;
 +              break;
        }
  out:
        return r;
@@@ -4130,7 -4101,8 +4130,7 @@@ static int kvm_vcpu_ioctl_x86_set_xsave
                 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
                 * with old userspace.
                 */
 -              if (xstate_bv & ~kvm_supported_xcr0() ||
 -                      mxcsr & ~mxcsr_feature_mask)
 +              if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
                        return -EINVAL;
                load_xsave(vcpu, (u8 *)guest_xsave->region);
        } else {
@@@ -4789,13 -4761,77 +4789,13 @@@ static int kvm_vm_ioctl_reinject(struc
        return 0;
  }
  
 -/**
 - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 - * @kvm: kvm instance
 - * @log: slot id and address to which we copy the log
 - *
 - * Steps 1-4 below provide general overview of dirty page logging. See
 - * kvm_get_dirty_log_protect() function description for additional details.
 - *
 - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
 - * always flush the TLB (step 4) even if previous step failed  and the dirty
 - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
 - * does not preclude user space subsequent dirty log read. Flushing TLB ensures
 - * writes will be marked dirty for next log read.
 - *
 - *   1. Take a snapshot of the bit and clear it if needed.
 - *   2. Write protect the corresponding page.
 - *   3. Copy the snapshot to the userspace.
 - *   4. Flush TLB's if needed.
 - */
 -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
  {
 -      bool flush = false;
 -      int r;
 -
 -      mutex_lock(&kvm->slots_lock);
 -
        /*
         * Flush potentially hardware-cached dirty pages to dirty_bitmap.
         */
        if (kvm_x86_ops->flush_log_dirty)
                kvm_x86_ops->flush_log_dirty(kvm);
 -
 -      r = kvm_get_dirty_log_protect(kvm, log, &flush);
 -
 -      /*
 -       * All the TLBs can be flushed out of mmu lock, see the comments in
 -       * kvm_mmu_slot_remove_write_access().
 -       */
 -      lockdep_assert_held(&kvm->slots_lock);
 -      if (flush)
 -              kvm_flush_remote_tlbs(kvm);
 -
 -      mutex_unlock(&kvm->slots_lock);
 -      return r;
 -}
 -
 -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
 -{
 -      bool flush = false;
 -      int r;
 -
 -      mutex_lock(&kvm->slots_lock);
 -
 -      /*
 -       * Flush potentially hardware-cached dirty pages to dirty_bitmap.
 -       */
 -      if (kvm_x86_ops->flush_log_dirty)
 -              kvm_x86_ops->flush_log_dirty(kvm);
 -
 -      r = kvm_clear_dirty_log_protect(kvm, log, &flush);
 -
 -      /*
 -       * All the TLBs can be flushed out of mmu lock, see the comments in
 -       * kvm_mmu_slot_remove_write_access().
 -       */
 -      lockdep_assert_held(&kvm->slots_lock);
 -      if (flush)
 -              kvm_flush_remote_tlbs(kvm);
 -
 -      mutex_unlock(&kvm->slots_lock);
 -      return r;
  }
  
  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
@@@ -5224,28 -5260,28 +5224,28 @@@ static void kvm_init_msr_list(void
                                continue;
                        break;
                case MSR_TSC_AUX:
 -                      if (!kvm_x86_ops->rdtscp_supported())
 +                      if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
                                continue;
                        break;
                case MSR_IA32_RTIT_CTL:
                case MSR_IA32_RTIT_STATUS:
 -                      if (!kvm_x86_ops->pt_supported())
 +                      if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
                                continue;
                        break;
                case MSR_IA32_RTIT_CR3_MATCH:
 -                      if (!kvm_x86_ops->pt_supported() ||
 +                      if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
                            !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
                                continue;
                        break;
                case MSR_IA32_RTIT_OUTPUT_BASE:
                case MSR_IA32_RTIT_OUTPUT_MASK:
 -                      if (!kvm_x86_ops->pt_supported() ||
 +                      if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
                                (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
                                 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
                                continue;
                        break;
                case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
 -                      if (!kvm_x86_ops->pt_supported() ||
 +                      if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
                                msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
                                intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
                                continue;
@@@ -5702,7 -5738,7 +5702,7 @@@ static int emulator_read_write_onepage(
        int handled, ret;
        bool write = ops->write;
        struct kvm_mmio_fragment *frag;
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
  
        /*
         * If the exit was due to a NPF we may already have a GPA.
         * operation using rep will only have the initial GPA from the NPF
         * occurred.
         */
 -      if (vcpu->arch.gpa_available &&
 -          emulator_can_use_gpa(ctxt) &&
 -          (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
 -              gpa = vcpu->arch.gpa_val;
 +      if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
 +          (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
 +              gpa = ctxt->gpa_val;
                ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
        } else {
                ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
@@@ -5933,9 -5970,11 +5933,9 @@@ static int emulator_pio_in_out(struct k
        return 0;
  }
  
 -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 -                                  int size, unsigned short port, void *val,
 -                                  unsigned int count)
 +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
 +                         unsigned short port, void *val, unsigned int count)
  {
 -      struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        int ret;
  
        if (vcpu->arch.pio.count)
@@@ -5955,30 -5994,17 +5955,30 @@@ data_avail
        return 0;
  }
  
 -static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
 -                                   int size, unsigned short port,
 -                                   const void *val, unsigned int count)
 +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 +                                  int size, unsigned short port, void *val,
 +                                  unsigned int count)
  {
 -      struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 +      return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
 +}
  
 +static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
 +                          unsigned short port, const void *val,
 +                          unsigned int count)
 +{
        memcpy(vcpu->arch.pio_data, val, size * count);
        trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
        return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
  }
  
 +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
 +                                   int size, unsigned short port,
 +                                   const void *val, unsigned int count)
 +{
 +      return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
 +}
 +
  static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
  {
        return kvm_x86_ops->get_segment_base(vcpu, seg);
@@@ -6241,15 -6267,13 +6241,15 @@@ static int emulator_intercept(struct x8
                              struct x86_instruction_info *info,
                              enum x86_intercept_stage stage)
  {
 -      return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
 +      return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage,
 +                                          &ctxt->exception);
  }
  
  static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
 -                      u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
 +                            u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
 +                            bool exact_only)
  {
 -      return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
 +      return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
  }
  
  static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
@@@ -6376,7 -6400,7 +6376,7 @@@ static void toggle_interruptibility(str
  
  static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
  {
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        if (ctxt->exception.vector == PF_VECTOR)
                return kvm_propagate_fault(vcpu, &ctxt->exception);
  
        return false;
  }
  
 +static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
 +{
 +      struct x86_emulate_ctxt *ctxt;
 +
 +      ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
 +      if (!ctxt) {
 +              pr_err("kvm: failed to allocate vcpu's emulator\n");
 +              return NULL;
 +      }
 +
 +      ctxt->vcpu = vcpu;
 +      ctxt->ops = &emulate_ops;
 +      vcpu->arch.emulate_ctxt = ctxt;
 +
 +      return ctxt;
 +}
 +
  static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
  {
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int cs_db, cs_l;
  
        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
  
 +      ctxt->gpa_available = false;
        ctxt->eflags = kvm_get_rflags(vcpu);
        ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
  
  
  void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
  {
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int ret;
  
        init_emulate_ctxt(vcpu);
@@@ -6488,11 -6494,10 +6488,11 @@@ static bool reexecute_instruction(struc
        gpa_t gpa = cr2_or_gpa;
        kvm_pfn_t pfn;
  
 -      if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
 +      if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
                return false;
  
 -      if (WARN_ON_ONCE(is_guest_mode(vcpu)))
 +      if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
 +          WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
                return false;
  
        if (!vcpu->arch.mmu->direct_map) {
@@@ -6580,11 -6585,10 +6580,11 @@@ static bool retry_instruction(struct x8
         */
        vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
  
 -      if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
 +      if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
                return false;
  
 -      if (WARN_ON_ONCE(is_guest_mode(vcpu)))
 +      if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
 +          WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
                return false;
  
        if (x86_page_table_writing_insn(ctxt))
@@@ -6747,7 -6751,7 +6747,7 @@@ int x86_emulate_instruction(struct kvm_
                            int emulation_type, void *insn, int insn_len)
  {
        int r;
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        bool writeback = true;
        bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
  
        }
  
  restart:
 -      /* Save the faulting GPA (cr2) in the address field */
 -      ctxt->exception.address = cr2_or_gpa;
 +      if (emulation_type & EMULTYPE_PF) {
 +              /* Save the faulting GPA (cr2) in the address field */
 +              ctxt->exception.address = cr2_or_gpa;
 +
 +              /* With shadow page tables, cr2 contains a GVA or nGPA. */
 +              if (vcpu->arch.mmu->direct_map) {
 +                      ctxt->gpa_available = true;
 +                      ctxt->gpa_val = cr2_or_gpa;
 +              }
 +      } else {
 +              /* Sanitize the address out of an abundance of paranoia. */
 +              ctxt->exception.address = 0;
 +      }
  
        r = x86_emulate_insn(ctxt);
  
@@@ -6950,8 -6943,8 +6950,8 @@@ static int kvm_fast_pio_out(struct kvm_
                            unsigned short port)
  {
        unsigned long val = kvm_rax_read(vcpu);
 -      int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
 -                                          size, port, &val, 1);
 +      int ret = emulator_pio_out(vcpu, size, port, &val, 1);
 +
        if (ret)
                return ret;
  
@@@ -6987,10 -6980,11 +6987,10 @@@ static int complete_fast_pio_in(struct 
        val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
  
        /*
 -       * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
 +       * Since vcpu->arch.pio.count == 1, let emulator_pio_in perform
         * the copy and tracing
         */
 -      emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
 -                               vcpu->arch.pio.port, &val, 1);
 +      emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
        kvm_rax_write(vcpu, val);
  
        return kvm_skip_emulated_instruction(vcpu);
@@@ -7005,7 -6999,8 +7005,7 @@@ static int kvm_fast_pio_in(struct kvm_v
        /* For size less than 4 we merge, else we zero extend */
        val = (size < 4) ? kvm_rax_read(vcpu) : 0;
  
 -      ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
 -                                     &val, 1);
 +      ret = emulator_pio_in(vcpu, size, port, &val, 1);
        if (ret) {
                kvm_rax_write(vcpu, val);
                return ret;
@@@ -7195,15 -7190,15 +7195,15 @@@ static void kvm_timer_init(void
  
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
  #ifdef CONFIG_CPU_FREQ
-               struct cpufreq_policy policy;
+               struct cpufreq_policy *policy;
                int cpu;
  
-               memset(&policy, 0, sizeof(policy));
                cpu = get_cpu();
-               cpufreq_get_policy(&policy, cpu);
-               if (policy.cpuinfo.max_freq)
-                       max_tsc_khz = policy.cpuinfo.max_freq;
+               policy = cpufreq_cpu_get(cpu);
+               if (policy && policy->cpuinfo.max_freq)
+                       max_tsc_khz = policy->cpuinfo.max_freq;
                put_cpu();
+               cpufreq_cpu_put(policy);
  #endif
                cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
                                          CPUFREQ_TRANSITION_NOTIFIER);
@@@ -7313,12 -7308,12 +7313,12 @@@ int kvm_arch_init(void *opaque
        }
  
        if (!ops->cpu_has_kvm_support()) {
-               printk(KERN_ERR "kvm: no hardware support\n");
+               pr_err_ratelimited("kvm: no hardware support\n");
                r = -EOPNOTSUPP;
                goto out;
        }
        if (ops->disabled_by_bios()) {
-               printk(KERN_ERR "kvm: disabled by bios\n");
+               pr_err_ratelimited("kvm: disabled by bios\n");
                r = -EOPNOTSUPP;
                goto out;
        }
                goto out;
        }
  
 +      x86_emulator_cache = kvm_alloc_emulator_cache();
 +      if (!x86_emulator_cache) {
 +              pr_err("kvm: failed to allocate cache for x86 emulator\n");
 +              goto out_free_x86_fpu_cache;
 +      }
 +
        shared_msrs = alloc_percpu(struct kvm_shared_msrs);
        if (!shared_msrs) {
                printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
 -              goto out_free_x86_fpu_cache;
 +              goto out_free_x86_emulator_cache;
        }
  
        r = kvm_mmu_module_init();
  
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
  
 -      if (boot_cpu_has(X86_FEATURE_XSAVE))
 +      if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 +              supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
 +      }
  
        kvm_lapic_init();
        if (pi_inject_timer == -1)
  
  out_free_percpu:
        free_percpu(shared_msrs);
 +out_free_x86_emulator_cache:
 +      kmem_cache_destroy(x86_emulator_cache);
  out_free_x86_fpu_cache:
        kmem_cache_destroy(x86_fpu_cache);
  out:
@@@ -7646,7 -7631,7 +7646,7 @@@ static void update_cr8_intercept(struc
        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
  }
  
 -static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 +static int inject_pending_event(struct kvm_vcpu *vcpu)
  {
        int r;
  
         * from L2 to L1.
         */
        if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
 -              r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
 +              r = kvm_x86_ops->check_nested_events(vcpu);
                if (r != 0)
                        return r;
        }
                 * KVM_REQ_EVENT only on certain events and not unconditionally?
                 */
                if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
 -                      r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
 +                      r = kvm_x86_ops->check_nested_events(vcpu);
                        if (r != 0)
                                return r;
                }
@@@ -8054,26 -8039,19 +8054,26 @@@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv
   */
  void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
  {
 +      unsigned long old, new, expected;
 +
        if (!kvm_x86_ops->check_apicv_inhibit_reasons ||
            !kvm_x86_ops->check_apicv_inhibit_reasons(bit))
                return;
  
 -      if (activate) {
 -              if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
 -                  !kvm_apicv_activated(kvm))
 -                      return;
 -      } else {
 -              if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
 -                  kvm_apicv_activated(kvm))
 -                      return;
 -      }
 +      old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
 +      do {
 +              expected = new = old;
 +              if (activate)
 +                      __clear_bit(bit, &new);
 +              else
 +                      __set_bit(bit, &new);
 +              if (new == old)
 +                      break;
 +              old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
 +      } while (old != expected);
 +
 +      if (!!old == !!new)
 +              return;
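
The '!!old == !!new' early return above means the expensive APICv toggle only happens when the inhibit mask changes between zero and non-zero; a tiny sketch of that check with hypothetical names:

static bool sketch_apicv_state_flipped(unsigned long old_mask, unsigned long new_mask)
{
	bool was_active = (old_mask == 0);	/* no inhibit reasons -> APICv active */
	bool now_active = (new_mask == 0);

	return was_active != now_active;	/* only a flip requires reprogramming */
}
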
  
        trace_kvm_apicv_update_request(activate, bit);
        if (kvm_x86_ops->pre_update_apicv_exec_ctrl)
@@@ -8198,8 -8176,8 +8198,8 @@@ static int vcpu_enter_guest(struct kvm_
                }
                if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
                        kvm_mmu_sync_roots(vcpu);
 -              if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
 -                      kvm_mmu_load_cr3(vcpu);
 +              if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
 +                      kvm_mmu_load_pgd(vcpu);
                if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
                        kvm_vcpu_flush_tlb(vcpu, true);
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
                        goto out;
                }
  
 -              if (inject_pending_event(vcpu, req_int_win) != 0)
 +              if (inject_pending_event(vcpu) != 0)
                        req_immediate_exit = true;
                else {
                        /* Enable SMI/NMI/IRQ window open exits if needed.
        if (vcpu->arch.apic_attention)
                kvm_lapic_sync_from_vapic(vcpu);
  
 -      vcpu->arch.gpa_available = false;
        r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath);
        return r;
  
@@@ -8505,6 -8484,7 +8505,6 @@@ static inline int vcpu_block(struct kv
                break;
        default:
                return -EINTR;
 -              break;
        }
        return 1;
  }
  static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
  {
        if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
 -              kvm_x86_ops->check_nested_events(vcpu, false);
 +              kvm_x86_ops->check_nested_events(vcpu);
  
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
@@@ -8773,7 -8753,7 +8773,7 @@@ static void __get_regs(struct kvm_vcpu 
                 * that usually, but some badly designed PV devices (vmware
                 * backdoor interface) need this to work
                 */
 -              emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
 +              emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
        }
        regs->rax = kvm_rax_read(vcpu);
@@@ -8959,7 -8939,7 +8959,7 @@@ out
  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
                    int reason, bool has_error_code, u32 error_code)
  {
 -      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int ret;
  
        init_emulate_ctxt(vcpu);
@@@ -9291,6 -9271,7 +9291,6 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
        struct page *page;
        int r;
  
 -      vcpu->arch.emulate_ctxt.ops = &emulate_ops;
        if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        else
                                GFP_KERNEL_ACCOUNT))
                goto fail_free_mce_banks;
  
 +      if (!alloc_emulate_ctxt(vcpu))
 +              goto free_wbinvd_dirty_mask;
 +
        vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
                                                GFP_KERNEL_ACCOUNT);
        if (!vcpu->arch.user_fpu) {
                pr_err("kvm: failed to allocate userspace's fpu\n");
 -              goto free_wbinvd_dirty_mask;
 +              goto free_emulate_ctxt;
        }
  
        vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
@@@ -9377,8 -9355,6 +9377,8 @@@ free_guest_fpu
        kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
  free_user_fpu:
        kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
 +free_emulate_ctxt:
 +      kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
  free_wbinvd_dirty_mask:
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
  fail_free_mce_banks:
@@@ -9413,9 -9389,11 +9413,9 @@@ void kvm_arch_vcpu_postcreate(struct kv
  
        mutex_unlock(&vcpu->mutex);
  
 -      if (!kvmclock_periodic_sync)
 -              return;
 -
 -      schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 -                                      KVMCLOCK_SYNC_PERIOD);
 +      if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
 +              schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 +                                              KVMCLOCK_SYNC_PERIOD);
  }
  
  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
  
        kvm_x86_ops->vcpu_free(vcpu);
  
 +      kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
        kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
        kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
@@@ -9630,18 -9607,10 +9630,18 @@@ int kvm_arch_hardware_setup(void
  {
        int r;
  
 +      rdmsrl_safe(MSR_EFER, &host_efer);
 +
 +      if (boot_cpu_has(X86_FEATURE_XSAVES))
 +              rdmsrl(MSR_IA32_XSS, host_xss);
 +
        r = kvm_x86_ops->hardware_setup();
        if (r != 0)
                return r;
  
 +      if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
 +              supported_xss = 0;
 +
        cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
  
        if (kvm_has_tsc_control) {
                kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
        }
  
 -      if (boot_cpu_has(X86_FEATURE_XSAVES))
 -              rdmsrl(MSR_IA32_XSS, host_xss);
 -
        kvm_init_msr_list();
        return 0;
  }
@@@ -9705,13 -9677,6 +9705,13 @@@ void kvm_arch_sched_in(struct kvm_vcpu 
        kvm_x86_ops->sched_in(vcpu, cpu);
  }
  
 +void kvm_arch_free_vm(struct kvm *kvm)
 +{
 +      kfree(kvm->arch.hyperv.hv_pa_pg);
 +      vfree(kvm);
 +}
 +
  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
  {
        if (type)
@@@ -9794,9 -9759,9 +9794,9 @@@ void kvm_arch_sync_events(struct kvm *k
  int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
  {
        int i, r;
 -      unsigned long hva;
 +      unsigned long hva, uninitialized_var(old_npages);
        struct kvm_memslots *slots = kvm_memslots(kvm);
 -      struct kvm_memory_slot *slot, old;
 +      struct kvm_memory_slot *slot;
  
        /* Called with kvm->slots_lock held.  */
        if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
  
        slot = id_to_memslot(slots, id);
        if (size) {
 -              if (slot->npages)
 +              if (slot && slot->npages)
                        return -EEXIST;
  
                /*
                if (IS_ERR((void *)hva))
                        return PTR_ERR((void *)hva);
        } else {
 -              if (!slot->npages)
 +              if (!slot || !slot->npages)
                        return 0;
  
 -              hva = 0;
 +              /*
 +               * Stuff a non-canonical value to catch use-after-delete.  This
 +               * ends up being 0 on 32-bit KVM, but there's no better
 +               * alternative.
 +               */
 +              hva = (unsigned long)(0xdeadull << 48);
 +              old_npages = slot->npages;
        }
  
 -      old = *slot;
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                struct kvm_userspace_memory_region m;
  
        }
  
        if (!size)
 -              vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
 +              vm_munmap(hva, old_npages * PAGE_SIZE);
  
        return 0;
  }
@@@ -9881,36 -9841,34 +9881,36 @@@ void kvm_arch_destroy_vm(struct kvm *kv
        kvm_hv_destroy_vm(kvm);
  }
  
 -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 -                         struct kvm_memory_slot *dont)
 +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
  {
        int i;
  
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 -              if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
 -                      kvfree(free->arch.rmap[i]);
 -                      free->arch.rmap[i] = NULL;
 -              }
 +              kvfree(slot->arch.rmap[i]);
 +              slot->arch.rmap[i] = NULL;
 +
                if (i == 0)
                        continue;
  
 -              if (!dont || free->arch.lpage_info[i - 1] !=
 -                           dont->arch.lpage_info[i - 1]) {
 -                      kvfree(free->arch.lpage_info[i - 1]);
 -                      free->arch.lpage_info[i - 1] = NULL;
 -              }
 +              kvfree(slot->arch.lpage_info[i - 1]);
 +              slot->arch.lpage_info[i - 1] = NULL;
        }
  
 -      kvm_page_track_free_memslot(free, dont);
 +      kvm_page_track_free_memslot(slot);
  }
  
 -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 -                          unsigned long npages)
 +static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
 +                                    unsigned long npages)
  {
        int i;
  
 +      /*
 +       * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
 +       * old arrays will be freed by __kvm_set_memory_region() if installing
 +       * the new memslot is successful.
 +       */
 +      memset(&slot->arch, 0, sizeof(slot->arch));
 +
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                struct kvm_lpage_info *linfo;
                unsigned long ugfn;
                ugfn = slot->userspace_addr >> PAGE_SHIFT;
                /*
                 * If the gfn and userspace address are not aligned wrt each
 -               * other, or if explicitly asked to, disable large page
 -               * support for this slot
 +               * other, disable large page support for this slot.
                 */
 -              if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
 -                  !kvm_largepages_enabled()) {
 +              if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
                        unsigned long j;
  
                        for (j = 0; j < lpages; ++j)
@@@ -9990,9 -9950,6 +9990,9 @@@ int kvm_arch_prepare_memory_region(stru
                                const struct kvm_userspace_memory_region *mem,
                                enum kvm_mr_change change)
  {
 +      if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
 +              return kvm_alloc_memslot_metadata(memslot,
 +                                                mem->memory_size >> PAGE_SHIFT);
        return 0;
  }
  
@@@ -10001,7 -9958,7 +10001,7 @@@ static void kvm_mmu_slot_apply_flags(st
  {
        /* Still write protect RO slot */
        if (new->flags & KVM_MEM_READONLY) {
 -              kvm_mmu_slot_remove_write_access(kvm, new);
 +              kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL);
                return;
        }
  
         * See the comments in fast_page_fault().
         */
        if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
 -              if (kvm_x86_ops->slot_enable_log_dirty)
 +              if (kvm_x86_ops->slot_enable_log_dirty) {
                        kvm_x86_ops->slot_enable_log_dirty(kvm, new);
 -              else
 -                      kvm_mmu_slot_remove_write_access(kvm, new);
 +              } else {
 +                      int level =
 +                              kvm_dirty_log_manual_protect_and_init_set(kvm) ?
 +                              PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL;
 +
 +                      /*
 +                       * With the initially-all-set mode, we don't need
 +                       * to write-protect any small page because they
 +                       * are all reported as dirty already.  However,
 +                       * we still need to write-protect huge pages so
 +                       * that the split can happen lazily on the first
 +                       * write to the huge page.
 +                       */
 +                      kvm_mmu_slot_remove_write_access(kvm, new, level);
 +              }
        } else {
                if (kvm_x86_ops->slot_disable_log_dirty)
                        kvm_x86_ops->slot_disable_log_dirty(kvm, new);
  
  void kvm_arch_commit_memory_region(struct kvm *kvm,
                                const struct kvm_userspace_memory_region *mem,
 -                              const struct kvm_memory_slot *old,
 +                              struct kvm_memory_slot *old,
                                const struct kvm_memory_slot *new,
                                enum kvm_mr_change change)
  {
         */
        if (change != KVM_MR_DELETE)
                kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
 +
 +      /* Free the arrays associated with the old memslot. */
 +      if (change == KVM_MR_MOVE)
 +              kvm_arch_free_memslot(kvm, old);
  }
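
For context on the initially-all-set behaviour described above, userspace opts in per VM through KVM_ENABLE_CAP; a hedged sketch of that call, assuming the 5.7-era uapi names and an already-created VM descriptor (error handling kept minimal):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_initially_set_dirty_log(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
		.args[0] = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
			   KVM_DIRTY_LOG_INITIALLY_SET,
	};

	/* Dirty bitmaps then start as all ones; small pages stay writable until
	 * userspace clears bits with KVM_CLEAR_DIRTY_LOG, while huge pages are
	 * still write-protected up front so they can be split lazily. */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0) {
		perror("KVM_ENABLE_CAP");
		return -1;
	}
	return 0;
}
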
  
  void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@@ -10251,7 -10191,7 +10251,7 @@@ void kvm_arch_async_page_ready(struct k
                return;
  
        if (!vcpu->arch.mmu->direct_map &&
 -            work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
 +            work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
                return;
  
        kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
@@@ -10574,5 -10514,4 +10574,5 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_fu
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
 +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
diff --combined include/linux/kvm_host.h
@@@ -360,10 -360,6 +360,10 @@@ static inline unsigned long *kvm_second
        return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap);
  }
  
 +#ifndef KVM_DIRTY_LOG_MANUAL_CAPS
 +#define KVM_DIRTY_LOG_MANUAL_CAPS KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE
 +#endif
 +
  struct kvm_s390_adapter_int {
        u64 ind_addr;
        u64 summary_addr;
@@@ -435,11 -431,11 +435,11 @@@ static inline int kvm_arch_vcpu_memslot
   */
  struct kvm_memslots {
        u64 generation;
 -      struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM];
        /* The mapping table from slot id to the index in memslots[]. */
        short id_to_index[KVM_MEM_SLOTS_NUM];
        atomic_t lru_slot;
        int used_slots;
 +      struct kvm_memory_slot memslots[];
  };
  
  struct kvm {
  #endif
        long tlbs_dirty;
        struct list_head devices;
 -      bool manual_dirty_log_protect;
 +      u64 manual_dirty_log_protect;
        struct dentry *debugfs_dentry;
        struct kvm_stat_data **debugfs_stat_data;
        struct srcu_struct srcu;
  #define vcpu_err(vcpu, fmt, ...)                                      \
        kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
  
 +static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm)
 +{
 +      return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET);
 +}
 +
  static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx)
  {
        return srcu_dereference_check(kvm->buses[idx], &kvm->srcu,
@@@ -581,11 -572,10 +581,11 @@@ static inline int kvm_vcpu_get_idx(stru
        return vcpu->vcpu_idx;
  }
  
 -#define kvm_for_each_memslot(memslot, slots)  \
 -      for (memslot = &slots->memslots[0];     \
 -            memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\
 -              memslot++)
 +#define kvm_for_each_memslot(memslot, slots)                          \
 +      for (memslot = &slots->memslots[0];                             \
 +           memslot < slots->memslots + slots->used_slots; memslot++)  \
 +              if (WARN_ON_ONCE(!memslot->npages)) {                   \
 +              } else
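
The dangling 'else' in the rewritten iterator above keeps a single-statement body binding correctly while skipping (and warning about) an unexpectedly empty slot; a usage sketch and its rough expansion, assuming a struct kvm_memslots *slots in scope and a hypothetical mark_slot_seen() helper:

struct kvm_memory_slot *memslot;

kvm_for_each_memslot(memslot, slots)
	mark_slot_seen(memslot);

/* ... expands roughly to: */
for (memslot = &slots->memslots[0];
     memslot < slots->memslots + slots->used_slots; memslot++)
	if (WARN_ON_ONCE(!memslot->npages)) {
		/* empty slot inside the used range: warn once and skip it */
	} else
		mark_slot_seen(memslot);
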
  
  void kvm_vcpu_destroy(struct kvm_vcpu *vcpu);
  
@@@ -645,15 -635,12 +645,15 @@@ static inline struct kvm_memslots *kvm_
        return __kvm_memslots(vcpu->kvm, as_id);
  }
  
 -static inline struct kvm_memory_slot *
 -id_to_memslot(struct kvm_memslots *slots, int id)
 +static inline
 +struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id)
  {
        int index = slots->id_to_index[id];
        struct kvm_memory_slot *slot;
  
 +      if (index < 0)
 +              return NULL;
 +
        slot = &slots->memslots[index];
  
        WARN_ON(slot->id != id);
@@@ -682,7 -669,10 +682,7 @@@ int kvm_set_memory_region(struct kvm *k
                          const struct kvm_userspace_memory_region *mem);
  int __kvm_set_memory_region(struct kvm *kvm,
                            const struct kvm_userspace_memory_region *mem);
 -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 -                         struct kvm_memory_slot *dont);
 -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 -                          unsigned long npages);
 +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
  void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
                                enum kvm_mr_change change);
  void kvm_arch_commit_memory_region(struct kvm *kvm,
                                const struct kvm_userspace_memory_region *mem,
 -                              const struct kvm_memory_slot *old,
 +                              struct kvm_memory_slot *old,
                                const struct kvm_memory_slot *new,
                                enum kvm_mr_change change);
 -bool kvm_largepages_enabled(void);
 -void kvm_disable_largepages(void);
  /* flush all memory translations */
  void kvm_arch_flush_shadow_all(struct kvm *kvm);
  /* flush memory translations pointing to 'slot' */
@@@ -712,6 -704,7 +712,6 @@@ void kvm_release_page_clean(struct pag
  void kvm_release_page_dirty(struct page *page);
  void kvm_set_page_accessed(struct page *page);
  
 -kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
  kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
  kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
                      bool *writable);
@@@ -826,20 -819,23 +826,20 @@@ vm_fault_t kvm_arch_vcpu_fault(struct k
  
  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext);
  
 -int kvm_get_dirty_log(struct kvm *kvm,
 -                      struct kvm_dirty_log *log, int *is_dirty);
 -
 -int kvm_get_dirty_log_protect(struct kvm *kvm,
 -                            struct kvm_dirty_log *log, bool *flush);
 -int kvm_clear_dirty_log_protect(struct kvm *kvm,
 -                              struct kvm_clear_dirty_log *log, bool *flush);
 -
  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                                        struct kvm_memory_slot *slot,
                                        gfn_t gfn_offset,
                                        unsigned long mask);
 -
 -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 -                              struct kvm_dirty_log *log);
 -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
 -                                struct kvm_clear_dirty_log *log);
 +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);
 +
 +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
 +                                      struct kvm_memory_slot *memslot);
 +#else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);
 +int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
 +                    int *is_dirty, struct kvm_memory_slot **memslot);
 +#endif
  
  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
                        bool line_status);
@@@ -1022,8 -1018,6 +1022,8 @@@ bool kvm_arch_irqfd_allowed(struct kvm 
   * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
   * gfn_to_memslot() itself isn't here as an inline because that would
   * bloat other code too much.
 + *
 + * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!
   */
  static inline struct kvm_memory_slot *
  search_memslots(struct kvm_memslots *slots, gfn_t gfn)
        int slot = atomic_read(&slots->lru_slot);
        struct kvm_memory_slot *memslots = slots->memslots;
  
 +      if (unlikely(!slots->used_slots))
 +              return NULL;
 +
        if (gfn >= memslots[slot].base_gfn &&
            gfn < memslots[slot].base_gfn + memslots[slot].npages)
                return &memslots[slot];
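
Given the guarantee documented above that slots are sorted by descending base GFN, the lookup that follows the LRU fast path can binary-search that ordering; a hedged sketch of such a search (the in-tree helper differs in detail):

static inline struct kvm_memory_slot *
sketch_search_sorted_memslots(struct kvm_memslots *slots, gfn_t gfn)
{
	struct kvm_memory_slot *memslots = slots->memslots;
	int start = 0, end = slots->used_slots;

	while (start < end) {
		int mid = start + (end - start) / 2;

		if (gfn >= memslots[mid].base_gfn)
			end = mid;		/* containing slot is at mid or earlier */
		else
			start = mid + 1;
	}

	if (start < slots->used_slots &&
	    gfn >= memslots[start].base_gfn &&
	    gfn < memslots[start].base_gfn + memslots[start].npages)
		return &memslots[start];

	return NULL;
}
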
@@@ -1353,7 -1344,7 +1353,7 @@@ static inline void kvm_vcpu_set_dy_elig
  #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
  
  struct kvm_vcpu *kvm_get_running_vcpu(void);
- struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
+ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
  
  #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
  bool kvm_arch_has_irq_bypass(void);
diff --combined virt/kvm/arm/arm.c
@@@ -625,6 -625,14 +625,14 @@@ static void check_vcpu_requests(struct 
  
                if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
                        kvm_update_stolen_time(vcpu);
+               if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
+                       /* The distributor enable bits were changed */
+                       preempt_disable();
+                       vgic_v4_put(vcpu, false);
+                       vgic_v4_load(vcpu);
+                       preempt_enable();
+               }
        }
  }
  
@@@ -742,9 -750,7 +750,7 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
                guest_enter_irqoff();
  
                if (has_vhe()) {
-                       kvm_arm_vhe_guest_enter();
                        ret = kvm_vcpu_run_vhe(vcpu);
-                       kvm_arm_vhe_guest_exit();
                } else {
                        ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu);
                }
@@@ -1183,15 -1189,55 +1189,15 @@@ long kvm_arch_vcpu_ioctl(struct file *f
        return r;
  }
  
 -/**
 - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 - * @kvm: kvm instance
 - * @log: slot id and address to which we copy the log
 - *
 - * Steps 1-4 below provide general overview of dirty page logging. See
 - * kvm_get_dirty_log_protect() function description for additional details.
 - *
 - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
 - * always flush the TLB (step 4) even if previous step failed  and the dirty
 - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
 - * does not preclude user space subsequent dirty log read. Flushing TLB ensures
 - * writes will be marked dirty for next log read.
 - *
 - *   1. Take a snapshot of the bit and clear it if needed.
 - *   2. Write protect the corresponding page.
 - *   3. Copy the snapshot to the userspace.
 - *   4. Flush TLB's if needed.
 - */
 -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
  {
 -      bool flush = false;
 -      int r;
 -
 -      mutex_lock(&kvm->slots_lock);
 -
 -      r = kvm_get_dirty_log_protect(kvm, log, &flush);
  
 -      if (flush)
 -              kvm_flush_remote_tlbs(kvm);
 -
 -      mutex_unlock(&kvm->slots_lock);
 -      return r;
  }
  
 -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
 +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
 +                                      struct kvm_memory_slot *memslot)
  {
 -      bool flush = false;
 -      int r;
 -
 -      mutex_lock(&kvm->slots_lock);
 -
 -      r = kvm_clear_dirty_log_protect(kvm, log, &flush);
 -
 -      if (flush)
 -              kvm_flush_remote_tlbs(kvm);
 -
 -      mutex_unlock(&kvm->slots_lock);
 -      return r;
 +      kvm_flush_remote_tlbs(kvm);
  }
  
  static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,