From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 31 Mar 2020 14:44:53 +0000 (-0400)
Subject: Merge tag 'kvmarm-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm... 
X-Git-Tag: v5.10.7~2921^2~13
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=cf39d37539068d53e015d8b4f1dcf42c65306b0d;hp=-c;p=platform%2Fkernel%2Flinux-rpi.git

Merge tag 'kvmarm-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD

KVM/arm updates for Linux 5.7

- GICv4.1 support
- 32bit host removal
---

cf39d37539068d53e015d8b4f1dcf42c65306b0d
diff --combined Documentation/admin-guide/kernel-parameters.txt
index b0beae9,c07815d..144c130
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@@ -136,6 -136,10 +136,10 @@@
  			dynamic table installation which will install SSDT
  			tables to /sys/firmware/acpi/tables/dynamic.
  
+ 	acpi_no_watchdog	[HW,ACPI,WDT]
+ 			Ignore the ACPI-based watchdog interface (WDAT) and let
+ 			a native driver control the watchdog device instead.
+ 
  	acpi_rsdp=	[ACPI,EFI,KEXEC]
  			Pass the RSDP address to the kernel, mostly used
  			on machines running EFI runtime service to boot the
@@@ -3795,11 -3799,6 +3799,11 @@@
  			before loading.
  			See Documentation/admin-guide/blockdev/ramdisk.rst.
  
 +	prot_virt=	[S390] enable hosting protected virtual machines
 +			isolated from the hypervisor (if hardware supports
 +			that).
 +			Format: <bool>
 +
  	psi=		[KNL] Enable or disable pressure stall information
  			tracking.
  			Format: <bool>
diff --combined MAINTAINERS
index 97a7064,e84a94e..d87d009
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@@ -3649,6 -3649,7 +3649,7 @@@ F:	sound/pci/oxygen
  
  C-SKY ARCHITECTURE
  M:	Guo Ren <guoren@kernel.org>
+ L:	linux-csky@vger.kernel.org
  T:	git https://github.com/c-sky/csky-linux.git
  S:	Supported
  F:	arch/csky/
@@@ -3909,7 -3910,7 +3910,7 @@@ S:	Supporte
  F:	Documentation/filesystems/ceph.txt
  F:	fs/ceph/
  
- CERTIFICATE HANDLING:
+ CERTIFICATE HANDLING
  M:	David Howells <dhowells@redhat.com>
  M:	David Woodhouse <dwmw2@infradead.org>
  L:	keyrings@vger.kernel.org
@@@ -3919,7 -3920,7 +3920,7 @@@ F:	certs
  F:	scripts/sign-file.c
  F:	scripts/extract-cert.c
  
- CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
+ CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM
  L:	devel@driverdev.osuosl.org
  S:	Obsolete
  F:	drivers/staging/wusbcore/
@@@ -5932,12 -5933,12 +5933,12 @@@ S:	Maintaine
  F:	drivers/media/dvb-frontends/ec100*
  
  ECRYPT FILE SYSTEM
- M:	Tyler Hicks <tyhicks@canonical.com>
+ M:	Tyler Hicks <code@tyhicks.com>
  L:	ecryptfs@vger.kernel.org
  W:	http://ecryptfs.org
  W:	https://launchpad.net/ecryptfs
  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tyhicks/ecryptfs.git
- S:	Supported
+ S:	Odd Fixes
  F:	Documentation/filesystems/ecryptfs.txt
  F:	fs/ecryptfs/
  
@@@ -7047,7 -7048,7 +7048,7 @@@ L:	kvm@vger.kernel.or
  S:	Supported
  F:	drivers/uio/uio_pci_generic.c
  
- GENERIC VDSO LIBRARY:
+ GENERIC VDSO LIBRARY
  M:	Andy Lutomirski <luto@kernel.org>
  M:	Thomas Gleixner <tglx@linutronix.de>
  M:	Vincenzo Frascino <vincenzo.frascino@arm.com>
@@@ -8392,7 -8393,7 +8393,7 @@@ M:	Joonas Lahtinen <joonas.lahtinen@lin
  M:	Rodrigo Vivi <rodrigo.vivi@intel.com>
  L:	intel-gfx@lists.freedesktop.org
  W:	https://01.org/linuxgraphics/
- B:	https://01.org/linuxgraphics/documentation/how-report-bugs
+ B:	https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs
  C:	irc://chat.freenode.net/intel-gfx
  Q:	http://patchwork.freedesktop.org/project/intel-gfx/
  T:	git git://anongit.freedesktop.org/drm-intel
@@@ -9163,7 -9164,7 +9164,7 @@@ F:	virt/kvm/
  F:	tools/kvm/
  F:	tools/testing/selftests/kvm/
  
- KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64)
+ KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
  M:	Marc Zyngier <maz@kernel.org>
  R:	James Morse <james.morse@arm.com>
  R:	Julien Thierry <julien.thierry.kdev@gmail.com>
@@@ -9172,9 -9173,6 +9173,6 @@@ L:	linux-arm-kernel@lists.infradead.or
  L:	kvmarm@lists.cs.columbia.edu
  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
  S:	Maintained
- F:	arch/arm/include/uapi/asm/kvm*
- F:	arch/arm/include/asm/kvm*
- F:	arch/arm/kvm/
  F:	arch/arm64/include/uapi/asm/kvm*
  F:	arch/arm64/include/asm/kvm*
  F:	arch/arm64/kvm/
@@@ -9209,7 -9207,6 +9207,7 @@@ L:	kvm@vger.kernel.or
  W:	http://www.ibm.com/developerworks/linux/linux390/
  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git
  S:	Supported
 +F:	Documentation/virt/kvm/s390*
  F:	arch/s390/include/uapi/asm/kvm*
  F:	arch/s390/include/asm/gmap.h
  F:	arch/s390/include/asm/kvm*
@@@ -9279,7 -9276,7 +9277,7 @@@ F:	include/keys/trusted-type.
  F:	security/keys/trusted.c
  F:	include/keys/trusted.h
  
- KEYS/KEYRINGS:
+ KEYS/KEYRINGS
  M:	David Howells <dhowells@redhat.com>
  M:	Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
  L:	keyrings@vger.kernel.org
@@@ -11115,14 -11112,12 +11113,12 @@@ S:	Maintaine
  F:	drivers/usb/image/microtek.*
  
  MIPS
- M:	Ralf Baechle <ralf@linux-mips.org>
- M:	Paul Burton <paulburton@kernel.org>
+ M:	Thomas Bogendoerfer <tsbogend@alpha.franken.de>
  L:	linux-mips@vger.kernel.org
  W:	http://www.linux-mips.org/
- T:	git git://git.linux-mips.org/pub/scm/ralf/linux.git
  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git
  Q:	http://patchwork.linux-mips.org/project/linux-mips/list/
- S:	Supported
+ S:	Maintained
  F:	Documentation/devicetree/bindings/mips/
  F:	Documentation/mips/
  F:	arch/mips/
@@@ -11485,7 -11480,7 +11481,7 @@@ F:	drivers/scsi/mac_scsi.
  F:	drivers/scsi/sun3_scsi.*
  F:	drivers/scsi/sun3_scsi_vme.c
  
- NCSI LIBRARY:
+ NCSI LIBRARY
  M:	Samuel Mendoza-Jonas <sam@mendozajonas.com>
  S:	Maintained
  F:	net/ncsi/
@@@ -12741,7 -12736,7 +12737,7 @@@ M:	Tom Joseph <tjoseph@cadence.com
  L:	linux-pci@vger.kernel.org
  S:	Maintained
  F:	Documentation/devicetree/bindings/pci/cdns,*.txt
- F:	drivers/pci/controller/pcie-cadence*
+ F:	drivers/pci/controller/cadence/
  
  PCI DRIVER FOR FREESCALE LAYERSCAPE
  M:	Minghuan Lian <minghuan.Lian@nxp.com>
@@@ -13513,7 -13508,7 +13509,7 @@@ L:	linuxppc-dev@lists.ozlabs.or
  S:	Maintained
  F:	drivers/block/ps3vram.c
  
- PSAMPLE PACKET SAMPLING SUPPORT:
+ PSAMPLE PACKET SAMPLING SUPPORT
  M:	Yotam Gigi <yotam.gi@gmail.com>
  S:	Maintained
  F:	net/psample
@@@ -14583,10 -14578,10 +14579,10 @@@ F:	drivers/media/pci/saa7146
  F:	include/media/drv-intf/saa7146*
  
  SAFESETID SECURITY MODULE
- M:     Micah Morton <mortonm@chromium.org>
- S:     Supported
- F:     security/safesetid/
- F:     Documentation/admin-guide/LSM/SafeSetID.rst
+ M:	Micah Morton <mortonm@chromium.org>
+ S:	Supported
+ F:	security/safesetid/
+ F:	Documentation/admin-guide/LSM/SafeSetID.rst
  
  SAMSUNG AUDIO (ASoC) DRIVERS
  M:	Krzysztof Kozlowski <krzk@kernel.org>
@@@ -16553,8 -16548,8 +16549,8 @@@ M:	Michael Jamet <michael.jamet@intel.c
  M:	Mika Westerberg <mika.westerberg@linux.intel.com>
  M:	Yehezkel Bernat <YehezkelShB@gmail.com>
  L:	linux-usb@vger.kernel.org
- T:	git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
  S:	Maintained
+ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
  F:	Documentation/admin-guide/thunderbolt.rst
  F:	drivers/thunderbolt/
  F:	include/linux/thunderbolt.h
@@@ -17081,7 -17076,7 +17077,7 @@@ S:	Maintaine
  F:	Documentation/admin-guide/ufs.rst
  F:	fs/ufs/
  
- UHID USERSPACE HID IO DRIVER:
+ UHID USERSPACE HID IO DRIVER
  M:	David Herrmann <dh.herrmann@googlemail.com>
  L:	linux-input@vger.kernel.org
  S:	Maintained
@@@ -17095,18 -17090,18 +17091,18 @@@ S:	Maintaine
  F:	drivers/usb/common/ulpi.c
  F:	include/linux/ulpi/
  
- ULTRA-WIDEBAND (UWB) SUBSYSTEM:
+ ULTRA-WIDEBAND (UWB) SUBSYSTEM
  L:	devel@driverdev.osuosl.org
  S:	Obsolete
  F:	drivers/staging/uwb/
  
- UNICODE SUBSYSTEM:
+ UNICODE SUBSYSTEM
  M:	Gabriel Krisman Bertazi <krisman@collabora.com>
  L:	linux-fsdevel@vger.kernel.org
  S:	Supported
  F:	fs/unicode/
  
- UNICORE32 ARCHITECTURE:
+ UNICORE32 ARCHITECTURE
  M:	Guan Xuetao <gxt@pku.edu.cn>
  W:	http://mprc.pku.edu.cn/~guanxuetao/linux
  S:	Maintained
@@@ -17393,11 -17388,14 +17389,14 @@@ F:	drivers/usb
  F:	include/linux/usb.h
  F:	include/linux/usb/
  
- USB TYPEC PI3USB30532 MUX DRIVER
- M:	Hans de Goede <hdegoede@redhat.com>
+ USB TYPEC BUS FOR ALTERNATE MODES
+ M:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
  L:	linux-usb@vger.kernel.org
  S:	Maintained
- F:	drivers/usb/typec/mux/pi3usb30532.c
+ F:	Documentation/ABI/testing/sysfs-bus-typec
+ F:	Documentation/driver-api/usb/typec_bus.rst
+ F:	drivers/usb/typec/altmodes/
+ F:	include/linux/usb/typec_altmode.h
  
  USB TYPEC CLASS
  M:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
@@@ -17408,14 -17406,11 +17407,11 @@@ F:	Documentation/driver-api/usb/typec.r
  F:	drivers/usb/typec/
  F:	include/linux/usb/typec.h
  
- USB TYPEC BUS FOR ALTERNATE MODES
- M:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+ USB TYPEC PI3USB30532 MUX DRIVER
+ M:	Hans de Goede <hdegoede@redhat.com>
  L:	linux-usb@vger.kernel.org
  S:	Maintained
- F:	Documentation/ABI/testing/sysfs-bus-typec
- F:	Documentation/driver-api/usb/typec_bus.rst
- F:	drivers/usb/typec/altmodes/
- F:	include/linux/usb/typec_altmode.h
+ F:	drivers/usb/typec/mux/pi3usb30532.c
  
  USB TYPEC PORT CONTROLLER DRIVERS
  M:	Guenter Roeck <linux@roeck-us.net>
@@@ -17792,7 -17787,7 +17788,7 @@@ F:	include/linux/vbox_utils.
  F:	include/uapi/linux/vbox*.h
  F:	drivers/virt/vboxguest/
  
- VIRTUAL BOX SHARED FOLDER VFS DRIVER:
+ VIRTUAL BOX SHARED FOLDER VFS DRIVER
  M:	Hans de Goede <hdegoede@redhat.com>
  L:	linux-fsdevel@vger.kernel.org
  S:	Maintained
diff --combined arch/arm64/kvm/hyp/switch.c
index f3e0ab9,925086b..600010cd6
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@@ -17,6 -17,7 +17,6 @@@
  #include <asm/kprobes.h>
  #include <asm/kvm_asm.h>
  #include <asm/kvm_emulate.h>
 -#include <asm/kvm_host.h>
  #include <asm/kvm_hyp.h>
  #include <asm/kvm_mmu.h>
  #include <asm/fpsimd.h>
@@@ -624,7 -625,7 +624,7 @@@ static void __hyp_text __pmu_switch_to_
  }
  
  /* Switch to the guest for VHE systems running in EL2 */
- int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
+ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
  {
  	struct kvm_cpu_context *host_ctxt;
  	struct kvm_cpu_context *guest_ctxt;
@@@ -677,7 -678,42 +677,42 @@@
  
  	return exit_code;
  }
- NOKPROBE_SYMBOL(kvm_vcpu_run_vhe);
+ NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe);
+ 
+ int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
+ {
+ 	int ret;
+ 
+ 	local_daif_mask();
+ 
+ 	/*
+ 	 * Having IRQs masked via PMR when entering the guest means the GIC
+ 	 * will not signal the CPU of interrupts of lower priority, and the
+ 	 * only way to get out will be via guest exceptions.
+ 	 * Naturally, we want to avoid this.
+ 	 *
+ 	 * local_daif_mask() already sets GIC_PRIO_PSR_I_SET, we just need a
+ 	 * dsb to ensure the redistributor forwards EL2 IRQs to the CPU.
+ 	 */
+ 	pmr_sync();
+ 
+ 	ret = __kvm_vcpu_run_vhe(vcpu);
+ 
+ 	/*
+ 	 * local_daif_restore() takes care to properly restore PSTATE.DAIF
+ 	 * and the GIC PMR if the host is using IRQ priorities.
+ 	 */
+ 	local_daif_restore(DAIF_PROCCTX_NOIRQ);
+ 
+ 	/*
+ 	 * When we exit from the guest we change a number of CPU configuration
+ 	 * parameters, such as traps.  Make sure these changes take effect
+ 	 * before running the host or additional guests.
+ 	 */
+ 	isb();
+ 
+ 	return ret;
+ }
  
  /* Switch to the guest for legacy non-VHE systems */
  int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
diff --combined arch/s390/boot/Makefile
index 30f1811,0ff9261..45b33b8
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@@ -37,7 -37,7 +37,7 @@@ CFLAGS_sclp_early_core.o += -I$(srctree
  obj-y	:= head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o
  obj-y	+= string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
  obj-y	+= version.o pgm_check_info.o ctype.o text_dma.o
 -obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST)	+= uv.o
 +obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))	+= uv.o
  obj-$(CONFIG_RELOCATABLE)	+= machine_kexec_reloc.o
  obj-$(CONFIG_RANDOMIZE_BASE)	+= kaslr.o
  targets	:= bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
@@@ -70,7 -70,7 +70,7 @@@ $(obj)/compressed/vmlinux: $(obj)/start
  $(obj)/startup.a: $(OBJECTS) FORCE
  	$(call if_changed,ar)
  
- install: $(CONFIGURE) $(obj)/bzImage
+ install:
  	sh -x  $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
  	      System.map "$(INSTALL_PATH)"
  
diff --combined arch/s390/include/asm/page.h
index 4ebcf89,1019efd..62440a8
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@@ -42,7 -42,7 +42,7 @@@ void __storage_key_init_range(unsigned 
  
  static inline void storage_key_init_range(unsigned long start, unsigned long end)
  {
- 	if (PAGE_DEFAULT_KEY)
+ 	if (PAGE_DEFAULT_KEY != 0)
  		__storage_key_init_range(start, end);
  }
  
@@@ -153,11 -153,6 +153,11 @@@ static inline int devmem_is_allowed(uns
  #define HAVE_ARCH_FREE_PAGE
  #define HAVE_ARCH_ALLOC_PAGE
  
 +#if IS_ENABLED(CONFIG_PGSTE)
 +int arch_make_page_accessible(struct page *page);
 +#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
 +#endif
 +
  #endif /* !__ASSEMBLY__ */
  
  #define __PAGE_OFFSET		0x0UL
diff --combined arch/x86/kvm/svm.c
index 2125c6a,24c0b2b..05cb45b
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@@ -57,11 -57,13 +57,13 @@@
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
  
+ #ifdef MODULE
  static const struct x86_cpu_id svm_cpu_id[] = {
  	X86_FEATURE_MATCH(X86_FEATURE_SVM),
  	{}
  };
  MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+ #endif
  
  #define IOPM_ALLOC_ORDER 2
  #define MSRPM_ALLOC_ORDER 1
@@@ -519,31 -521,10 +521,31 @@@ static void recalc_intercepts(struct vc
  	h = &svm->nested.hsave->control;
  	g = &svm->nested;
  
 -	c->intercept_cr = h->intercept_cr | g->intercept_cr;
 -	c->intercept_dr = h->intercept_dr | g->intercept_dr;
 -	c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 -	c->intercept = h->intercept | g->intercept;
 +	c->intercept_cr = h->intercept_cr;
 +	c->intercept_dr = h->intercept_dr;
 +	c->intercept_exceptions = h->intercept_exceptions;
 +	c->intercept = h->intercept;
 +
 +	if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
 +		/* We only want the cr8 intercept bits of L1 */
 +		c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
 +		c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
 +
 +		/*
 +		 * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
 +		 * affect any interrupt we may want to inject; therefore,
 +		 * interrupt window vmexits are irrelevant to L0.
 +		 */
 +		c->intercept &= ~(1ULL << INTERCEPT_VINTR);
 +	}
 +
 +	/* We don't want to see VMMCALLs from a nested guest */
 +	c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);
 +
 +	c->intercept_cr |= g->intercept_cr;
 +	c->intercept_dr |= g->intercept_dr;
 +	c->intercept_exceptions |= g->intercept_exceptions;
 +	c->intercept |= g->intercept;
  }
  
  static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
@@@ -648,11 -629,6 +650,11 @@@ static inline void clr_intercept(struc
  	recalc_intercepts(svm);
  }
  
 +static inline bool is_intercept(struct vcpu_svm *svm, int bit)
 +{
 +	return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
 +}
 +
  static inline bool vgif_enabled(struct vcpu_svm *svm)
  {
  	return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
@@@ -1232,7 -1208,6 +1234,7 @@@ static int avic_ga_log_notifier(u32 ga_
  	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
  
  	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
 +	trace_kvm_avic_ga_log(vm_id, vcpu_id);
  
  	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
  	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
@@@ -1394,29 -1369,6 +1396,29 @@@ static void svm_hardware_teardown(void
  	iopm_base = 0;
  }
  
 +static __init void svm_set_cpu_caps(void)
 +{
 +	kvm_set_cpu_caps();
 +
 +	supported_xss = 0;
 +
 +	/* CPUID 0x80000001 and 0x8000000A (SVM features) */
 +	if (nested) {
 +		kvm_cpu_cap_set(X86_FEATURE_SVM);
 +
 +		if (nrips)
 +			kvm_cpu_cap_set(X86_FEATURE_NRIPS);
 +
 +		if (npt_enabled)
 +			kvm_cpu_cap_set(X86_FEATURE_NPT);
 +	}
 +
 +	/* CPUID 0x80000008 */
 +	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
 +	    boot_cpu_has(X86_FEATURE_AMD_SSBD))
 +		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
 +}
 +
  static __init int svm_hardware_setup(void)
  {
  	int cpu;
@@@ -1435,8 -1387,6 +1437,8 @@@
  
  	init_msrpm_offsets();
  
 +	supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
 +
  	if (boot_cpu_has(X86_FEATURE_NX))
  		kvm_enable_efer_bits(EFER_NX);
  
@@@ -1484,11 -1434,16 +1486,11 @@@
  	if (!boot_cpu_has(X86_FEATURE_NPT))
  		npt_enabled = false;
  
 -	if (npt_enabled && !npt) {
 -		printk(KERN_INFO "kvm: Nested Paging disabled\n");
 +	if (npt_enabled && !npt)
  		npt_enabled = false;
 -	}
  
 -	if (npt_enabled) {
 -		printk(KERN_INFO "kvm: Nested Paging enabled\n");
 -		kvm_enable_tdp();
 -	} else
 -		kvm_disable_tdp();
 +	kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL);
 +	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
  
  	if (nrips) {
  		if (!boot_cpu_has(X86_FEATURE_NRIPS))
@@@ -1524,8 -1479,6 +1526,8 @@@
  			pr_info("Virtual GIF supported\n");
  	}
  
 +	svm_set_cpu_caps();
 +
  	return 0;
  
  err:
@@@ -1993,6 -1946,19 +1995,6 @@@ static void __unregister_enc_region_loc
  	kfree(region);
  }
  
 -static struct kvm *svm_vm_alloc(void)
 -{
 -	struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm),
 -					    GFP_KERNEL_ACCOUNT | __GFP_ZERO,
 -					    PAGE_KERNEL);
 -	return &kvm_svm->kvm;
 -}
 -
 -static void svm_vm_free(struct kvm *kvm)
 -{
 -	vfree(to_kvm_svm(kvm));
 -}
 -
  static void sev_vm_destroy(struct kvm *kvm)
  {
  	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@@ -2220,7 -2186,7 +2222,7 @@@ static void svm_vcpu_reset(struct kvm_v
  	}
  	init_vmcb(svm);
  
 -	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
 +	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
  	kvm_rdx_write(vcpu, eax);
  
  	if (kvm_vcpu_apicv_active(vcpu) && !init_event)
@@@ -2230,8 -2196,9 +2232,9 @@@
  static int avic_init_vcpu(struct vcpu_svm *svm)
  {
  	int ret;
+ 	struct kvm_vcpu *vcpu = &svm->vcpu;
  
- 	if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ 	if (!avic || !irqchip_in_kernel(vcpu->kvm))
  		return 0;
  
  	ret = avic_init_backing_page(&svm->vcpu);
@@@ -2453,38 -2420,14 +2456,38 @@@ static void svm_cache_reg(struct kvm_vc
  	}
  }
  
 +static inline void svm_enable_vintr(struct vcpu_svm *svm)
 +{
 +	struct vmcb_control_area *control;
 +
 +	/* The following fields are ignored when AVIC is enabled */
 +	WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
 +
 +	/*
 +	 * This is just a dummy VINTR to actually cause a vmexit to happen.
 +	 * Actual injection of virtual interrupts happens through EVENTINJ.
 +	 */
 +	control = &svm->vmcb->control;
 +	control->int_vector = 0x0;
 +	control->int_ctl &= ~V_INTR_PRIO_MASK;
 +	control->int_ctl |= V_IRQ_MASK |
 +		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
 +	mark_dirty(svm->vmcb, VMCB_INTR);
 +}
 +
  static void svm_set_vintr(struct vcpu_svm *svm)
  {
  	set_intercept(svm, INTERCEPT_VINTR);
 +	if (is_intercept(svm, INTERCEPT_VINTR))
 +		svm_enable_vintr(svm);
  }
  
  static void svm_clear_vintr(struct vcpu_svm *svm)
  {
  	clr_intercept(svm, INTERCEPT_VINTR);
 +
 +	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 +	mark_dirty(svm->vmcb, VMCB_INTR);
  }
  
  static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@@ -3040,6 -2983,15 +3043,6 @@@ static u64 nested_svm_get_tdp_pdptr(str
  	return pdpte;
  }
  
 -static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 -				   unsigned long root)
 -{
 -	struct vcpu_svm *svm = to_svm(vcpu);
 -
 -	svm->vmcb->control.nested_cr3 = __sme_set(root);
 -	mark_dirty(svm->vmcb, VMCB_NPT);
 -}
 -
  static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
  				       struct x86_exception *fault)
  {
@@@ -3075,7 -3027,8 +3078,7 @@@ static void nested_svm_init_mmu_context
  
  	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
  	kvm_init_shadow_mmu(vcpu);
 -	vcpu->arch.mmu->set_cr3           = nested_svm_set_tdp_cr3;
 -	vcpu->arch.mmu->get_cr3           = nested_svm_get_tdp_cr3;
 +	vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
  	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
  	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
  	vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu);
@@@ -3136,36 -3089,43 +3139,36 @@@ static int nested_svm_check_exception(s
  	return vmexit;
  }
  
 -/* This function returns true if it is save to enable the irq window */
 -static inline bool nested_svm_intr(struct vcpu_svm *svm)
 +static void nested_svm_intr(struct vcpu_svm *svm)
  {
 -	if (!is_guest_mode(&svm->vcpu))
 -		return true;
 -
 -	if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
 -		return true;
 -
 -	if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
 -		return false;
 -
 -	/*
 -	 * if vmexit was already requested (by intercepted exception
 -	 * for instance) do not overwrite it with "external interrupt"
 -	 * vmexit.
 -	 */
 -	if (svm->nested.exit_required)
 -		return false;
 -
  	svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
  	svm->vmcb->control.exit_info_1 = 0;
  	svm->vmcb->control.exit_info_2 = 0;
  
 -	if (svm->nested.intercept & 1ULL) {
 -		/*
 -		 * The #vmexit can't be emulated here directly because this
 -		 * code path runs with irqs and preemption disabled. A
 -		 * #vmexit emulation might sleep. Only signal request for
 -		 * the #vmexit here.
 -		 */
 -		svm->nested.exit_required = true;
 -		trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
 -		return false;
 +	/* nested_svm_vmexit gets called afterwards from handle_exit */
 +	svm->nested.exit_required = true;
 +	trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
 +}
 +
 +static bool nested_exit_on_intr(struct vcpu_svm *svm)
 +{
 +	return (svm->nested.intercept & 1ULL);
 +}
 +
 +static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 +{
 +	struct vcpu_svm *svm = to_svm(vcpu);
 +	bool block_nested_events =
 +		kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required;
 +
 +	if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) {
 +		if (block_nested_events)
 +			return -EBUSY;
 +		nested_svm_intr(svm);
 +		return 0;
  	}
  
 -	return true;
 +	return 0;
  }
  
  /* This function returns true if it is save to enable the nmi window */
@@@ -3284,6 -3244,9 +3287,6 @@@ static int nested_svm_exit_special(stru
  	return NESTED_EXIT_CONTINUE;
  }
  
 -/*
 - * If this function returns true, this #vmexit was already handled
 - */
  static int nested_svm_intercept(struct vcpu_svm *svm)
  {
  	u32 exit_code = svm->vmcb->control.exit_code;
@@@ -3558,9 -3521,6 +3561,9 @@@ static bool nested_svm_vmrun_msrpm(stru
  
  static bool nested_vmcb_checks(struct vmcb *vmcb)
  {
 +	if ((vmcb->save.efer & EFER_SVME) == 0)
 +		return false;
 +
  	if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
  		return false;
  
@@@ -3577,10 -3537,6 +3580,10 @@@
  static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
  				 struct vmcb *nested_vmcb, struct kvm_host_map *map)
  {
 +	bool evaluate_pending_interrupts =
 +		is_intercept(svm, INTERCEPT_VINTR) ||
 +		is_intercept(svm, INTERCEPT_IRET);
 +
  	if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
  		svm->vcpu.arch.hflags |= HF_HIF_MASK;
  	else
@@@ -3640,6 -3596,15 +3643,6 @@@
  	else
  		svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
  
 -	if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
 -		/* We only want the cr8 intercept bits of the guest */
 -		clr_cr_intercept(svm, INTERCEPT_CR8_READ);
 -		clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 -	}
 -
 -	/* We don't want to see VMMCALLs from a nested guest */
 -	clr_intercept(svm, INTERCEPT_VMMCALL);
 -
  	svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
  	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
  
@@@ -3667,21 -3632,7 +3670,21 @@@
  
  	svm->nested.vmcb = vmcb_gpa;
  
 +	/*
 +	 * If L1 had a pending IRQ/NMI before executing VMRUN,
 +	 * which wasn't delivered because it was disallowed (e.g.
 +	 * interrupts disabled), L0 needs to evaluate if this pending
 +	 * event should cause an exit from L2 to L1 or be delivered
 +	 * directly to L2.
 +	 *
 +	 * Usually this would be handled by the processor noticing an
 +	 * IRQ/NMI window request.  However, VMRUN can unblock interrupts
 +	 * by implicitly setting GIF, so force L0 to perform pending event
 +	 * evaluation by requesting a KVM_REQ_EVENT.
 +	 */
  	enable_gif(svm);
 +	if (unlikely(evaluate_pending_interrupts))
 +		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
  
  	mark_all_dirty(svm->vmcb);
  }
@@@ -3883,8 -3834,11 +3886,8 @@@ static int clgi_interception(struct vcp
  	disable_gif(svm);
  
  	/* After a CLGI no interrupts should come */
 -	if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
 +	if (!kvm_vcpu_apicv_active(&svm->vcpu))
  		svm_clear_vintr(svm);
 -		svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 -		mark_dirty(svm->vmcb, VMCB_INTR);
 -	}
  
  	return ret;
  }
@@@ -5170,6 -5124,19 +5173,6 @@@ static void svm_inject_nmi(struct kvm_v
  	++vcpu->stat.nmi_injections;
  }
  
 -static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 -{
 -	struct vmcb_control_area *control;
 -
 -	/* The following fields are ignored when AVIC is enabled */
 -	control = &svm->vmcb->control;
 -	control->int_vector = irq;
 -	control->int_ctl &= ~V_INTR_PRIO_MASK;
 -	control->int_ctl |= V_IRQ_MASK |
 -		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
 -	mark_dirty(svm->vmcb, VMCB_INTR);
 -}
 -
  static void svm_set_irq(struct kvm_vcpu *vcpu)
  {
  	struct vcpu_svm *svm = to_svm(vcpu);
@@@ -5558,15 -5525,18 +5561,15 @@@ static int svm_interrupt_allowed(struc
  {
  	struct vcpu_svm *svm = to_svm(vcpu);
  	struct vmcb *vmcb = svm->vmcb;
 -	int ret;
  
  	if (!gif_set(svm) ||
  	     (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
  		return 0;
  
 -	ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
 -
 -	if (is_guest_mode(vcpu))
 -		return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
 -
 -	return ret;
 +	if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK))
 +		return !!(svm->vcpu.arch.hflags & HF_HIF_MASK);
 +	else
 +		return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
  }
  
  static void enable_irq_window(struct kvm_vcpu *vcpu)
@@@ -5581,7 -5551,7 +5584,7 @@@
  	 * enabled, the STGI interception will not occur. Enable the irq
  	 * window under the assumption that the hardware will set the GIF.
  	 */
 -	if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
 +	if (vgif_enabled(svm) || gif_set(svm)) {
  		/*
  		 * IRQ window is not needed when AVIC is enabled,
  		 * unless we have pending ExtINT since it cannot be injected
@@@ -5590,6 -5560,7 +5593,6 @@@
  		 */
  		svm_toggle_avic_for_irq_window(vcpu, false);
  		svm_set_vintr(svm);
 -		svm_inject_irq(svm, 0x0);
  	}
  }
  
@@@ -5975,30 -5946,24 +5978,30 @@@ static void svm_vcpu_run(struct kvm_vcp
  }
  STACK_FRAME_NON_STANDARD(svm_vcpu_run);
  
 -static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 +static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
  {
  	struct vcpu_svm *svm = to_svm(vcpu);
 +	bool update_guest_cr3 = true;
 +	unsigned long cr3;
  
 -	svm->vmcb->save.cr3 = __sme_set(root);
 -	mark_dirty(svm->vmcb, VMCB_CR);
 -}
 -
 -static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 -{
 -	struct vcpu_svm *svm = to_svm(vcpu);
 +	cr3 = __sme_set(root);
 +	if (npt_enabled) {
 +		svm->vmcb->control.nested_cr3 = cr3;
 +		mark_dirty(svm->vmcb, VMCB_NPT);
  
 -	svm->vmcb->control.nested_cr3 = __sme_set(root);
 -	mark_dirty(svm->vmcb, VMCB_NPT);
 +		/* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
 +		if (is_guest_mode(vcpu))
 +			update_guest_cr3 = false;
 +		else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
 +			cr3 = vcpu->arch.cr3;
 +		else /* CR3 is already up-to-date.  */
 +			update_guest_cr3 = false;
 +	}
  
 -	/* Also sync guest cr3 here in case we live migrate */
 -	svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
 -	mark_dirty(svm->vmcb, VMCB_CR);
 +	if (update_guest_cr3) {
 +		svm->vmcb->save.cr3 = cr3;
 +		mark_dirty(svm->vmcb, VMCB_CR);
 +	}
  }
  
  static int is_disabled(void)
@@@ -6060,19 -6025,12 +6063,19 @@@ static void svm_cpuid_update(struct kvm
  				    boot_cpu_has(X86_FEATURE_XSAVES);
  
  	/* Update nrips enabled cache */
 -	svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
 +	svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
 +			     guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
  
  	if (!kvm_vcpu_apicv_active(vcpu))
  		return;
  
 -	guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
 +	/*
 +	 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
 +	 * is exposed to the guest, disable AVIC.
 +	 */
 +	if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
 +		kvm_request_apicv_update(vcpu->kvm, false,
 +					 APICV_INHIBIT_REASON_X2APIC);
  
  	/*
  	 * Currently, AVIC does not work with nested virtualization.
@@@ -6083,11 -6041,88 +6086,11 @@@
  					 APICV_INHIBIT_REASON_NESTED);
  }
  
 -#define F feature_bit
 -
 -static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 -{
 -	switch (func) {
 -	case 0x1:
 -		if (avic)
 -			entry->ecx &= ~F(X2APIC);
 -		break;
 -	case 0x80000001:
 -		if (nested)
 -			entry->ecx |= (1 << 2); /* Set SVM bit */
 -		break;
 -	case 0x80000008:
 -		if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
 -		     boot_cpu_has(X86_FEATURE_AMD_SSBD))
 -			entry->ebx |= F(VIRT_SSBD);
 -		break;
 -	case 0x8000000A:
 -		entry->eax = 1; /* SVM revision 1 */
 -		entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
 -				   ASID emulation to nested SVM */
 -		entry->ecx = 0; /* Reserved */
 -		entry->edx = 0; /* Per default do not support any
 -				   additional features */
 -
 -		/* Support next_rip if host supports it */
 -		if (boot_cpu_has(X86_FEATURE_NRIPS))
 -			entry->edx |= F(NRIPS);
 -
 -		/* Support NPT for the guest if enabled */
 -		if (npt_enabled)
 -			entry->edx |= F(NPT);
 -
 -	}
 -}
 -
 -static int svm_get_lpage_level(void)
 -{
 -	return PT_PDPE_LEVEL;
 -}
 -
 -static bool svm_rdtscp_supported(void)
 -{
 -	return boot_cpu_has(X86_FEATURE_RDTSCP);
 -}
 -
 -static bool svm_invpcid_supported(void)
 -{
 -	return false;
 -}
 -
 -static bool svm_mpx_supported(void)
 -{
 -	return false;
 -}
 -
 -static bool svm_xsaves_supported(void)
 -{
 -	return boot_cpu_has(X86_FEATURE_XSAVES);
 -}
 -
 -static bool svm_umip_emulated(void)
 -{
 -	return false;
 -}
 -
 -static bool svm_pt_supported(void)
 -{
 -	return false;
 -}
 -
  static bool svm_has_wbinvd_exit(void)
  {
  	return true;
  }
  
 -static bool svm_pku_supported(void)
 -{
 -	return false;
 -}
 -
  #define PRE_EX(exit)  { .exit_code = (exit), \
  			.stage = X86_ICPT_PRE_EXCEPT, }
  #define POST_EX(exit) { .exit_code = (exit), \
@@@ -6154,8 -6189,7 +6157,8 @@@ static const struct __x86_intercept 
  
  static int svm_check_intercept(struct kvm_vcpu *vcpu,
  			       struct x86_instruction_info *info,
 -			       enum x86_intercept_stage stage)
 +			       enum x86_intercept_stage stage,
 +			       struct x86_exception *exception)
  {
  	struct vcpu_svm *svm = to_svm(vcpu);
  	int vmexit, ret = X86EMUL_CONTINUE;
@@@ -7339,8 -7373,7 +7342,8 @@@ static bool svm_check_apicv_inhibit_rea
  			  BIT(APICV_INHIBIT_REASON_HYPERV) |
  			  BIT(APICV_INHIBIT_REASON_NESTED) |
  			  BIT(APICV_INHIBIT_REASON_IRQWIN) |
 -			  BIT(APICV_INHIBIT_REASON_PIT_REINJ);
 +			  BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
 +			  BIT(APICV_INHIBIT_REASON_X2APIC);
  
  	return supported & BIT(bit);
  }
@@@ -7365,7 -7398,8 +7368,7 @@@ static struct kvm_x86_ops svm_x86_ops _
  	.vcpu_free = svm_free_vcpu,
  	.vcpu_reset = svm_vcpu_reset,
  
 -	.vm_alloc = svm_vm_alloc,
 -	.vm_free = svm_vm_free,
 +	.vm_size = sizeof(struct kvm_svm),
  	.vm_init = svm_vm_init,
  	.vm_destroy = svm_vm_destroy,
  
@@@ -7387,6 -7421,7 +7390,6 @@@
  	.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
  	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
  	.set_cr0 = svm_set_cr0,
 -	.set_cr3 = svm_set_cr3,
  	.set_cr4 = svm_set_cr4,
  	.set_efer = svm_set_efer,
  	.get_idt = svm_get_idt,
@@@ -7439,14 -7474,26 +7442,14 @@@
  
  	.get_exit_info = svm_get_exit_info,
  
 -	.get_lpage_level = svm_get_lpage_level,
 -
  	.cpuid_update = svm_cpuid_update,
  
 -	.rdtscp_supported = svm_rdtscp_supported,
 -	.invpcid_supported = svm_invpcid_supported,
 -	.mpx_supported = svm_mpx_supported,
 -	.xsaves_supported = svm_xsaves_supported,
 -	.umip_emulated = svm_umip_emulated,
 -	.pt_supported = svm_pt_supported,
 -	.pku_supported = svm_pku_supported,
 -
 -	.set_supported_cpuid = svm_set_supported_cpuid,
 -
  	.has_wbinvd_exit = svm_has_wbinvd_exit,
  
  	.read_l1_tsc_offset = svm_read_l1_tsc_offset,
  	.write_l1_tsc_offset = svm_write_l1_tsc_offset,
  
 -	.set_tdp_cr3 = set_tdp_cr3,
 +	.load_mmu_pgd = svm_load_mmu_pgd,
  
  	.check_intercept = svm_check_intercept,
  	.handle_exit_irqoff = svm_handle_exit_irqoff,
@@@ -7476,8 -7523,6 +7479,8 @@@
  	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
  
  	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
 +
 +	.check_nested_events = svm_check_nested_events,
  };
  
  static int __init svm_init(void)
diff --combined arch/x86/kvm/vmx/vmx.c
index 3aba51d,40b1e61..a7dd678
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@@ -64,11 -64,13 +64,13 @@@
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
  
+ #ifdef MODULE
  static const struct x86_cpu_id vmx_cpu_id[] = {
  	X86_FEATURE_MATCH(X86_FEATURE_VMX),
  	{}
  };
  MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+ #endif
  
  bool __read_mostly enable_vpid = 1;
  module_param_named(vpid, enable_vpid, bool, 0444);
@@@ -433,6 -435,7 +435,6 @@@ static const struct kvm_vmx_segment_fie
  	VMX_SEGMENT_FIELD(LDTR),
  };
  
 -u64 host_efer;
  static unsigned long host_idt_base;
  
  /*
@@@ -653,16 -656,53 +655,16 @@@ static int vmx_set_guest_msr(struct vcp
  	return ret;
  }
  
 -void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
 -{
 -	vmcs_clear(loaded_vmcs->vmcs);
 -	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
 -		vmcs_clear(loaded_vmcs->shadow_vmcs);
 -	loaded_vmcs->cpu = -1;
 -	loaded_vmcs->launched = 0;
 -}
 -
  #ifdef CONFIG_KEXEC_CORE
 -/*
 - * This bitmap is used to indicate whether the vmclear
 - * operation is enabled on all cpus. All disabled by
 - * default.
 - */
 -static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
 -
 -static inline void crash_enable_local_vmclear(int cpu)
 -{
 -	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
 -}
 -
 -static inline void crash_disable_local_vmclear(int cpu)
 -{
 -	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
 -}
 -
 -static inline int crash_local_vmclear_enabled(int cpu)
 -{
 -	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
 -}
 -
  static void crash_vmclear_local_loaded_vmcss(void)
  {
  	int cpu = raw_smp_processor_id();
  	struct loaded_vmcs *v;
  
 -	if (!crash_local_vmclear_enabled(cpu))
 -		return;
 -
  	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
  			    loaded_vmcss_on_cpu_link)
  		vmcs_clear(v->vmcs);
  }
 -#else
 -static inline void crash_enable_local_vmclear(int cpu) { }
 -static inline void crash_disable_local_vmclear(int cpu) { }
  #endif /* CONFIG_KEXEC_CORE */
  
  static void __loaded_vmcs_clear(void *arg)
@@@ -674,24 -714,19 +676,24 @@@
  		return; /* vcpu migration can race with cpu offline */
  	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
  		per_cpu(current_vmcs, cpu) = NULL;
 -	crash_disable_local_vmclear(cpu);
 +
 +	vmcs_clear(loaded_vmcs->vmcs);
 +	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
 +		vmcs_clear(loaded_vmcs->shadow_vmcs);
 +
  	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
  
  	/*
 -	 * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
 -	 * is before setting loaded_vmcs->vcpu to -1 which is done in
 -	 * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
 -	 * then adds the vmcs into percpu list before it is deleted.
 +	 * Ensure all writes to loaded_vmcs, including deleting it from its
 +	 * current percpu list, complete before setting loaded_vmcs->vcpu to
 +	 * -1, otherwise a different cpu can see vcpu == -1 first and add
 +	 * loaded_vmcs to its percpu list before it's deleted from this cpu's
 +	 * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
  	 */
  	smp_wmb();
  
 -	loaded_vmcs_init(loaded_vmcs);
 -	crash_enable_local_vmclear(cpu);
 +	loaded_vmcs->cpu = -1;
 +	loaded_vmcs->launched = 0;
  }
  
  void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@@ -775,7 -810,7 +777,7 @@@ void update_exception_bitmap(struct kvm
  	if (to_vmx(vcpu)->rmode.vm86_active)
  		eb = ~0;
  	if (enable_ept)
 -		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 +		eb &= ~(1u << PF_VECTOR);
  
  	/* When we are running a nested L2 guest and L1 specified for it a
  	 * certain exception bitmap, we must trap the same exceptions and pass
@@@ -1026,7 -1061,7 +1028,7 @@@ static unsigned long segment_base(u16 s
  
  static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
  {
 -	return (pt_mode == PT_MODE_HOST_GUEST) &&
 +	return vmx_pt_mode_is_host_guest() &&
  	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
  }
  
@@@ -1060,7 -1095,7 +1062,7 @@@ static inline void pt_save_msr(struct p
  
  static void pt_guest_enter(struct vcpu_vmx *vmx)
  {
 -	if (pt_mode == PT_MODE_SYSTEM)
 +	if (vmx_pt_mode_is_system())
  		return;
  
  	/*
@@@ -1077,7 -1112,7 +1079,7 @@@
  
  static void pt_guest_exit(struct vcpu_vmx *vmx)
  {
 -	if (pt_mode == PT_MODE_SYSTEM)
 +	if (vmx_pt_mode_is_system())
  		return;
  
  	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
@@@ -1310,17 -1345,18 +1312,17 @@@ void vmx_vcpu_load_vmcs(struct kvm_vcp
  	if (!already_loaded) {
  		loaded_vmcs_clear(vmx->loaded_vmcs);
  		local_irq_disable();
 -		crash_disable_local_vmclear(cpu);
  
  		/*
 -		 * Read loaded_vmcs->cpu should be before fetching
 -		 * loaded_vmcs->loaded_vmcss_on_cpu_link.
 -		 * See the comments in __loaded_vmcs_clear().
 +		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
 +		 * this cpu's percpu list, otherwise it may not yet be deleted
 +		 * from its previous cpu's percpu list.  Pairs with the
 +		 * smp_wmb() in __loaded_vmcs_clear().
  		 */
  		smp_rmb();
  
  		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
  			 &per_cpu(loaded_vmcss_on_cpu, cpu));
 -		crash_enable_local_vmclear(cpu);
  		local_irq_enable();
  	}
  
@@@ -1653,6 -1689,16 +1655,6 @@@ static void vmx_queue_exception(struct 
  	vmx_clear_hlt(vcpu);
  }
  
 -static bool vmx_rdtscp_supported(void)
 -{
 -	return cpu_has_vmx_rdtscp();
 -}
 -
 -static bool vmx_invpcid_supported(void)
 -{
 -	return cpu_has_vmx_invpcid();
 -}
 -
  /*
   * Swap MSR entry in host/guest MSR entry array.
   */
@@@ -1860,24 -1906,24 +1862,24 @@@ static int vmx_get_msr(struct kvm_vcpu 
  							&msr_info->data);
  		break;
  	case MSR_IA32_RTIT_CTL:
 -		if (pt_mode != PT_MODE_HOST_GUEST)
 +		if (!vmx_pt_mode_is_host_guest())
  			return 1;
  		msr_info->data = vmx->pt_desc.guest.ctl;
  		break;
  	case MSR_IA32_RTIT_STATUS:
 -		if (pt_mode != PT_MODE_HOST_GUEST)
 +		if (!vmx_pt_mode_is_host_guest())
  			return 1;
  		msr_info->data = vmx->pt_desc.guest.status;
  		break;
  	case MSR_IA32_RTIT_CR3_MATCH:
 -		if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +		if (!vmx_pt_mode_is_host_guest() ||
  			!intel_pt_validate_cap(vmx->pt_desc.caps,
  						PT_CAP_cr3_filtering))
  			return 1;
  		msr_info->data = vmx->pt_desc.guest.cr3_match;
  		break;
  	case MSR_IA32_RTIT_OUTPUT_BASE:
 -		if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +		if (!vmx_pt_mode_is_host_guest() ||
  			(!intel_pt_validate_cap(vmx->pt_desc.caps,
  					PT_CAP_topa_output) &&
  			 !intel_pt_validate_cap(vmx->pt_desc.caps,
@@@ -1886,7 -1932,7 +1888,7 @@@
  		msr_info->data = vmx->pt_desc.guest.output_base;
  		break;
  	case MSR_IA32_RTIT_OUTPUT_MASK:
 -		if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +		if (!vmx_pt_mode_is_host_guest() ||
  			(!intel_pt_validate_cap(vmx->pt_desc.caps,
  					PT_CAP_topa_output) &&
  			 !intel_pt_validate_cap(vmx->pt_desc.caps,
@@@ -1896,7 -1942,7 +1898,7 @@@
  		break;
  	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
  		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
 -		if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +		if (!vmx_pt_mode_is_host_guest() ||
  			(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
  					PT_CAP_num_address_ranges)))
  			return 1;
@@@ -2102,7 -2148,7 +2104,7 @@@ static int vmx_set_msr(struct kvm_vcpu 
  			return 1;
  		return vmx_set_vmx_msr(vcpu, msr_index, data);
  	case MSR_IA32_RTIT_CTL:
 -		if ((pt_mode != PT_MODE_HOST_GUEST) ||
 +		if (!vmx_pt_mode_is_host_guest() ||
  			vmx_rtit_ctl_check(vcpu, data) ||
  			vmx->nested.vmxon)
  			return 1;
@@@ -2218,33 -2264,18 +2220,33 @@@ static __init int vmx_disabled_by_bios(
  	       !boot_cpu_has(X86_FEATURE_VMX);
  }
  
 -static void kvm_cpu_vmxon(u64 addr)
 +static int kvm_cpu_vmxon(u64 vmxon_pointer)
  {
 +	u64 msr;
 +
  	cr4_set_bits(X86_CR4_VMXE);
  	intel_pt_handle_vmx(1);
  
 -	asm volatile ("vmxon %0" : : "m"(addr));
 +	asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
 +			  _ASM_EXTABLE(1b, %l[fault])
 +			  : : [vmxon_pointer] "m"(vmxon_pointer)
 +			  : : fault);
 +	return 0;
 +
 +fault:
 +	WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
 +		  rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
 +	intel_pt_handle_vmx(0);
 +	cr4_clear_bits(X86_CR4_VMXE);
 +
 +	return -EFAULT;
  }
  
  static int hardware_enable(void)
  {
  	int cpu = raw_smp_processor_id();
  	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 +	int r;
  
  	if (cr4_read_shadow() & X86_CR4_VMXE)
  		return -EBUSY;
@@@ -2261,10 -2292,18 +2263,10 @@@
  	INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
  	spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
  
 -	/*
 -	 * Now we can enable the vmclear operation in kdump
 -	 * since the loaded_vmcss_on_cpu list on this cpu
 -	 * has been initialized.
 -	 *
 -	 * Though the cpu is not in VMX operation now, there
 -	 * is no problem to enable the vmclear operation
 -	 * for the loaded_vmcss_on_cpu list is empty!
 -	 */
 -	crash_enable_local_vmclear(cpu);
 +	r = kvm_cpu_vmxon(phys_addr);
 +	if (r)
 +		return r;
  
 -	kvm_cpu_vmxon(phys_addr);
  	if (enable_ept)
  		ept_sync_global();
  
@@@ -2564,12 -2603,9 +2566,12 @@@ int alloc_loaded_vmcs(struct loaded_vmc
  	if (!loaded_vmcs->vmcs)
  		return -ENOMEM;
  
 +	vmcs_clear(loaded_vmcs->vmcs);
 +
  	loaded_vmcs->shadow_vmcs = NULL;
  	loaded_vmcs->hv_timer_soft_disabled = false;
 -	loaded_vmcs_init(loaded_vmcs);
 +	loaded_vmcs->cpu = -1;
 +	loaded_vmcs->launched = 0;
  
  	if (cpu_has_vmx_msr_bitmap()) {
  		loaded_vmcs->msr_bitmap = (unsigned long *)
@@@ -2951,8 -2987,9 +2953,8 @@@ void vmx_set_cr0(struct kvm_vcpu *vcpu
  
  static int get_ept_level(struct kvm_vcpu *vcpu)
  {
 -	/* Nested EPT currently only supports 4-level walks. */
  	if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
 -		return 4;
 +		return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
  	if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
  		return 5;
  	return 4;
@@@ -2972,7 -3009,7 +2974,7 @@@ u64 construct_eptp(struct kvm_vcpu *vcp
  	return eptp;
  }
  
 -void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
  {
  	struct kvm *kvm = vcpu->kvm;
  	bool update_guest_cr3 = true;
@@@ -3989,7 -4026,7 +3991,7 @@@ static void vmx_compute_secondary_exec_
  
  	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
  
 -	if (pt_mode == PT_MODE_SYSTEM)
 +	if (vmx_pt_mode_is_system())
  		exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
  	if (!cpu_need_virtualize_apic_accesses(vcpu))
  		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
@@@ -4044,7 -4081,7 +4046,7 @@@
  		}
  	}
  
 -	if (vmx_rdtscp_supported()) {
 +	if (cpu_has_vmx_rdtscp()) {
  		bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
  		if (!rdtscp_enabled)
  			exec_control &= ~SECONDARY_EXEC_RDTSCP;
@@@ -4059,7 -4096,7 +4061,7 @@@
  		}
  	}
  
 -	if (vmx_invpcid_supported()) {
 +	if (cpu_has_vmx_invpcid()) {
  		/* Exposing INVPCID only when PCID is exposed */
  		bool invpcid_enabled =
  			guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
@@@ -4230,7 -4267,7 +4232,7 @@@ static void init_vmcs(struct vcpu_vmx *
  	if (cpu_has_vmx_encls_vmexit())
  		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
  
 -	if (pt_mode == PT_MODE_HOST_GUEST) {
 +	if (vmx_pt_mode_is_host_guest()) {
  		memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
  		/* Bit[6~0] are forced to 1, writes are ignored. */
  		vmx->pt_desc.guest.output_mask = 0x7F;
@@@ -4458,13 -4495,8 +4460,13 @@@ static int vmx_nmi_allowed(struct kvm_v
  
  static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
  {
 -	return (!to_vmx(vcpu)->nested.nested_run_pending &&
 -		vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 +	if (to_vmx(vcpu)->nested.nested_run_pending)
 +		return false;
 +
 +	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
 +		return true;
 +
 +	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
  		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
  			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
  }
@@@ -4520,6 -4552,7 +4522,6 @@@ static bool rmode_exception(struct kvm_
  	case GP_VECTOR:
  	case MF_VECTOR:
  		return true;
 -	break;
  	}
  	return false;
  }
@@@ -5296,6 -5329,7 +5298,6 @@@ static void vmx_enable_tdp(void
  		VMX_EPT_RWX_MASK, 0ull);
  
  	ept_set_mmio_spte_mask();
 -	kvm_enable_tdp();
  }
  
  /*
@@@ -5828,23 -5862,8 +5830,23 @@@ static int vmx_handle_exit(struct kvm_v
  	if (vmx->emulation_required)
  		return handle_invalid_guest_state(vcpu);
  
 -	if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
 -		return nested_vmx_reflect_vmexit(vcpu, exit_reason);
 +	if (is_guest_mode(vcpu)) {
 +		/*
 +		 * The host physical addresses of some pages of guest memory
 +		 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
 +		 * Page). The CPU may write to these pages via their host
 +		 * physical address while L2 is running, bypassing any
 +		 * address-translation-based dirty tracking (e.g. EPT write
 +		 * protection).
 +		 *
 +		 * Mark them dirty on every exit from L2 to prevent them from
 +		 * getting out of sync with dirty tracking.
 +		 */
 +		nested_mark_vmcs12_pages_dirty(vcpu);
 +
 +		if (nested_vmx_exit_reflected(vcpu, exit_reason))
 +			return nested_vmx_reflect_vmexit(vcpu, exit_reason);
 +	}
  
  	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
  		dump_vmcs();
@@@ -6204,13 -6223,15 +6206,13 @@@ static void handle_exception_nmi_irqoff
  	vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
  
  	/* if exit due to PF check for async PF */
 -	if (is_page_fault(vmx->exit_intr_info))
 +	if (is_page_fault(vmx->exit_intr_info)) {
  		vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
 -
  	/* Handle machine checks before interrupts are enabled */
 -	if (is_machine_check(vmx->exit_intr_info))
 +	} else if (is_machine_check(vmx->exit_intr_info)) {
  		kvm_machine_check();
 -
  	/* We need to handle NMIs before interrupts are enabled */
 -	if (is_nmi(vmx->exit_intr_info)) {
 +	} else if (is_nmi(vmx->exit_intr_info)) {
  		kvm_before_interrupt(&vmx->vcpu);
  		asm("int $2");
  		kvm_after_interrupt(&vmx->vcpu);
@@@ -6296,6 -6317,11 +6298,6 @@@ static bool vmx_has_emulated_msr(int in
  	}
  }
  
 -static bool vmx_pt_supported(void)
 -{
 -	return pt_mode == PT_MODE_HOST_GUEST;
 -}
 -
  static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
  {
  	u32 exit_intr_info;
@@@ -6541,8 -6567,7 +6543,8 @@@ static void vmx_vcpu_run(struct kvm_vcp
  
  	pt_guest_enter(vmx);
  
 -	atomic_switch_perf_msrs(vmx);
 +	if (vcpu_to_pmu(vcpu)->version)
 +		atomic_switch_perf_msrs(vmx);
  	atomic_switch_umwait_control_msr(vmx);
  
  	if (enable_preemption_timer)
@@@ -6659,6 -6684,20 +6661,6 @@@
  	vmx_complete_interrupts(vmx);
  }
  
 -static struct kvm *vmx_vm_alloc(void)
 -{
 -	struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
 -					    GFP_KERNEL_ACCOUNT | __GFP_ZERO,
 -					    PAGE_KERNEL);
 -	return &kvm_vmx->kvm;
 -}
 -
 -static void vmx_vm_free(struct kvm *kvm)
 -{
 -	kfree(kvm->arch.hyperv.hv_pa_pg);
 -	vfree(to_kvm_vmx(kvm));
 -}
 -
  static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
  {
  	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@@ -6861,24 -6900,17 +6863,24 @@@ static u64 vmx_get_mt_mask(struct kvm_v
  	u8 cache;
  	u64 ipat = 0;
  
 -	/* For VT-d and EPT combination
 -	 * 1. MMIO: always map as UC
 -	 * 2. EPT with VT-d:
 -	 *   a. VT-d without snooping control feature: can't guarantee the
 -	 *	result, try to trust guest.
 -	 *   b. VT-d with snooping control feature: snooping control feature of
 -	 *	VT-d engine can guarantee the cache correctness. Just set it
 -	 *	to WB to keep consistent with host. So the same as item 3.
 -	 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
 -	 *    consistent with host MTRR
 +	/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
 +	 * memory aliases with conflicting memory types and sometimes MCEs.
 +	 * We have to be careful as to what are honored and when.
 +	 *
 +	 * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
 +	 * UC.  The effective memory type is UC or WC depending on guest PAT.
 +	 * This was historically the source of MCEs and we want to be
 +	 * conservative.
 +	 *
 +	 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
 +	 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
 +	 * EPT memory type is set to WB.  The effective memory type is forced
 +	 * WB.
 +	 *
 +	 * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
 +	 * EPT memory type is used to emulate guest CD/MTRR.
  	 */
 +
  	if (is_mmio) {
  		cache = MTRR_TYPE_UNCACHABLE;
  		goto exit;
@@@ -6905,6 -6937,15 +6907,6 @@@ exit
  	return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
  }
  
 -static int vmx_get_lpage_level(void)
 -{
 -	if (enable_ept && !cpu_has_vmx_ept_1g_page())
 -		return PT_DIRECTORY_LEVEL;
 -	else
 -		/* For shadow and EPT supported 1GB page */
 -		return PT_PDPE_LEVEL;
 -}
 -
  static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
  {
  	/*
@@@ -7095,37 -7136,10 +7097,37 @@@ static void vmx_cpuid_update(struct kvm
  	}
  }
  
 -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 +static __init void vmx_set_cpu_caps(void)
  {
 -	if (func == 1 && nested)
 -		entry->ecx |= feature_bit(VMX);
 +	kvm_set_cpu_caps();
 +
 +	/* CPUID 0x1 */
 +	if (nested)
 +		kvm_cpu_cap_set(X86_FEATURE_VMX);
 +
 +	/* CPUID 0x7 */
 +	if (kvm_mpx_supported())
 +		kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
 +	if (cpu_has_vmx_invpcid())
 +		kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
 +	if (vmx_pt_mode_is_host_guest())
 +		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
 +
 +	/* PKU is not yet implemented for shadow paging. */
 +	if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE))
 +		kvm_cpu_cap_check_and_set(X86_FEATURE_PKU);
 +
 +	if (vmx_umip_emulated())
 +		kvm_cpu_cap_set(X86_FEATURE_UMIP);
 +
 +	/* CPUID 0xD.1 */
 +	supported_xss = 0;
 +	if (!vmx_xsaves_supported())
 +		kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
 +
 +	/* CPUID 0x80000001 */
 +	if (!cpu_has_vmx_rdtscp())
 +		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
  }
  
  static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
@@@ -7163,15 -7177,16 +7165,16 @@@ static int vmx_check_intercept_io(struc
  	else
  		intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
  
+ 	/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
  	return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
  }
  
  static int vmx_check_intercept(struct kvm_vcpu *vcpu,
  			       struct x86_instruction_info *info,
 -			       enum x86_intercept_stage stage)
 +			       enum x86_intercept_stage stage,
 +			       struct x86_exception *exception)
  {
  	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 -	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
  
  	switch (info->intercept) {
  	/*
@@@ -7180,8 -7195,8 +7183,8 @@@
  	 */
  	case x86_intercept_rdtscp:
  		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
 -			ctxt->exception.vector = UD_VECTOR;
 -			ctxt->exception.error_code_valid = false;
 +			exception->vector = UD_VECTOR;
 +			exception->error_code_valid = false;
  			return X86EMUL_PROPAGATE_FAULT;
  		}
  		break;
@@@ -7192,6 -7207,20 +7195,20 @@@
  	case x86_intercept_outs:
  		return vmx_check_intercept_io(vcpu, info);
  
+ 	case x86_intercept_lgdt:
+ 	case x86_intercept_lidt:
+ 	case x86_intercept_lldt:
+ 	case x86_intercept_ltr:
+ 	case x86_intercept_sgdt:
+ 	case x86_intercept_sidt:
+ 	case x86_intercept_sldt:
+ 	case x86_intercept_str:
+ 		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+ 			return X86EMUL_CONTINUE;
+ 
+ 		/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
+ 		break;
+ 
  	/* TODO: check more intercepts... */
  	default:
  		break;
@@@ -7278,8 -7307,7 +7295,8 @@@ static void vmx_sched_in(struct kvm_vcp
  static void vmx_slot_enable_log_dirty(struct kvm *kvm,
  				     struct kvm_memory_slot *slot)
  {
 -	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
 +	if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
 +		kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
  	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
  }
  
@@@ -7633,7 -7661,9 +7650,7 @@@ static __init int hardware_setup(void
  {
  	unsigned long host_bndcfgs;
  	struct desc_ptr dt;
 -	int r, i;
 -
 -	rdmsrl_safe(MSR_EFER, &host_efer);
 +	int r, i, ept_lpage_level;
  
  	store_idt(&dt);
  	host_idt_base = dt.address;
@@@ -7652,10 -7682,6 +7669,10 @@@
  		WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
  	}
  
 +	if (!cpu_has_vmx_mpx())
 +		supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
 +				    XFEATURE_MASK_BNDCSR);
 +
  	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
  	    !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
  		enable_vpid = 0;
@@@ -7689,6 -7715,9 +7706,6 @@@
  	if (!cpu_has_vmx_tpr_shadow())
  		kvm_x86_ops->update_cr8_intercept = NULL;
  
 -	if (enable_ept && !cpu_has_vmx_ept_2m_page())
 -		kvm_disable_largepages();
 -
  #if IS_ENABLED(CONFIG_HYPERV)
  	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
  	    && enable_ept) {
@@@ -7721,16 -7750,8 +7738,16 @@@
  
  	if (enable_ept)
  		vmx_enable_tdp();
 +
 +	if (!enable_ept)
 +		ept_lpage_level = 0;
 +	else if (cpu_has_vmx_ept_1g_page())
 +		ept_lpage_level = PT_PDPE_LEVEL;
 +	else if (cpu_has_vmx_ept_2m_page())
 +		ept_lpage_level = PT_DIRECTORY_LEVEL;
  	else
 -		kvm_disable_tdp();
 +		ept_lpage_level = PT_PAGE_TABLE_LEVEL;
 +	kvm_configure_mmu(enable_ept, ept_lpage_level);
  
  	/*
  	 * Only enable PML when hardware supports PML feature, and both EPT
@@@ -7794,8 -7815,6 +7811,8 @@@
  			return r;
  	}
  
 +	vmx_set_cpu_caps();
 +
  	r = alloc_kvm_area();
  	if (r)
  		nested_vmx_hardware_unsetup();
@@@ -7829,8 -7848,9 +7846,8 @@@ static struct kvm_x86_ops vmx_x86_ops _
  	.cpu_has_accelerated_tpr = report_flexpriority,
  	.has_emulated_msr = vmx_has_emulated_msr,
  
 +	.vm_size = sizeof(struct kvm_vmx),
  	.vm_init = vmx_vm_init,
 -	.vm_alloc = vmx_vm_alloc,
 -	.vm_free = vmx_vm_free,
  
  	.vcpu_create = vmx_create_vcpu,
  	.vcpu_free = vmx_free_vcpu,
@@@ -7852,6 -7872,7 +7869,6 @@@
  	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
  	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
  	.set_cr0 = vmx_set_cr0,
 -	.set_cr3 = vmx_set_cr3,
  	.set_cr4 = vmx_set_cr4,
  	.set_efer = vmx_set_efer,
  	.get_idt = vmx_get_idt,
@@@ -7907,17 -7928,29 +7924,17 @@@
  
  	.get_exit_info = vmx_get_exit_info,
  
 -	.get_lpage_level = vmx_get_lpage_level,
 -
  	.cpuid_update = vmx_cpuid_update,
  
 -	.rdtscp_supported = vmx_rdtscp_supported,
 -	.invpcid_supported = vmx_invpcid_supported,
 -
 -	.set_supported_cpuid = vmx_set_supported_cpuid,
 -
  	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
  
  	.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
  	.write_l1_tsc_offset = vmx_write_l1_tsc_offset,
  
 -	.set_tdp_cr3 = vmx_set_cr3,
 +	.load_mmu_pgd = vmx_load_mmu_pgd,
  
  	.check_intercept = vmx_check_intercept,
  	.handle_exit_irqoff = vmx_handle_exit_irqoff,
 -	.mpx_supported = vmx_mpx_supported,
 -	.xsaves_supported = vmx_xsaves_supported,
 -	.umip_emulated = vmx_umip_emulated,
 -	.pt_supported = vmx_pt_supported,
 -	.pku_supported = vmx_pku_supported,
  
  	.request_immediate_exit = vmx_request_immediate_exit,
  
diff --combined arch/x86/kvm/x86.c
index 6fa014c,5de2006..1b6d9ac
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -22,7 -22,6 +22,7 @@@
  #include "i8254.h"
  #include "tss.h"
  #include "kvm_cache_regs.h"
 +#include "kvm_emulate.h"
  #include "x86.h"
  #include "cpuid.h"
  #include "pmu.h"
@@@ -82,7 -81,7 +82,7 @@@ u64 __read_mostly kvm_mce_cap_supporte
  EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
  
  #define emul_to_vcpu(ctxt) \
 -	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
 +	((struct kvm_vcpu *)(ctxt)->vcpu)
  
  /* EFER defaults:
   * - enable syscall per default because its emulated by KVM
@@@ -181,17 -180,7 +181,17 @@@ struct kvm_shared_msrs 
  static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
  static struct kvm_shared_msrs __percpu *shared_msrs;
  
 +#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
 +				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
 +				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
 +				| XFEATURE_MASK_PKRU)
 +
 +u64 __read_mostly host_efer;
 +EXPORT_SYMBOL_GPL(host_efer);
 +
  static u64 __read_mostly host_xss;
 +u64 __read_mostly supported_xss;
 +EXPORT_SYMBOL_GPL(supported_xss);
  
  struct kvm_stats_debugfs_item debugfs_entries[] = {
  	{ "pf_fixed", VCPU_STAT(pf_fixed) },
@@@ -237,25 -226,10 +237,25 @@@
  };
  
  u64 __read_mostly host_xcr0;
 +u64 __read_mostly supported_xcr0;
 +EXPORT_SYMBOL_GPL(supported_xcr0);
  
  struct kmem_cache *x86_fpu_cache;
  EXPORT_SYMBOL_GPL(x86_fpu_cache);
  
 +static struct kmem_cache *x86_emulator_cache;
 +
 +static struct kmem_cache *kvm_alloc_emulator_cache(void)
 +{
 +	unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
 +	unsigned int size = sizeof(struct x86_emulate_ctxt);
 +
 +	return kmem_cache_create_usercopy("x86_emulator", size,
 +					  __alignof__(struct x86_emulate_ctxt),
 +					  SLAB_ACCOUNT, useroffset,
 +					  size - useroffset, NULL);
 +}
 +
  static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
  
  static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
@@@ -376,7 -350,6 +376,7 @@@ int kvm_set_apic_base(struct kvm_vcpu *
  	}
  
  	kvm_lapic_set_base(vcpu, msr_info->data);
 +	kvm_recalculate_apic_map(vcpu->kvm);
  	return 0;
  }
  EXPORT_SYMBOL_GPL(kvm_set_apic_base);
@@@ -930,10 -903,10 +930,10 @@@ static u64 kvm_host_cr4_reserved_bits(s
  {
  	u64 reserved_bits = __cr4_reserved_bits(cpu_has, c);
  
 -	if (cpuid_ecx(0x7) & feature_bit(LA57))
 +	if (kvm_cpu_cap_has(X86_FEATURE_LA57))
  		reserved_bits &= ~X86_CR4_LA57;
  
 -	if (kvm_x86_ops->umip_emulated())
 +	if (kvm_cpu_cap_has(X86_FEATURE_UMIP))
  		reserved_bits &= ~X86_CR4_UMIP;
  
  	return reserved_bits;
@@@ -1585,12 -1558,8 +1585,12 @@@ static int handle_fastpath_set_x2apic_i
  		((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
  		((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
  
 +		data &= ~(1 << 12);
 +		kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
  		kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
 -		return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
 +		kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
 +		trace_kvm_apic_write(APIC_ICR, (u32)data);
 +		return 0;
  	}
  
  	return 1;
@@@ -1599,12 -1568,11 +1599,12 @@@
  enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
  {
  	u32 msr = kvm_rcx_read(vcpu);
 -	u64 data = kvm_read_edx_eax(vcpu);
 +	u64 data;
  	int ret = 0;
  
  	switch (msr) {
  	case APIC_BASE_MSR + (APIC_ICR >> 4):
 +		data = kvm_read_edx_eax(vcpu);
  		ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
  		break;
  	default:
@@@ -2555,7 -2523,7 +2555,7 @@@ static void kvmclock_sync_fn(struct wor
  static bool can_set_mci_status(struct kvm_vcpu *vcpu)
  {
  	/* McStatusWrEn enabled? */
 -	if (guest_cpuid_is_amd(vcpu))
 +	if (guest_cpuid_is_amd_or_hygon(vcpu))
  		return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
  
  	return false;
@@@ -2830,11 -2798,12 +2830,11 @@@ int kvm_set_msr_common(struct kvm_vcpu 
  		    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
  			return 1;
  		/*
 -		 * We do support PT if kvm_x86_ops->pt_supported(), but we do
 -		 * not support IA32_XSS[bit 8]. Guests will have to use
 -		 * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT
 -		 * MSRs.
 +		 * KVM supports exposing PT to the guest, but does not support
 +		 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
 +		 * XSAVES/XRSTORS to save/restore PT MSRs.
  		 */
 -		if (data != 0)
 +		if (data & ~supported_xss)
  			return 1;
  		vcpu->arch.ia32_xss = data;
  		break;
@@@ -3108,6 -3077,7 +3108,6 @@@ int kvm_get_msr_common(struct kvm_vcpu 
  		break;
  	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
  		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
 -		break;
  	case MSR_IA32_TSCDEADLINE:
  		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
  		break;
@@@ -3190,6 -3160,7 +3190,6 @@@
  		return kvm_hv_get_msr_common(vcpu,
  					     msr_info->index, &msr_info->data,
  					     msr_info->host_initiated);
 -		break;
  	case MSR_IA32_BBL_CR_CTL3:
  		/* This legacy MSR exists but isn't fully documented in current
  		 * silicon.  It is however accessed by winxp in very narrow
@@@ -3493,7 -3464,7 +3493,7 @@@ long kvm_arch_dev_ioctl(struct file *fi
  		r = 0;
  		break;
  	}
 -	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
 +	case KVM_X86_GET_MCE_CAP_SUPPORTED:
  		r = -EFAULT;
  		if (copy_to_user(argp, &kvm_mce_cap_supported,
  				 sizeof(kvm_mce_cap_supported)))
@@@ -3525,9 -3496,9 +3525,9 @@@
  	case KVM_GET_MSRS:
  		r = msr_io(NULL, argp, do_get_msr_feature, 1);
  		break;
 -	}
  	default:
  		r = -EINVAL;
 +		break;
  	}
  out:
  	return r;
@@@ -4130,7 -4101,8 +4130,7 @@@ static int kvm_vcpu_ioctl_x86_set_xsave
  		 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
  		 * with old userspace.
  		 */
 -		if (xstate_bv & ~kvm_supported_xcr0() ||
 -			mxcsr & ~mxcsr_feature_mask)
 +		if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
  			return -EINVAL;
  		load_xsave(vcpu, (u8 *)guest_xsave->region);
  	} else {
@@@ -4789,13 -4761,77 +4789,13 @@@ static int kvm_vm_ioctl_reinject(struc
  	return 0;
  }
  
 -/**
 - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 - * @kvm: kvm instance
 - * @log: slot id and address to which we copy the log
 - *
 - * Steps 1-4 below provide general overview of dirty page logging. See
 - * kvm_get_dirty_log_protect() function description for additional details.
 - *
 - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
 - * always flush the TLB (step 4) even if previous step failed  and the dirty
 - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
 - * does not preclude user space subsequent dirty log read. Flushing TLB ensures
 - * writes will be marked dirty for next log read.
 - *
 - *   1. Take a snapshot of the bit and clear it if needed.
 - *   2. Write protect the corresponding page.
 - *   3. Copy the snapshot to the userspace.
 - *   4. Flush TLB's if needed.
 - */
 -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
  {
 -	bool flush = false;
 -	int r;
 -
 -	mutex_lock(&kvm->slots_lock);
 -
  	/*
  	 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
  	 */
  	if (kvm_x86_ops->flush_log_dirty)
  		kvm_x86_ops->flush_log_dirty(kvm);
 -
 -	r = kvm_get_dirty_log_protect(kvm, log, &flush);
 -
 -	/*
 -	 * All the TLBs can be flushed out of mmu lock, see the comments in
 -	 * kvm_mmu_slot_remove_write_access().
 -	 */
 -	lockdep_assert_held(&kvm->slots_lock);
 -	if (flush)
 -		kvm_flush_remote_tlbs(kvm);
 -
 -	mutex_unlock(&kvm->slots_lock);
 -	return r;
 -}
 -
 -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
 -{
 -	bool flush = false;
 -	int r;
 -
 -	mutex_lock(&kvm->slots_lock);
 -
 -	/*
 -	 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
 -	 */
 -	if (kvm_x86_ops->flush_log_dirty)
 -		kvm_x86_ops->flush_log_dirty(kvm);
 -
 -	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
 -
 -	/*
 -	 * All the TLBs can be flushed out of mmu lock, see the comments in
 -	 * kvm_mmu_slot_remove_write_access().
 -	 */
 -	lockdep_assert_held(&kvm->slots_lock);
 -	if (flush)
 -		kvm_flush_remote_tlbs(kvm);
 -
 -	mutex_unlock(&kvm->slots_lock);
 -	return r;
  }
  
  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
@@@ -5224,28 -5260,28 +5224,28 @@@ static void kvm_init_msr_list(void
  				continue;
  			break;
  		case MSR_TSC_AUX:
 -			if (!kvm_x86_ops->rdtscp_supported())
 +			if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
  				continue;
  			break;
  		case MSR_IA32_RTIT_CTL:
  		case MSR_IA32_RTIT_STATUS:
 -			if (!kvm_x86_ops->pt_supported())
 +			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
  				continue;
  			break;
  		case MSR_IA32_RTIT_CR3_MATCH:
 -			if (!kvm_x86_ops->pt_supported() ||
 +			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
  			    !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
  				continue;
  			break;
  		case MSR_IA32_RTIT_OUTPUT_BASE:
  		case MSR_IA32_RTIT_OUTPUT_MASK:
 -			if (!kvm_x86_ops->pt_supported() ||
 +			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
  				(!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
  				 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
  				continue;
  			break;
  		case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
 -			if (!kvm_x86_ops->pt_supported() ||
 +			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
  				msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
  				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
  				continue;
@@@ -5702,7 -5738,7 +5702,7 @@@ static int emulator_read_write_onepage(
  	int handled, ret;
  	bool write = ops->write;
  	struct kvm_mmio_fragment *frag;
 -	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
  
  	/*
  	 * If the exit was due to a NPF we may already have a GPA.
@@@ -5711,9 -5747,10 +5711,9 @@@
  	 * operation using rep will only have the initial GPA from the NPF
  	 * occurred.
  	 */
 -	if (vcpu->arch.gpa_available &&
 -	    emulator_can_use_gpa(ctxt) &&
 -	    (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
 -		gpa = vcpu->arch.gpa_val;
 +	if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
 +	    (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
 +		gpa = ctxt->gpa_val;
  		ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
  	} else {
  		ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
@@@ -5933,9 -5970,11 +5933,9 @@@ static int emulator_pio_in_out(struct k
  	return 0;
  }
  
 -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 -				    int size, unsigned short port, void *val,
 -				    unsigned int count)
 +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
 +			   unsigned short port, void *val, unsigned int count)
  {
 -	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
  	int ret;
  
  	if (vcpu->arch.pio.count)
@@@ -5955,30 -5994,17 +5955,30 @@@ data_avail
  	return 0;
  }
  
 -static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
 -				     int size, unsigned short port,
 -				     const void *val, unsigned int count)
 +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 +				    int size, unsigned short port, void *val,
 +				    unsigned int count)
  {
 -	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 +	return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
 +
 +}
  
 +static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
 +			    unsigned short port, const void *val,
 +			    unsigned int count)
 +{
  	memcpy(vcpu->arch.pio_data, val, size * count);
  	trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
  	return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
  }
  
 +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
 +				     int size, unsigned short port,
 +				     const void *val, unsigned int count)
 +{
 +	return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
 +}
 +
  static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
  {
  	return kvm_x86_ops->get_segment_base(vcpu, seg);
@@@ -6241,15 -6267,13 +6241,15 @@@ static int emulator_intercept(struct x8
  			      struct x86_instruction_info *info,
  			      enum x86_intercept_stage stage)
  {
 -	return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
 +	return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage,
 +					    &ctxt->exception);
  }
  
  static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
 -			u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
 +			      u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
 +			      bool exact_only)
  {
 -	return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
 +	return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
  }
  
  static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
@@@ -6376,7 -6400,7 +6376,7 @@@ static void toggle_interruptibility(str
  
  static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
  {
 -	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
  	if (ctxt->exception.vector == PF_VECTOR)
  		return kvm_propagate_fault(vcpu, &ctxt->exception);
  
@@@ -6388,31 -6412,13 +6388,31 @@@
  	return false;
  }
  
 +static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
 +{
 +	struct x86_emulate_ctxt *ctxt;
 +
 +	ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
 +	if (!ctxt) {
 +		pr_err("kvm: failed to allocate vcpu's emulator\n");
 +		return NULL;
 +	}
 +
 +	ctxt->vcpu = vcpu;
 +	ctxt->ops = &emulate_ops;
 +	vcpu->arch.emulate_ctxt = ctxt;
 +
 +	return ctxt;
 +}
 +
  static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
  {
 -	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
  	int cs_db, cs_l;
  
  	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
  
 +	ctxt->gpa_available = false;
  	ctxt->eflags = kvm_get_rflags(vcpu);
  	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
  
@@@ -6432,7 -6438,7 +6432,7 @@@
  
  void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
  {
 -	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
  	int ret;
  
  	init_emulate_ctxt(vcpu);
@@@ -6488,11 -6494,10 +6488,11 @@@ static bool reexecute_instruction(struc
  	gpa_t gpa = cr2_or_gpa;
  	kvm_pfn_t pfn;
  
 -	if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
 +	if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
  		return false;
  
 -	if (WARN_ON_ONCE(is_guest_mode(vcpu)))
 +	if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
 +	    WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
  		return false;
  
  	if (!vcpu->arch.mmu->direct_map) {
@@@ -6580,11 -6585,10 +6580,11 @@@ static bool retry_instruction(struct x8
  	 */
  	vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
  
 -	if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
 +	if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
  		return false;
  
 -	if (WARN_ON_ONCE(is_guest_mode(vcpu)))
 +	if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
 +	    WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
  		return false;
  
  	if (x86_page_table_writing_insn(ctxt))
@@@ -6747,7 -6751,7 +6747,7 @@@ int x86_emulate_instruction(struct kvm_
  			    int emulation_type, void *insn, int insn_len)
  {
  	int r;
 -	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
  	bool writeback = true;
  	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
  
@@@ -6837,19 -6841,8 +6837,19 @@@
  	}
  
  restart:
 -	/* Save the faulting GPA (cr2) in the address field */
 -	ctxt->exception.address = cr2_or_gpa;
 +	if (emulation_type & EMULTYPE_PF) {
 +		/* Save the faulting GPA (cr2) in the address field */
 +		ctxt->exception.address = cr2_or_gpa;
 +
 +		/* With shadow page tables, cr2 contains a GVA or nGPA. */
 +		if (vcpu->arch.mmu->direct_map) {
 +			ctxt->gpa_available = true;
 +			ctxt->gpa_val = cr2_or_gpa;
 +		}
 +	} else {
 +		/* Sanitize the address out of an abundance of paranoia. */
 +		ctxt->exception.address = 0;
 +	}
  
  	r = x86_emulate_insn(ctxt);
  
@@@ -6950,8 -6943,8 +6950,8 @@@ static int kvm_fast_pio_out(struct kvm_
  			    unsigned short port)
  {
  	unsigned long val = kvm_rax_read(vcpu);
 -	int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
 -					    size, port, &val, 1);
 +	int ret = emulator_pio_out(vcpu, size, port, &val, 1);
 +
  	if (ret)
  		return ret;
  
@@@ -6987,10 -6980,11 +6987,10 @@@ static int complete_fast_pio_in(struct 
  	val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
  
  	/*
 -	 * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
 +	 * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform
  	 * the copy and tracing
  	 */
 -	emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
 -				 vcpu->arch.pio.port, &val, 1);
 +	emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
  	kvm_rax_write(vcpu, val);
  
  	return kvm_skip_emulated_instruction(vcpu);
@@@ -7005,7 -6999,8 +7005,7 @@@ static int kvm_fast_pio_in(struct kvm_v
  	/* For size less than 4 we merge, else we zero extend */
  	val = (size < 4) ? kvm_rax_read(vcpu) : 0;
  
 -	ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
 -				       &val, 1);
 +	ret = emulator_pio_in(vcpu, size, port, &val, 1);
  	if (ret) {
  		kvm_rax_write(vcpu, val);
  		return ret;
@@@ -7195,15 -7190,15 +7195,15 @@@ static void kvm_timer_init(void
  
  	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
  #ifdef CONFIG_CPU_FREQ
- 		struct cpufreq_policy policy;
+ 		struct cpufreq_policy *policy;
  		int cpu;
  
- 		memset(&policy, 0, sizeof(policy));
  		cpu = get_cpu();
- 		cpufreq_get_policy(&policy, cpu);
- 		if (policy.cpuinfo.max_freq)
- 			max_tsc_khz = policy.cpuinfo.max_freq;
+ 		policy = cpufreq_cpu_get(cpu);
+ 		if (policy && policy->cpuinfo.max_freq)
+ 			max_tsc_khz = policy->cpuinfo.max_freq;
  		put_cpu();
+ 		cpufreq_cpu_put(policy);
  #endif
  		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
  					  CPUFREQ_TRANSITION_NOTIFIER);
@@@ -7313,12 -7308,12 +7313,12 @@@ int kvm_arch_init(void *opaque
  	}
  
  	if (!ops->cpu_has_kvm_support()) {
- 		printk(KERN_ERR "kvm: no hardware support\n");
+ 		pr_err_ratelimited("kvm: no hardware support\n");
  		r = -EOPNOTSUPP;
  		goto out;
  	}
  	if (ops->disabled_by_bios()) {
- 		printk(KERN_ERR "kvm: disabled by bios\n");
+ 		pr_err_ratelimited("kvm: disabled by bios\n");
  		r = -EOPNOTSUPP;
  		goto out;
  	}
@@@ -7343,16 -7338,10 +7343,16 @@@
  		goto out;
  	}
  
 +	x86_emulator_cache = kvm_alloc_emulator_cache();
 +	if (!x86_emulator_cache) {
 +		pr_err("kvm: failed to allocate cache for x86 emulator\n");
 +		goto out_free_x86_fpu_cache;
 +	}
 +
  	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
  	if (!shared_msrs) {
  		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
 -		goto out_free_x86_fpu_cache;
 +		goto out_free_x86_emulator_cache;
  	}
  
  	r = kvm_mmu_module_init();
@@@ -7368,10 -7357,8 +7368,10 @@@
  
  	perf_register_guest_info_callbacks(&kvm_guest_cbs);
  
 -	if (boot_cpu_has(X86_FEATURE_XSAVE))
 +	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
  		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 +		supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
 +	}
  
  	kvm_lapic_init();
  	if (pi_inject_timer == -1)
@@@ -7387,8 -7374,6 +7387,8 @@@
  
  out_free_percpu:
  	free_percpu(shared_msrs);
 +out_free_x86_emulator_cache:
 +	kmem_cache_destroy(x86_emulator_cache);
  out_free_x86_fpu_cache:
  	kmem_cache_destroy(x86_fpu_cache);
  out:
@@@ -7646,7 -7631,7 +7646,7 @@@ static void update_cr8_intercept(struc
  	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
  }
  
 -static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 +static int inject_pending_event(struct kvm_vcpu *vcpu)
  {
  	int r;
  
@@@ -7682,7 -7667,7 +7682,7 @@@
  	 * from L2 to L1.
  	 */
  	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
 -		r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
 +		r = kvm_x86_ops->check_nested_events(vcpu);
  		if (r != 0)
  			return r;
  	}
@@@ -7744,7 -7729,7 +7744,7 @@@
  		 * KVM_REQ_EVENT only on certain events and not unconditionally?
  		 */
  		if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
 -			r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
 +			r = kvm_x86_ops->check_nested_events(vcpu);
  			if (r != 0)
  				return r;
  		}
@@@ -8054,26 -8039,19 +8054,26 @@@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv
   */
  void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
  {
 +	unsigned long old, new, expected;
 +
  	if (!kvm_x86_ops->check_apicv_inhibit_reasons ||
  	    !kvm_x86_ops->check_apicv_inhibit_reasons(bit))
  		return;
  
 -	if (activate) {
 -		if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
 -		    !kvm_apicv_activated(kvm))
 -			return;
 -	} else {
 -		if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
 -		    kvm_apicv_activated(kvm))
 -			return;
 -	}
 +	old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
 +	do {
 +		expected = new = old;
 +		if (activate)
 +			__clear_bit(bit, &new);
 +		else
 +			__set_bit(bit, &new);
 +		if (new == old)
 +			break;
 +		old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
 +	} while (old != expected);
 +
 +	if (!!old == !!new)
 +		return;
  
  	trace_kvm_apicv_update_request(activate, bit);
  	if (kvm_x86_ops->pre_update_apicv_exec_ctrl)
@@@ -8198,8 -8176,8 +8198,8 @@@ static int vcpu_enter_guest(struct kvm_
  		}
  		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
  			kvm_mmu_sync_roots(vcpu);
 -		if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
 -			kvm_mmu_load_cr3(vcpu);
 +		if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
 +			kvm_mmu_load_pgd(vcpu);
  		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
  			kvm_vcpu_flush_tlb(vcpu, true);
  		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
@@@ -8284,7 -8262,7 +8284,7 @@@
  			goto out;
  		}
  
 -		if (inject_pending_event(vcpu, req_int_win) != 0)
 +		if (inject_pending_event(vcpu) != 0)
  			req_immediate_exit = true;
  		else {
  			/* Enable SMI/NMI/IRQ window open exits if needed.
@@@ -8465,6 -8443,7 +8465,6 @@@
  	if (vcpu->arch.apic_attention)
  		kvm_lapic_sync_from_vapic(vcpu);
  
 -	vcpu->arch.gpa_available = false;
  	r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath);
  	return r;
  
@@@ -8505,6 -8484,7 +8505,6 @@@ static inline int vcpu_block(struct kv
  		break;
  	default:
  		return -EINTR;
 -		break;
  	}
  	return 1;
  }
@@@ -8512,7 -8492,7 +8512,7 @@@
  static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
  {
  	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
 -		kvm_x86_ops->check_nested_events(vcpu, false);
 +		kvm_x86_ops->check_nested_events(vcpu);
  
  	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
  		!vcpu->arch.apf.halted);
@@@ -8773,7 -8753,7 +8773,7 @@@ static void __get_regs(struct kvm_vcpu 
  		 * that usually, but some bad designed PV devices (vmware
  		 * backdoor interface) need this to work
  		 */
 -		emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
 +		emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
  		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
  	}
  	regs->rax = kvm_rax_read(vcpu);
@@@ -8959,7 -8939,7 +8959,7 @@@ out
  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
  		    int reason, bool has_error_code, u32 error_code)
  {
 -	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
  	int ret;
  
  	init_emulate_ctxt(vcpu);
@@@ -9291,6 -9271,7 +9291,6 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
  	struct page *page;
  	int r;
  
 -	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
  	if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
  	else
@@@ -9328,14 -9309,11 +9328,14 @@@
  				GFP_KERNEL_ACCOUNT))
  		goto fail_free_mce_banks;
  
 +	if (!alloc_emulate_ctxt(vcpu))
 +		goto free_wbinvd_dirty_mask;
 +
  	vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
  						GFP_KERNEL_ACCOUNT);
  	if (!vcpu->arch.user_fpu) {
  		pr_err("kvm: failed to allocate userspace's fpu\n");
 -		goto free_wbinvd_dirty_mask;
 +		goto free_emulate_ctxt;
  	}
  
  	vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
@@@ -9377,8 -9355,6 +9377,8 @@@ free_guest_fpu
  	kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
  free_user_fpu:
  	kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
 +free_emulate_ctxt:
 +	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
  free_wbinvd_dirty_mask:
  	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
  fail_free_mce_banks:
@@@ -9413,9 -9389,11 +9413,9 @@@ void kvm_arch_vcpu_postcreate(struct kv
  
  	mutex_unlock(&vcpu->mutex);
  
 -	if (!kvmclock_periodic_sync)
 -		return;
 -
 -	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 -					KVMCLOCK_SYNC_PERIOD);
 +	if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
 +		schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 +						KVMCLOCK_SYNC_PERIOD);
  }
  
  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@@ -9429,7 -9407,6 +9429,7 @@@
  
  	kvm_x86_ops->vcpu_free(vcpu);
  
 +	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
  	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
  	kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
  	kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
@@@ -9630,18 -9607,10 +9630,18 @@@ int kvm_arch_hardware_setup(void
  {
  	int r;
  
 +	rdmsrl_safe(MSR_EFER, &host_efer);
 +
 +	if (boot_cpu_has(X86_FEATURE_XSAVES))
 +		rdmsrl(MSR_IA32_XSS, host_xss);
 +
  	r = kvm_x86_ops->hardware_setup();
  	if (r != 0)
  		return r;
  
 +	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
 +		supported_xss = 0;
 +
  	cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
  
  	if (kvm_has_tsc_control) {
@@@ -9658,6 -9627,9 +9658,6 @@@
  		kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
  	}
  
 -	if (boot_cpu_has(X86_FEATURE_XSAVES))
 -		rdmsrl(MSR_IA32_XSS, host_xss);
 -
  	kvm_init_msr_list();
  	return 0;
  }
@@@ -9705,13 -9677,6 +9705,13 @@@ void kvm_arch_sched_in(struct kvm_vcpu 
  	kvm_x86_ops->sched_in(vcpu, cpu);
  }
  
 +void kvm_arch_free_vm(struct kvm *kvm)
 +{
 +	kfree(kvm->arch.hyperv.hv_pa_pg);
 +	vfree(kvm);
 +}
 +
 +
  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
  {
  	if (type)
@@@ -9794,9 -9759,9 +9794,9 @@@ void kvm_arch_sync_events(struct kvm *k
  int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
  {
  	int i, r;
 -	unsigned long hva;
 +	unsigned long hva, uninitialized_var(old_npages);
  	struct kvm_memslots *slots = kvm_memslots(kvm);
 -	struct kvm_memory_slot *slot, old;
 +	struct kvm_memory_slot *slot;
  
  	/* Called with kvm->slots_lock held.  */
  	if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
@@@ -9804,7 -9769,7 +9804,7 @@@
  
  	slot = id_to_memslot(slots, id);
  	if (size) {
 -		if (slot->npages)
 +		if (slot && slot->npages)
  			return -EEXIST;
  
  		/*
@@@ -9816,18 -9781,13 +9816,18 @@@
  		if (IS_ERR((void *)hva))
  			return PTR_ERR((void *)hva);
  	} else {
 -		if (!slot->npages)
 +		if (!slot || !slot->npages)
  			return 0;
  
 -		hva = 0;
 +		/*
 +		 * Stuff a non-canonical value to catch use-after-delete.  This
 +		 * ends up being 0 on 32-bit KVM, but there's no better
 +		 * alternative.
 +		 */
 +		hva = (unsigned long)(0xdeadull << 48);
 +		old_npages = slot->npages;
  	}
  
 -	old = *slot;
  	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
  		struct kvm_userspace_memory_region m;
  
@@@ -9842,7 -9802,7 +9842,7 @@@
  	}
  
  	if (!size)
 -		vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
 +		vm_munmap(hva, old_npages * PAGE_SIZE);
  
  	return 0;
  }
@@@ -9881,36 -9841,34 +9881,36 @@@ void kvm_arch_destroy_vm(struct kvm *kv
  	kvm_hv_destroy_vm(kvm);
  }
  
 -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 -			   struct kvm_memory_slot *dont)
 +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
  {
  	int i;
  
  	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 -		if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
 -			kvfree(free->arch.rmap[i]);
 -			free->arch.rmap[i] = NULL;
 -		}
 +		kvfree(slot->arch.rmap[i]);
 +		slot->arch.rmap[i] = NULL;
 +
  		if (i == 0)
  			continue;
  
 -		if (!dont || free->arch.lpage_info[i - 1] !=
 -			     dont->arch.lpage_info[i - 1]) {
 -			kvfree(free->arch.lpage_info[i - 1]);
 -			free->arch.lpage_info[i - 1] = NULL;
 -		}
 +		kvfree(slot->arch.lpage_info[i - 1]);
 +		slot->arch.lpage_info[i - 1] = NULL;
  	}
  
 -	kvm_page_track_free_memslot(free, dont);
 +	kvm_page_track_free_memslot(slot);
  }
  
 -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 -			    unsigned long npages)
 +static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
 +				      unsigned long npages)
  {
  	int i;
  
 +	/*
 +	 * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
 +	 * old arrays will be freed by __kvm_set_memory_region() if installing
 +	 * the new memslot is successful.
 +	 */
 +	memset(&slot->arch, 0, sizeof(slot->arch));
 +
  	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
  		struct kvm_lpage_info *linfo;
  		unsigned long ugfn;
@@@ -9941,9 -9899,11 +9941,9 @@@
  		ugfn = slot->userspace_addr >> PAGE_SHIFT;
  		/*
  		 * If the gfn and userspace address are not aligned wrt each
 -		 * other, or if explicitly asked to, disable large page
 -		 * support for this slot
 +		 * other, disable large page support for this slot.
  		 */
 -		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
 -		    !kvm_largepages_enabled()) {
 +		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
  			unsigned long j;
  
  			for (j = 0; j < lpages; ++j)
@@@ -9990,9 -9950,6 +9990,9 @@@ int kvm_arch_prepare_memory_region(stru
  				const struct kvm_userspace_memory_region *mem,
  				enum kvm_mr_change change)
  {
 +	if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
 +		return kvm_alloc_memslot_metadata(memslot,
 +						  mem->memory_size >> PAGE_SHIFT);
  	return 0;
  }
  
@@@ -10001,7 -9958,7 +10001,7 @@@ static void kvm_mmu_slot_apply_flags(st
  {
  	/* Still write protect RO slot */
  	if (new->flags & KVM_MEM_READONLY) {
 -		kvm_mmu_slot_remove_write_access(kvm, new);
 +		kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL);
  		return;
  	}
  
@@@ -10036,23 -9993,10 +10036,23 @@@
  	 * See the comments in fast_page_fault().
  	 */
  	if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
 -		if (kvm_x86_ops->slot_enable_log_dirty)
 +		if (kvm_x86_ops->slot_enable_log_dirty) {
  			kvm_x86_ops->slot_enable_log_dirty(kvm, new);
 -		else
 -			kvm_mmu_slot_remove_write_access(kvm, new);
 +		} else {
 +			int level =
 +				kvm_dirty_log_manual_protect_and_init_set(kvm) ?
 +				PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL;
 +
 +			/*
 +			 * If we're with initial-all-set, we don't need
 +			 * to write protect any small page because
 +			 * they're reported as dirty already.  However
 +			 * we still need to write-protect huge pages
 +			 * so that the page split can happen lazily on
 +			 * the first write to the huge page.
 +			 */
 +			kvm_mmu_slot_remove_write_access(kvm, new, level);
 +		}
  	} else {
  		if (kvm_x86_ops->slot_disable_log_dirty)
  			kvm_x86_ops->slot_disable_log_dirty(kvm, new);
@@@ -10061,7 -10005,7 +10061,7 @@@
  
  void kvm_arch_commit_memory_region(struct kvm *kvm,
  				const struct kvm_userspace_memory_region *mem,
 -				const struct kvm_memory_slot *old,
 +				struct kvm_memory_slot *old,
  				const struct kvm_memory_slot *new,
  				enum kvm_mr_change change)
  {
@@@ -10103,10 -10047,6 +10103,10 @@@
  	 */
  	if (change != KVM_MR_DELETE)
  		kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
 +
 +	/* Free the arrays associated with the old memslot. */
 +	if (change == KVM_MR_MOVE)
 +		kvm_arch_free_memslot(kvm, old);
  }
  
  void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@@ -10251,7 -10191,7 +10251,7 @@@ void kvm_arch_async_page_ready(struct k
  		return;
  
  	if (!vcpu->arch.mmu->direct_map &&
 -	      work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
 +	      work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
  		return;
  
  	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
@@@ -10574,5 -10514,4 +10574,5 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_fu
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
 +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
diff --combined include/linux/kvm_host.h
index b19dee4,bcb9b2a..f6a1905
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@@ -360,10 -360,6 +360,10 @@@ static inline unsigned long *kvm_second
  	return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap);
  }
  
 +#ifndef KVM_DIRTY_LOG_MANUAL_CAPS
 +#define KVM_DIRTY_LOG_MANUAL_CAPS KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE
 +#endif
 +
  struct kvm_s390_adapter_int {
  	u64 ind_addr;
  	u64 summary_addr;
@@@ -435,11 -431,11 +435,11 @@@ static inline int kvm_arch_vcpu_memslot
   */
  struct kvm_memslots {
  	u64 generation;
 -	struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM];
  	/* The mapping table from slot id to the index in memslots[]. */
  	short id_to_index[KVM_MEM_SLOTS_NUM];
  	atomic_t lru_slot;
  	int used_slots;
 +	struct kvm_memory_slot memslots[];
  };
  
  struct kvm {
@@@ -497,7 -493,7 +497,7 @@@
  #endif
  	long tlbs_dirty;
  	struct list_head devices;
 -	bool manual_dirty_log_protect;
 +	u64 manual_dirty_log_protect;
  	struct dentry *debugfs_dentry;
  	struct kvm_stat_data **debugfs_stat_data;
  	struct srcu_struct srcu;
@@@ -531,11 -527,6 +531,11 @@@
  #define vcpu_err(vcpu, fmt, ...)					\
  	kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
  
 +static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm)
 +{
 +	return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET);
 +}
 +
  static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx)
  {
  	return srcu_dereference_check(kvm->buses[idx], &kvm->srcu,
@@@ -581,11 -572,10 +581,11 @@@ static inline int kvm_vcpu_get_idx(stru
  	return vcpu->vcpu_idx;
  }
  
 -#define kvm_for_each_memslot(memslot, slots)	\
 -	for (memslot = &slots->memslots[0];	\
 -	      memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\
 -		memslot++)
 +#define kvm_for_each_memslot(memslot, slots)				\
 +	for (memslot = &slots->memslots[0];				\
 +	     memslot < slots->memslots + slots->used_slots; memslot++)	\
 +		if (WARN_ON_ONCE(!memslot->npages)) {			\
 +		} else
  
  void kvm_vcpu_destroy(struct kvm_vcpu *vcpu);
  
@@@ -645,15 -635,12 +645,15 @@@ static inline struct kvm_memslots *kvm_
  	return __kvm_memslots(vcpu->kvm, as_id);
  }
  
 -static inline struct kvm_memory_slot *
 -id_to_memslot(struct kvm_memslots *slots, int id)
 +static inline
 +struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id)
  {
  	int index = slots->id_to_index[id];
  	struct kvm_memory_slot *slot;
  
 +	if (index < 0)
 +		return NULL;
 +
  	slot = &slots->memslots[index];
  
  	WARN_ON(slot->id != id);
@@@ -682,7 -669,10 +682,7 @@@ int kvm_set_memory_region(struct kvm *k
  			  const struct kvm_userspace_memory_region *mem);
  int __kvm_set_memory_region(struct kvm *kvm,
  			    const struct kvm_userspace_memory_region *mem);
 -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 -			   struct kvm_memory_slot *dont);
 -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 -			    unsigned long npages);
 +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
  void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
  				struct kvm_memory_slot *memslot,
@@@ -690,9 -680,11 +690,9 @@@
  				enum kvm_mr_change change);
  void kvm_arch_commit_memory_region(struct kvm *kvm,
  				const struct kvm_userspace_memory_region *mem,
 -				const struct kvm_memory_slot *old,
 +				struct kvm_memory_slot *old,
  				const struct kvm_memory_slot *new,
  				enum kvm_mr_change change);
 -bool kvm_largepages_enabled(void);
 -void kvm_disable_largepages(void);
  /* flush all memory translations */
  void kvm_arch_flush_shadow_all(struct kvm *kvm);
  /* flush memory translations pointing to 'slot' */
@@@ -712,6 -704,7 +712,6 @@@ void kvm_release_page_clean(struct pag
  void kvm_release_page_dirty(struct page *page);
  void kvm_set_page_accessed(struct page *page);
  
 -kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
  kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
  kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
  		      bool *writable);
@@@ -826,20 -819,23 +826,20 @@@ vm_fault_t kvm_arch_vcpu_fault(struct k
  
  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext);
  
 -int kvm_get_dirty_log(struct kvm *kvm,
 -			struct kvm_dirty_log *log, int *is_dirty);
 -
 -int kvm_get_dirty_log_protect(struct kvm *kvm,
 -			      struct kvm_dirty_log *log, bool *flush);
 -int kvm_clear_dirty_log_protect(struct kvm *kvm,
 -				struct kvm_clear_dirty_log *log, bool *flush);
 -
  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
  					struct kvm_memory_slot *slot,
  					gfn_t gfn_offset,
  					unsigned long mask);
 -
 -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 -				struct kvm_dirty_log *log);
 -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
 -				  struct kvm_clear_dirty_log *log);
 +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);
 +
 +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
 +					struct kvm_memory_slot *memslot);
 +#else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);
 +int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
 +		      int *is_dirty, struct kvm_memory_slot **memslot);
 +#endif
  
  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
  			bool line_status);
@@@ -1022,8 -1018,6 +1022,8 @@@ bool kvm_arch_irqfd_allowed(struct kvm 
   * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
   * gfn_to_memslot() itself isn't here as an inline because that would
   * bloat other code too much.
 + *
 + * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!
   */
  static inline struct kvm_memory_slot *
  search_memslots(struct kvm_memslots *slots, gfn_t gfn)
@@@ -1032,9 -1026,6 +1032,9 @@@
  	int slot = atomic_read(&slots->lru_slot);
  	struct kvm_memory_slot *memslots = slots->memslots;
  
 +	if (unlikely(!slots->used_slots))
 +		return NULL;
 +
  	if (gfn >= memslots[slot].base_gfn &&
  	    gfn < memslots[slot].base_gfn + memslots[slot].npages)
  		return &memslots[slot];
@@@ -1353,7 -1344,7 +1353,7 @@@ static inline void kvm_vcpu_set_dy_elig
  #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
  
  struct kvm_vcpu *kvm_get_running_vcpu(void);
- struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
+ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
  
  #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
  bool kvm_arch_has_irq_bypass(void);
diff --combined virt/kvm/arm/arm.c
index bfdba1c,4d864f8..376c6a7
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@@ -625,6 -625,14 +625,14 @@@ static void check_vcpu_requests(struct 
  
  		if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
  			kvm_update_stolen_time(vcpu);
+ 
+ 		if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
+ 			/* The distributor enable bits were changed */
+ 			preempt_disable();
+ 			vgic_v4_put(vcpu, false);
+ 			vgic_v4_load(vcpu);
+ 			preempt_enable();
+ 		}
  	}
  }
  
@@@ -742,9 -750,7 +750,7 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
  		guest_enter_irqoff();
  
  		if (has_vhe()) {
- 			kvm_arm_vhe_guest_enter();
  			ret = kvm_vcpu_run_vhe(vcpu);
- 			kvm_arm_vhe_guest_exit();
  		} else {
  			ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu);
  		}
@@@ -1183,15 -1189,55 +1189,15 @@@ long kvm_arch_vcpu_ioctl(struct file *f
  	return r;
  }
  
 -/**
 - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 - * @kvm: kvm instance
 - * @log: slot id and address to which we copy the log
 - *
 - * Steps 1-4 below provide general overview of dirty page logging. See
 - * kvm_get_dirty_log_protect() function description for additional details.
 - *
 - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
 - * always flush the TLB (step 4) even if previous step failed  and the dirty
 - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
 - * does not preclude user space subsequent dirty log read. Flushing TLB ensures
 - * writes will be marked dirty for next log read.
 - *
 - *   1. Take a snapshot of the bit and clear it if needed.
 - *   2. Write protect the corresponding page.
 - *   3. Copy the snapshot to the userspace.
 - *   4. Flush TLB's if needed.
 - */
 -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
  {
 -	bool flush = false;
 -	int r;
 -
 -	mutex_lock(&kvm->slots_lock);
 -
 -	r = kvm_get_dirty_log_protect(kvm, log, &flush);
  
 -	if (flush)
 -		kvm_flush_remote_tlbs(kvm);
 -
 -	mutex_unlock(&kvm->slots_lock);
 -	return r;
  }
  
 -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
 +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
 +					struct kvm_memory_slot *memslot)
  {
 -	bool flush = false;
 -	int r;
 -
 -	mutex_lock(&kvm->slots_lock);
 -
 -	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
 -
 -	if (flush)
 -		kvm_flush_remote_tlbs(kvm);
 -
 -	mutex_unlock(&kvm->slots_lock);
 -	return r;
 +	kvm_flush_remote_tlbs(kvm);
  }
  
  static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,