Merge tag 'kvm-4.20-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Oct 2018 00:57:35 +0000 (17:57 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Oct 2018 00:57:35 +0000 (17:57 -0700)
Pull KVM updates from Radim Krčmář:
 "ARM:
   - Improved guest IPA space support (32 to 52 bits)

   - RAS event delivery for 32bit

   - PMU fixes

   - Guest entry hardening

   - Various cleanups

   - Port of dirty_log_test selftest

  PPC:
   - Nested HV KVM support for radix guests on POWER9. The performance
     is much better than with PR KVM. Migration and arbitrary levels of
     nesting are supported.

   - Disable nested HV-KVM on early POWER9 chips that need a particular
     hardware bug workaround

   - One VM per core mode to prevent potential data leaks

   - PCI pass-through optimization

   - Merge ppc-kvm topic branch and kvm-ppc-fixes to get a better base

  s390:
   - Initial version of AP crypto virtualization via vfio-mdev

   - Improvement for vfio-ap

   - Set the host program identifier

   - Optimize page table locking

  x86:
   - Enable nested virtualization by default

   - Implement Hyper-V IPI hypercalls

   - Improve #PF and #DB handling

   - Allow guests to use Enlightened VMCS

   - Add migration selftests for VMCS and Enlightened VMCS

   - Allow coalesced PIO accesses

   - Add an option to perform nested VMCS host state consistency check
     through hardware

   - Automatic tuning of lapic_timer_advance_ns

   - Many fixes, minor improvements, and cleanups"

* tag 'kvm-4.20-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (204 commits)
  KVM/nVMX: Do not validate that posted_intr_desc_addr is page aligned
  Revert "kvm: x86: optimize dr6 restore"
  KVM: PPC: Optimize clearing TCEs for sparse tables
  x86/kvm/nVMX: tweak shadow fields
  selftests/kvm: add missing executables to .gitignore
  KVM: arm64: Safety check PSTATE when entering guest and handle IL
  KVM: PPC: Book3S HV: Don't use streamlined entry path on early POWER9 chips
  arm/arm64: KVM: Enable 32 bits kvm vcpu events support
  arm/arm64: KVM: Rename function kvm_arch_dev_ioctl_check_extension()
  KVM: arm64: Fix caching of host MDCR_EL2 value
  KVM: VMX: enable nested virtualization by default
  KVM/x86: Use 32bit xor to clear registers in svm.c
  kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD
  kvm: vmx: Defer setting of DR6 until #DB delivery
  kvm: x86: Defer setting of CR2 until #PF delivery
  kvm: x86: Add payload operands to kvm_multiple_exception
  kvm: x86: Add exception payload fields to kvm_vcpu_events
  kvm: x86: Add has_payload and payload to kvm_queued_exception
  KVM: Documentation: Fix omission in struct kvm_vcpu_events
  KVM: selftests: add Enlightened VMCS test
  ...

138 files changed:
Documentation/s390/vfio-ap.txt [new file with mode: 0644]
Documentation/virtual/kvm/api.txt
MAINTAINERS
arch/arm/include/asm/kvm_arm.h
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_mmu.h
arch/arm/include/asm/stage2_pgtable.h
arch/arm64/include/asm/cpufeature.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_hyp.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/ptrace.h
arch/arm64/include/asm/stage2_pgtable-nopmd.h [deleted file]
arch/arm64/include/asm/stage2_pgtable-nopud.h [deleted file]
arch/arm64/include/asm/stage2_pgtable.h
arch/arm64/kvm/guest.c
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/hyp/Makefile
arch/arm64/kvm/hyp/hyp-entry.S
arch/arm64/kvm/hyp/s2-setup.c [deleted file]
arch/arm64/kvm/hyp/switch.c
arch/arm64/kvm/hyp/sysreg-sr.c
arch/arm64/kvm/hyp/tlb.c
arch/arm64/kvm/reset.c
arch/powerpc/include/asm/asm-prototypes.h
arch/powerpc/include/asm/book3s/64/mmu-hash.h
arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
arch/powerpc/include/asm/hvcall.h
arch/powerpc/include/asm/iommu.h
arch/powerpc/include/asm/kvm_asm.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_booke.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/ppc-opcode.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/uapi/asm/kvm.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/cpu_setup_power.S
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_64_mmu_radix.c
arch/powerpc/kvm/book3s_64_vio.c
arch/powerpc/kvm/book3s_64_vio_hv.c
arch/powerpc/kvm/book3s_emulate.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_interrupts.S
arch/powerpc/kvm/book3s_hv_nested.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_hv_ras.c
arch/powerpc/kvm/book3s_hv_rm_xics.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_hv_tm.c
arch/powerpc/kvm/book3s_hv_tm_builtin.c
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/book3s_xics.c
arch/powerpc/kvm/book3s_xive.c
arch/powerpc/kvm/book3s_xive_template.c
arch/powerpc/kvm/bookehv_interrupts.S
arch/powerpc/kvm/emulate_loadstore.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/tm.S
arch/powerpc/kvm/trace_book3s.h
arch/powerpc/mm/tlb-radix.c
arch/s390/Kconfig
arch/s390/include/asm/kvm_host.h
arch/s390/include/uapi/asm/kvm.h
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/vsie.c
arch/s390/mm/gmap.c
arch/s390/tools/gen_facilities.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/virtext.h
arch/x86/include/asm/vmx.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/kvm/hyperv.c
arch/x86/kvm/hyperv.h
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu_audit.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx.c
arch/x86/kvm/vmx_shadow_fields.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
drivers/iommu/Kconfig
drivers/s390/crypto/Makefile
drivers/s390/crypto/vfio_ap_drv.c [new file with mode: 0644]
drivers/s390/crypto/vfio_ap_ops.c [new file with mode: 0644]
drivers/s390/crypto/vfio_ap_private.h [new file with mode: 0644]
drivers/vfio/vfio_iommu_spapr_tce.c
include/linux/irqchip/arm-gic-v3.h
include/uapi/linux/kvm.h
include/uapi/linux/vfio.h
tools/arch/x86/include/uapi/asm/kvm.h
tools/include/uapi/linux/kvm.h
tools/perf/arch/powerpc/util/book3s_hv_exits.h
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/dirty_log_test.c
tools/testing/selftests/kvm/include/aarch64/processor.h [new file with mode: 0644]
tools/testing/selftests/kvm/include/evmcs.h [new file with mode: 0644]
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/include/sparsebit.h
tools/testing/selftests/kvm/include/test_util.h
tools/testing/selftests/kvm/include/x86_64/processor.h [moved from tools/testing/selftests/kvm/include/x86.h with 97% similarity]
tools/testing/selftests/kvm/include/x86_64/vmx.h [moved from tools/testing/selftests/kvm/include/vmx.h with 96% similarity]
tools/testing/selftests/kvm/lib/aarch64/processor.c [new file with mode: 0644]
tools/testing/selftests/kvm/lib/assert.c
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/lib/kvm_util_internal.h
tools/testing/selftests/kvm/lib/ucall.c [new file with mode: 0644]
tools/testing/selftests/kvm/lib/x86_64/processor.c [moved from tools/testing/selftests/kvm/lib/x86.c with 80% similarity]
tools/testing/selftests/kvm/lib/x86_64/vmx.c [moved from tools/testing/selftests/kvm/lib/vmx.c with 88% similarity]
tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c [moved from tools/testing/selftests/kvm/cr4_cpuid_sync_test.c with 91% similarity]
tools/testing/selftests/kvm/x86_64/evmcs_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/platform_info_test.c [moved from tools/testing/selftests/kvm/platform_info_test.c with 90% similarity]
tools/testing/selftests/kvm/x86_64/set_sregs_test.c [moved from tools/testing/selftests/kvm/set_sregs_test.c with 98% similarity]
tools/testing/selftests/kvm/x86_64/state_test.c [moved from tools/testing/selftests/kvm/state_test.c with 89% similarity]
tools/testing/selftests/kvm/x86_64/sync_regs_test.c [moved from tools/testing/selftests/kvm/sync_regs_test.c with 99% similarity]
tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c [moved from tools/testing/selftests/kvm/vmx_tsc_adjust_test.c with 91% similarity]
virt/kvm/arm/arm.c
virt/kvm/arm/mmu.c
virt/kvm/arm/vgic/vgic-its.c
virt/kvm/arm/vgic/vgic-kvm-device.c
virt/kvm/arm/vgic/vgic-mmio-v3.c
virt/kvm/coalesced_mmio.c
virt/kvm/kvm_main.c

diff --git a/Documentation/s390/vfio-ap.txt b/Documentation/s390/vfio-ap.txt
new file mode 100644 (file)
index 0000000..65167cf
--- /dev/null
@@ -0,0 +1,837 @@
+Introduction:
+============
+The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised
+of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards.
+The AP devices provide cryptographic functions to all CPUs assigned to a
+linux system running in an IBM Z system LPAR.
+
+The AP adapter cards are exposed via the AP bus. The motivation for vfio-ap
+is to make AP cards available to KVM guests using the VFIO mediated device
+framework. This implementation relies considerably on the s390 virtualization
+facilities which do most of the hard work of providing direct access to AP
+devices.
+
+AP Architectural Overview:
+=========================
+To facilitate the comprehension of the design, let's start with some
+definitions:
+
+* AP adapter
+
+  An AP adapter is an IBM Z adapter card that can perform cryptographic
+  functions. There can be from 0 to 256 adapters assigned to an LPAR. Adapters
+  assigned to the LPAR in which a linux host is running will be available to
+  the linux host. Each adapter is identified by a number from 0 to 255; however,
+  the maximum adapter number is determined by machine model and/or adapter type.
+  When installed, an AP adapter is accessed by AP instructions executed by any
+  CPU.
+
+  The AP adapter cards are assigned to a given LPAR via the system's Activation
+  Profile which can be edited via the HMC. When the linux host system is IPL'd
+  in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and
+  creates a sysfs device for each assigned adapter. For example, if AP adapters
+  4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following
+  sysfs device entries:
+
+    /sys/devices/ap/card04
+    /sys/devices/ap/card0a
+
+  Symbolic links to these devices will also be created in the AP bus devices
+  sub-directory:
+
+    /sys/bus/ap/devices/[card04]
+    /sys/bus/ap/devices/[card0a]
+
+* AP domain
+
+  An adapter is partitioned into domains. An adapter can hold up to 256 domains
+  depending upon the adapter type and hardware configuration. A domain is
+  identified by a number from 0 to 255; however, the maximum domain number is
+  determined by machine model and/or adapter type. A domain can be thought of
+  as a set of hardware registers and memory used for processing AP commands. A
+  domain can be configured with a secure private key used for clear key
+  encryption. A domain is classified in one of two ways depending upon how it
+  may be accessed:
+
+    * Usage domains are domains that are targeted by an AP instruction to
+      process an AP command.
+
+    * Control domains are domains that are changed by an AP command sent to a
+      usage domain; for example, to set the secure private key for the control
+      domain.
+
+  The AP usage and control domains are assigned to a given LPAR via the system's
+  Activation Profile which can be edited via the HMC. When a linux host system
+  is IPL'd in the LPAR, the AP bus module detects the AP usage and control
+  domains assigned to the LPAR. The domain number of each usage domain and
+  adapter number of each AP adapter are combined to create AP queue devices
+  (see AP Queue section below). The domain number of each control domain will be
+  represented in a bitmask and stored in a sysfs file
+  /sys/bus/ap/ap_control_domain_mask. The bits in the mask, from most to least
+  significant bit, correspond to domains 0-255.
+
+* AP Queue
+
+  An AP queue is the means by which an AP command is sent to a usage domain
+  inside a specific adapter. An AP queue is identified by a tuple
+  comprised of an AP adapter ID (APID) and an AP queue index (APQI). The
+  APQI corresponds to a given usage domain number within the adapter. This tuple
+  forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP
+  instructions include a field containing the APQN to identify the AP queue to
+  which the AP command is to be sent for processing.
+
+  The AP bus will create a sysfs device for each APQN that can be derived from
+  the cross product of the AP adapter and usage domain numbers detected when the
+  AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage
+  domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the
+  following sysfs entries:
+
+    /sys/devices/ap/card04/04.0006
+    /sys/devices/ap/card04/04.0047
+    /sys/devices/ap/card0a/0a.0006
+    /sys/devices/ap/card0a/0a.0047
+
+  The following symbolic links to these devices will be created in the AP bus
+  devices subdirectory:
+
+    /sys/bus/ap/devices/[04.0006]
+    /sys/bus/ap/devices/[04.0047]
+    /sys/bus/ap/devices/[0a.0006]
+    /sys/bus/ap/devices/[0a.0047]
+
+* AP Instructions:
+
+  There are three AP instructions:
+
+  * NQAP: to enqueue an AP command-request message to a queue
+  * DQAP: to dequeue an AP command-reply message from a queue
+  * PQAP: to administer the queues
+
+  AP instructions identify the domain that is targeted to process the AP
+  command; this must be one of the usage domains. An AP command may modify a
+  domain that is not one of the usage domains, but the modified domain
+  must be one of the control domains.
+
+AP and SIE:
+==========
+Let's now take a look at how AP instructions executed on a guest are interpreted
+by the hardware.
+
+A satellite control block called the Crypto Control Block (CRYCB) is attached to
+our main hardware virtualization control block. The CRYCB contains three fields
+to identify the adapters, usage domains and control domains assigned to the KVM
+guest:
+
+* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned
+  to the KVM guest. Each bit in the mask, from left to right (i.e. from most
+  significant to least significant bit in big endian order), corresponds to
+  an APID from 0-255. If a bit is set, the corresponding adapter is valid for
+  use by the KVM guest.
+
+* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains
+  assigned to the KVM guest. Each bit in the mask, from left to right (i.e. from
+  most significant to least significant bit in big endian order), corresponds to
+  an AP queue index (APQI) from 0-255. If a bit is set, the corresponding queue
+  is valid for use by the KVM guest.
+
+* The AP Domain Mask field is a bit mask that identifies the AP control domains
+  assigned to the KVM guest. The ADM bit mask controls which domains can be
+  changed by an AP command-request message sent to a usage domain from the
+  guest. Each bit in the mask, from left to right (i.e. from most significant to
+  least significant bit in big endian order), corresponds to a domain from
+  0-255. If a bit is set, the corresponding domain can be modified by an AP
+  command-request message sent to a usage domain.
+
+If you recall from the description of an AP Queue, AP instructions include
+an APQN to identify the AP queue to which an AP command-request message is to be
+sent (NQAP and PQAP instructions), or from which a command-reply message is to
+be received (DQAP instruction). The validity of an APQN is defined by the matrix
+calculated from the APM and AQM; it is the cross product of all assigned adapter
+numbers (APM) with all assigned queue indexes (AQM). For example, if adapters 1
+and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs (1,5), (1,6),
+(2,5) and (2,6) will be valid for the guest.
+
+The APQNs can provide secure key functionality - i.e., a private key is stored
+on the adapter card for each of its domains - so each APQN must be assigned to
+at most one guest or to the linux host.
+
+   Example 1: Valid configuration:
+   ------------------------------
+   Guest1: adapters 1,2  domains 5,6
+   Guest2: adapters 1,2  domain 7
+
+   This is valid because both guests have a unique set of APQNs:
+      Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
+      Guest2 has APQNs (1,7), (2,7)
+
+   Example 2: Valid configuration:
+   ------------------------------
+   Guest1: adapters 1,2 domains 5,6
+   Guest2: adapters 3,4 domains 5,6
+
+   This is also valid because both guests have a unique set of APQNs:
+      Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
+      Guest2 has APQNs (3,5), (3,6), (4,5), (4,6)
+
+   Example 3: Invalid configuration:
+   --------------------------------
+   Guest1: adapters 1,2  domains 5,6
+   Guest2: adapter  1    domains 6,7
+
+   This is an invalid configuration because both guests have access to
+   APQN (1,6).
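+
+A minimal sketch of this validity check, with illustrative helper names that
+are not part of the kernel sources: treating the APM and AQM as 32-byte arrays
+in which ID 0 is the most significant bit of byte 0, an APQN is usable by the
+guest exactly when both of its bits are set:
+
+      #include <stdbool.h>
+      #include <stdint.h>
+
+      /* 256-bit mask stored big endian: ID 0 is the MSB of byte 0. */
+      static bool mask_test_bit(const uint8_t mask[32], unsigned int id)
+      {
+              return mask[id / 8] & (0x80 >> (id % 8));
+      }
+
+      /* APQN (apid, apqi) is valid iff both the APM and AQM bits are set. */
+      static bool apqn_valid(const uint8_t apm[32], const uint8_t aqm[32],
+                             unsigned int apid, unsigned int apqi)
+      {
+              return mask_test_bit(apm, apid) && mask_test_bit(aqm, apqi);
+      }
+
+With adapters 1 and 2 and usage domains 5 and 6 assigned as in Example 1,
+apqn_valid() returns true exactly for the APQNs (1,5), (1,6), (2,5) and (2,6).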
+
+The Design:
+===========
+The design introduces three new objects:
+
+1. AP matrix device
+2. VFIO AP device driver (vfio_ap.ko)
+3. VFIO AP mediated matrix pass-through device
+
+The VFIO AP device driver
+-------------------------
+The VFIO AP (vfio_ap) device driver serves the following purposes:
+
+1. Provides the interfaces to secure APQNs for exclusive use of KVM guests.
+
+2. Sets up the VFIO mediated device interfaces to manage a mediated matrix
+   device and creates the sysfs interfaces for assigning adapters, usage
+   domains, and control domains comprising the matrix for a KVM guest.
+
+3. Configures the APM, AQM and ADM in the CRYCB referenced by a KVM guest's
+   SIE state description to grant the guest access to a matrix of AP devices.
+
+Reserve APQNs for exclusive use of KVM guests
+---------------------------------------------
+The following block diagram illustrates the mechanism by which APQNs are
+reserved:
+
+                              +------------------+
+               7 remove       |                  |
+         +--------------------> cex4queue driver |
+         |                    |                  |
+         |                    +------------------+
+         |
+         |
+         |                    +------------------+          +-----------------+
+         |  5 register driver |                  | 3 create |                 |
+         |   +---------------->   Device core    +---------->  matrix device  |
+         |   |                |                  |          |                 |
+         |   |                +--------^---------+          +-----------------+
+         |   |                         |
+         |   |                         +-------------------+
+         |   | +-----------------------------------+       |
+         |   | |      4 register AP driver         |       | 2 register device
+         |   | |                                   |       |
++--------+---+-v---+                      +--------+-------+-+
+|                  |                      |                  |
+|      ap_bus      +--------------------- >  vfio_ap driver  |
+|                  |       8 probe        |                  |
++--------^---------+                      +--^--^------------+
+6 edit   |                                   |  |
+  apmask |     +-----------------------------+  | 9 mdev create
+  aqmask |     |           1 modprobe           |
++--------+-----+---+           +----------------+-+         +------------------+
+|                  |           |                  |8 create |     mediated     |
+|      admin       |           | VFIO device core |--------->     matrix       |
+|                  +           |                  |         |     device       |
++------+-+---------+           +--------^---------+         +--------^---------+
+       | |                              |                            |
+       | | 9 create vfio_ap-passthrough |                            |
+       | +------------------------------+                            |
+       +-------------------------------------------------------------+
+                   10  assign adapter/domain/control domain
+
+The process for reserving an AP queue for use by a KVM guest is:
+
+1. The administrator loads the vfio_ap device driver
+2. The vfio-ap driver during its initialization will register a single 'matrix'
+   device with the device core. This will serve as the parent device for
+   all mediated matrix devices used to configure an AP matrix for a guest.
+3. The /sys/devices/vfio_ap/matrix device is created by the device core
+4. The vfio_ap device driver will register with the AP bus for AP queue devices
+   of type 10 and higher (CEX4 and newer), providing the vfio_ap driver's probe
+   and remove callback interfaces. Devices older than CEX4 are not supported;
+   this keeps the implementation simple, since those devices will go out of
+   service in the relatively near future and there are few older systems left
+   on which to test them.
+5. The AP bus registers the vfio_ap device driver with the device core
+6. The administrator edits the AP adapter and queue masks to reserve AP queues
+   for use by the vfio_ap device driver.
+7. The AP bus removes the AP queues reserved for the vfio_ap driver from the
+   default zcrypt cex4queue driver.
+8. The AP bus probes the vfio_ap device driver to bind the queues reserved for
+   it.
+9. The administrator creates a passthrough type mediated matrix device to be
+   used by a guest
+10. The administrator assigns the adapters, usage domains and control domains
+   to be exclusively used by a guest.
+
+Set up the VFIO mediated device interfaces
+------------------------------------------
+The VFIO AP device driver utilizes the common interface of the VFIO mediated
+device core driver to:
+* Register an AP mediated bus driver to add a mediated matrix device to and
+  remove it from a VFIO group.
+* Create and destroy a mediated matrix device
+* Add a mediated matrix device to and remove it from the AP mediated bus driver
+* Add a mediated matrix device to and remove it from an IOMMU group
+
+The following high-level block diagram shows the main components and interfaces
+of the VFIO AP mediated matrix device driver:
+
+ +-------------+
+ |             |
+ | +---------+ | mdev_register_driver() +--------------+
+ | |  Mdev   | +<-----------------------+              |
+ | |  bus    | |                        | vfio_mdev.ko |
+ | | driver  | +----------------------->+              |<-> VFIO user
+ | +---------+ |    probe()/remove()    +--------------+    APIs
+ |             |
+ |  MDEV CORE  |
+ |   MODULE    |
+ |   mdev.ko   |
+ | +---------+ | mdev_register_device() +--------------+
+ | |Physical | +<-----------------------+              |
+ | | device  | |                        |  vfio_ap.ko  |<-> matrix
+ | |interface| +----------------------->+              |    device
+ | +---------+ |       callback         +--------------+
+ +-------------+
+
+During initialization of the vfio_ap module, the matrix device is registered
+with an 'mdev_parent_ops' structure that provides the sysfs attribute
+structures, mdev functions and callback interfaces for managing the mediated
+matrix device.
+
+* sysfs attribute structures:
+  * supported_type_groups
+    The VFIO mediated device framework supports creation of user-defined
+    mediated device types. These mediated device types are specified
+    via the 'supported_type_groups' structure when a device is registered
+    with the mediated device framework. The registration process creates the
+    sysfs structures for each mediated device type specified in the
+    'mdev_supported_types' sub-directory of the device being registered. Along
+    with the device type, the sysfs attributes of the mediated device type are
+    provided.
+
+    The VFIO AP device driver will register one mediated device type for
+    passthrough devices:
+      /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough
+    Only the read-only attributes required by the VFIO mdev framework will
+    be provided:
+        ... name
+        ... device_api
+        ... available_instances
+        Where:
+        * name: specifies the name of the mediated device type
+        * device_api: specifies the mediated device type's API (the VFIO
+                      device API)
+        * available_instances: the number of mediated matrix passthrough devices
+                               that can be created
+  * mdev_attr_groups
+    This attribute group identifies the user-defined sysfs attributes of the
+    mediated device. When a device is registered with the VFIO mediated device
+    framework, the sysfs attribute files identified in the 'mdev_attr_groups'
+    structure will be created in the mediated matrix device's directory. The
+    sysfs attributes for a mediated matrix device are:
+    * assign_adapter:
+    * unassign_adapter:
+      Write-only attributes for assigning/unassigning an AP adapter to/from the
+      mediated matrix device. To assign/unassign an adapter, the APID of the
+      adapter is echoed to the respective attribute file.
+    * assign_domain:
+    * unassign_domain:
+      Write-only attributes for assigning/unassigning an AP usage domain to/from
+      the mediated matrix device. To assign/unassign a domain, the domain
+      number of the usage domain is echoed to the respective attribute
+      file.
+    * matrix:
+      A read-only file for displaying the APQNs derived from the cross product
+      of the adapter and domain numbers assigned to the mediated matrix device.
+    * assign_control_domain:
+    * unassign_control_domain:
+      Write-only attributes for assigning/unassigning an AP control domain
+      to/from the mediated matrix device. To assign/unassign a control domain,
+      the ID of the domain to be assigned/unassigned is echoed to the respective
+      attribute file.
+    * control_domains:
+      A read-only file for displaying the control domain numbers assigned to the
+      mediated matrix device.
+
+* functions:
+  * create:
+    allocates the ap_matrix_mdev structure used by the vfio_ap driver to:
+    * Store the reference to the KVM structure for the guest using the mdev
+    * Store the AP matrix configuration for the adapters, domains, and control
+      domains assigned via the corresponding sysfs attributes files
+  * remove:
+    deallocates the mediated matrix device's ap_matrix_mdev structure. This will
+    be allowed only if a running guest is not using the mdev.
+
+* callback interfaces
+  * open:
+    The vfio_ap driver uses this callback to register a
+    VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix
+    device. The open is invoked when QEMU connects the VFIO iommu group
+    for the mdev matrix device to the MDEV bus. Access to the KVM structure used
+    to configure the KVM guest is provided via this callback. The KVM structure
+    is used to configure the guest's access to the AP matrix defined via the
+    mediated matrix device's sysfs attribute files.
+  * release:
+    unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the
+    mdev matrix device and deconfigures the guest's AP matrix.
+
+Configure the APM, AQM and ADM in the CRYCB:
+-------------------------------------------
+Configuring the AP matrix for a KVM guest will be performed when the
+VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier
+function is called when QEMU connects to KVM. The guest's AP matrix is
+configured via its CRYCB by:
+* Setting the bits in the APM corresponding to the APIDs assigned to the
+  mediated matrix device via its 'assign_adapter' interface.
+* Setting the bits in the AQM corresponding to the domains assigned to the
+  mediated matrix device via its 'assign_domain' interface.
+* Setting the bits in the ADM corresponding to the domain IDs assigned to the
+  mediated matrix device via its 'assign_control_domain' interface.
+
+The CPU model features for AP
+-----------------------------
+The AP stack relies on the presence of the AP instructions as well as two
+facilities: The AP Facilities Test (APFT) facility; and the AP Query
+Configuration Information (QCI) facility. These features/facilities are made
+available to a KVM guest via the following CPU model features:
+
+1. ap: Indicates whether the AP instructions are installed on the guest. This
+   feature will be enabled by KVM only if the AP instructions are installed
+   on the host.
+
+2. apft: Indicates the APFT facility is available on the guest. This facility
+   can be made available to the guest only if it is available on the host (i.e.,
+   facility bit 15 is set).
+
+3. apqci: Indicates the AP QCI facility is available on the guest. This facility
+   can be made available to the guest only if it is available on the host (i.e.,
+   facility bit 12 is set).
+
+Note: If the user chooses to specify a CPU model different from the 'host'
+model to QEMU, the CPU model features and facilities need to be turned on
+explicitly; for example:
+
+     /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on
+
+A guest can be precluded from using AP features/facilities by turning them off
+explicitly; for example:
+
+     /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off
+
+Note: If the APFT facility is turned off (apft=off) for the guest, the guest
+will not see any AP devices. The zcrypt device drivers that register for type 10
+and newer AP devices - i.e., the cex4card and cex4queue device drivers - need
+the APFT facility to ascertain the facilities installed on a given AP device. If
+the APFT facility is not installed on the guest, then the probe of device
+drivers will fail since only type 10 and newer devices can be configured for
+guest use.
+
+Example:
+=======
+Let's now provide an example to illustrate how KVM guests may be given
+access to AP facilities. For this example, we will show how to configure
+three guests such that executing the lszcrypt command on the guests would
+look like this:
+
+Guest1
+------
+CARD.DOMAIN TYPE  MODE
+------------------------------
+05          CEX5C CCA-Coproc
+05.0004     CEX5C CCA-Coproc
+05.00ab     CEX5C CCA-Coproc
+06          CEX5A Accelerator
+06.0004     CEX5A Accelerator
+06.00ab     CEX5A Accelerator
+
+Guest2
+------
+CARD.DOMAIN TYPE  MODE
+------------------------------
+05          CEX5A Accelerator
+05.0047     CEX5A Accelerator
+05.00ff     CEX5A Accelerator
+
+Guest3
+------
+CARD.DOMAIN TYPE  MODE
+------------------------------
+06          CEX5A Accelerator
+06.0047     CEX5A Accelerator
+06.00ff     CEX5A Accelerator
+
+These are the steps:
+
+1. Install the vfio_ap module on the linux host. The dependency chain for the
+   vfio_ap module is:
+   * iommu
+   * s390
+   * zcrypt
+   * vfio
+   * vfio_mdev
+   * vfio_mdev_device
+   * KVM
+
+   To build the vfio_ap module, the kernel build must be configured with the
+   following Kconfig elements selected:
+   * IOMMU_SUPPORT
+   * S390
+   * ZCRYPT
+   * S390_AP_IOMMU
+   * VFIO
+   * VFIO_MDEV
+   * VFIO_MDEV_DEVICE
+   * KVM
+
+   If using make menuconfig, select the following to build the vfio_ap module:
+   -> Device Drivers
+      -> IOMMU Hardware Support
+         select S390 AP IOMMU Support
+      -> VFIO Non-Privileged userspace driver framework
+         -> Mediated device driver framework
+            -> VFIO driver for Mediated devices
+   -> I/O subsystem
+      -> VFIO support for AP devices
+
+2. Secure the AP queues to be used by the three guests so that the host cannot
+   access them. To secure them, there are two sysfs files that specify
+   bitmasks marking a subset of the APQN range as 'usable by the default AP
+   queue device drivers' or 'not usable by the default device drivers' (and
+   thus available for use by the vfio_ap device driver). The locations of the
+   sysfs files containing the masks are:
+
+   /sys/bus/ap/apmask
+   /sys/bus/ap/aqmask
+
+   The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs
+   (APID). Each bit in the mask, from left to right (i.e., from most significant
+   to least significant bit in big endian order), corresponds to an APID from
+   0-255. If a bit is set, the APID is marked as usable only by the default AP
+   queue device drivers; otherwise, the APID is usable by the vfio_ap
+   device driver.
+
+   The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes
+   (APQI). Each bit in the mask, from left to right (i.e., from most significant
+   to least significant bit in big endian order), corresponds to an APQI from
+   0-255. If a bit is set, the APQI is marked as usable only by the default AP
+   queue device drivers; otherwise, the APQI is usable by the vfio_ap device
+   driver.
+
+   Take, for example, the following mask:
+
+      0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+
+    It indicates:
+
+      1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6
+      belong to the vfio_ap device driver's pool.
+
+   The APQN of each AP queue device assigned to the linux host is checked by the
+   AP bus against the set of APQNs derived from the cross product of APIDs
+   and APQIs marked as usable only by the default AP queue device drivers. If a
+   match is detected, only the default AP queue device drivers will be probed;
+   otherwise, the vfio_ap device driver will be probed.
+
+   By default, the two masks are set to reserve all APQNs for use by the default
+   AP queue device drivers. There are two ways the default masks can be changed:
+
+   1. The sysfs mask files can be edited by echoing a string into the
+      respective sysfs mask file in one of two formats:
+
+      * An absolute hex string starting with 0x - like "0x12345678" - sets
+        the mask. If the given string is shorter than the mask, it is padded
+        with 0s on the right; for example, specifying a mask value of 0x41 is
+        the same as specifying:
+
+           0x4100000000000000000000000000000000000000000000000000000000000000
+
+        Keep in mind that the mask reads from left to right (i.e., most
+        significant to least significant bit in big endian order), so the mask
+        above identifies device numbers 1 and 7 (01000001).
+
+        If the string is longer than the mask, the operation is terminated with
+        an error (EINVAL).
+
+      * Individual bits in the mask can be switched on and off by specifying
+        each bit number to be switched in a comma separated list. Each bit
+        number string must be prepended with a plus ('+') or minus ('-') to
+        indicate the corresponding bit is to be switched on ('+') or off ('-');
+        see the sketch after this list. Some valid values are:
+
+           "+0"    switches bit 0 on
+           "-13"   switches bit 13 off
+           "+0x41" switches bit 65 on
+           "-0xff" switches bit 255 off
+
+           The following example:
+              +0,-6,+0x47,-0xf0
+
+              Switches bits 0 and 71 (0x47) on
+              Switches bits 6 and 240 (0xf0) off
+
+        Note that the bits not specified in the list remain as they were before
+        the operation.
+
+   2. The masks can also be changed at boot time via parameters on the kernel
+      command line like this:
+
+         ap.apmask=0xffff ap.aqmask=0x40
+
+         This would create the following masks:
+
+            apmask:
+            0xffff000000000000000000000000000000000000000000000000000000000000
+
+            aqmask:
+            0x4000000000000000000000000000000000000000000000000000000000000000
+
+         Resulting in these two pools:
+
+            default drivers pool:    adapter 0-15, domain 1
+            alternate drivers pool:  adapter 16-255, domains 0, 2-255
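+
+   To make the '+<bit>'/'-<bit>' list format described in (1) above concrete,
+   here is a sketch of how such a comma separated list could be applied to a
+   local copy of a 256-bit mask; it is purely illustrative and not the AP bus
+   parsing code:
+
+      #include <stdint.h>
+      #include <stdlib.h>
+      #include <string.h>
+
+      /* Apply a "+<bit>,-<bit>,..." list to a 32-byte mask; 0 on success. */
+      static int apply_bit_list(uint8_t mask[32], const char *list)
+      {
+              char *dup = strdup(list), *save = NULL, *tok;
+
+              if (!dup)
+                      return -1;
+              for (tok = strtok_r(dup, ",", &save); tok;
+                   tok = strtok_r(NULL, ",", &save)) {
+                      unsigned long bit = strtoul(tok + 1, NULL, 0);
+
+                      if ((*tok != '+' && *tok != '-') || bit > 255) {
+                              free(dup);
+                              return -1;      /* malformed token */
+                      }
+                      if (*tok == '+')
+                              mask[bit / 8] |= 0x80 >> (bit % 8);
+                      else
+                              mask[bit / 8] &= ~(0x80 >> (bit % 8));
+              }
+              free(dup);
+              return 0;
+      }
+
+   Applying "+0,-6,+0x47,-0xf0" switches bits 0 and 71 on and bits 6 and 240
+   off, leaving all other bits as they were, matching the example above.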
+
+   Securing the APQNs for our example:
+   ----------------------------------
+   To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047,
+   06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding
+   APQNs can either be removed from the default masks:
+
+      echo -5,-6 > /sys/bus/ap/apmask
+
+      echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask
+
+   Or the masks can be set as follows:
+
+      echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \
+      > apmask
+
+      echo 0xf7fffffffffffffffeffffffffffffffffffffffffeffffffffffffffffffffe \
+      > aqmask
+
+   This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004,
+   06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The
+   sysfs directory for the vfio_ap device driver will now contain symbolic links
+   to the AP queue devices bound to it:
+
+   /sys/bus/ap
+   ... [drivers]
+   ...... [vfio_ap]
+   ......... [05.0004]
+   ......... [05.0047]
+   ......... [05.00ab]
+   ......... [05.00ff]
+   ......... [06.0004]
+   ......... [06.0047]
+   ......... [06.00ab]
+   ......... [06.00ff]
+
+   Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later)
+   can be bound to the vfio_ap device driver. As noted above, older devices are
+   not supported; they will go out of service in the relatively near future,
+   and there are few older systems left on which to test them.
+
+   The administrator, therefore, must take care to secure only AP queues that
+   can be bound to the vfio_ap device driver. The device type for a given AP
+   queue device can be read from the parent card's sysfs directory. For example,
+   to see the hardware type of the queue 05.0004:
+
+   cat /sys/bus/ap/devices/card05/hwtype
+
+   The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the
+   vfio_ap device driver.
+
+3. Create the mediated devices needed to configure the AP matrixes for the
+   three guests and to provide an interface to the vfio_ap driver for
+   use by the guests:
+
+   /sys/devices/vfio_ap/matrix/
+   --- [mdev_supported_types]
+   ------ [vfio_ap-passthrough] (passthrough mediated matrix device type)
+   --------- create
+   --------- [devices]
+
+   To create the mediated devices for the three guests:
+
+       uuidgen > create
+       uuidgen > create
+       uuidgen > create
+
+        or
+
+        echo $uuid1 > create
+        echo $uuid2 > create
+        echo $uuid3 > create
+
+   This will create three mediated devices in the [devices] subdirectory named
+   after the UUID written to the create attribute file. We call them $uuid1,
+   $uuid2 and $uuid3 and this is the sysfs directory structure after creation:
+
+   /sys/devices/vfio_ap/matrix/
+   --- [mdev_supported_types]
+   ------ [vfio_ap-passthrough]
+   --------- [devices]
+   ------------ [$uuid1]
+   --------------- assign_adapter
+   --------------- assign_control_domain
+   --------------- assign_domain
+   --------------- matrix
+   --------------- unassign_adapter
+   --------------- unassign_control_domain
+   --------------- unassign_domain
+
+   ------------ [$uuid2]
+   --------------- assign_adapter
+   --------------- assign_control_domain
+   --------------- assign_domain
+   --------------- matrix
+   --------------- unassign_adapter
+   --------------- unassign_control_domain
+   --------------- unassign_domain
+
+   ------------ [$uuid3]
+   --------------- assign_adapter
+   --------------- assign_control_domain
+   --------------- assign_domain
+   --------------- matrix
+   --------------- unassign_adapter
+   --------------- unassign_control_domain
+   --------------- unassign_domain
+
+4. The administrator now needs to configure the matrixes for the mediated
+   devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3).
+
+   This is how the matrix is configured for Guest1:
+
+      echo 5 > assign_adapter
+      echo 6 > assign_adapter
+      echo 4 > assign_domain
+      echo 0xab > assign_domain
+
+      Control domains can similarly be assigned using the assign_control_domain
+      sysfs file.
+
+      If a mistake is made configuring an adapter, domain or control domain,
+      you can use the unassign_xxx files to unassign the adapter, domain or
+      control domain.
+
+      To display the matrix configuration for Guest1:
+
+         cat matrix
+
+   This is how the matrix is configured for Guest2:
+
+      echo 5 > assign_adapter
+      echo 0x47 > assign_domain
+      echo 0xff > assign_domain
+
+   This is how the matrix is configured for Guest3:
+
+      echo 6 > assign_adapter
+      echo 0x47 > assign_domain
+      echo 0xff > assign_domain
+
+   In order to successfully assign an adapter:
+
+   * The adapter number specified must represent a value from 0 up to the
+     maximum adapter number configured for the system. If an adapter number
+     higher than the maximum is specified, the operation will terminate with
+     an error (ENODEV).
+
+   * All APQNs that can be derived from the adapter ID and the IDs of
+     the previously assigned domains must be bound to the vfio_ap device
+     driver. If no domains have yet been assigned, then there must be at least
+     one APQN with the specified APID bound to the vfio_ap driver. If no such
+     APQNs are bound to the driver, the operation will terminate with an
+     error (EADDRNOTAVAIL).
+
+     No APQN that can be derived from the adapter ID and the IDs of the
+     previously assigned domains can be assigned to another mediated matrix
+     device. If an APQN is assigned to another mediated matrix device, the
+     operation will terminate with an error (EADDRINUSE).
+
+   In order to successfully assign a domain:
+
+   * The domain number specified must represent a value from 0 up to the
+     maximum domain number configured for the system. If a domain number
+     higher than the maximum is specified, the operation will terminate with
+     an error (ENODEV).
+
+   * All APQNs that can be derived from the domain ID and the IDs of
+     the previously assigned adapters must be bound to the vfio_ap device
+     driver. If no adapters have yet been assigned, then there must be at least
+     one APQN with the specified APQI bound to the vfio_ap driver. If no such
+     APQNs are bound to the driver, the operation will terminate with an
+     error (EADDRNOTAVAIL).
+
+     No APQN that can be derived from the domain ID and the IDs of the
+     previously assigned adapters can be assigned to another mediated matrix
+     device. If an APQN is assigned to another mediated matrix device, the
+     operation will terminate with an error (EADDRINUSE).
+
+   In order to successfully assign a control domain, the domain number
+   specified must represent a value from 0 up to the maximum domain number
+   configured for the system. If a control domain number higher than the maximum
+   is specified, the operation will terminate with an error (ENODEV).
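+
+   The assignment errors above surface to userspace as errno values from the
+   write to the corresponding sysfs attribute. A minimal sketch (the helper
+   name is illustrative; the path is the mediated device created in step 3):
+
+      #include <errno.h>
+      #include <fcntl.h>
+      #include <stdio.h>
+      #include <string.h>
+      #include <unistd.h>
+
+      /* Echo an ID into one of the assign_* attributes of a mediated device. */
+      static int assign_attr(const char *mdev_dir, const char *attr,
+                             const char *id)
+      {
+              char path[256];
+              int fd, ret = 0;
+
+              snprintf(path, sizeof(path), "%s/%s", mdev_dir, attr);
+              fd = open(path, O_WRONLY);
+              if (fd < 0)
+                      return -errno;
+              if (write(fd, id, strlen(id)) < 0)
+                      ret = -errno;   /* e.g. -ENODEV, -EADDRNOTAVAIL, -EADDRINUSE */
+              close(fd);
+              return ret;
+      }
+
+   For example, assign_attr("/sys/devices/vfio_ap/matrix/$uuid1",
+   "assign_adapter", "5\n") corresponds to the "echo 5 > assign_adapter" step
+   for Guest1 above.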
+
+5. Start Guest1:
+
+   /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+      -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ...
+
+6. Start Guest2:
+
+   /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+      -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ...
+
+7. Start Guest3:
+
+   /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+      -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ...
+
+When the guest is shut down, the mediated matrix devices may be removed.
+
+Using our example again, to remove the mediated matrix device $uuid1:
+
+   /sys/devices/vfio_ap/matrix/
+      --- [mdev_supported_types]
+      ------ [vfio_ap-passthrough]
+      --------- [devices]
+      ------------ [$uuid1]
+      --------------- remove
+
+
+   echo 1 > remove
+
+   This will remove all of the mdev matrix device's sysfs structures including
+   the mdev device itself. To recreate and reconfigure the mdev matrix device,
+   all of the steps starting with step 3 will have to be performed again. Note
+   that the remove will fail if a guest using the mdev is still running.
+
+   It is not necessary to remove an mdev matrix device, but one may want to
+   remove it if no guest will use it during the remaining lifetime of the linux
+   host. If the mdev matrix device is removed, one may want to also reconfigure
+   the pool of adapters and queues reserved for use by the default drivers.
+
+Limitations
+===========
+* The KVM/kernel interfaces do not provide a way to prevent an APQN from being
+  restored to the default drivers' pool while the queue is still assigned to a
+  mediated device in use by a guest. It is incumbent upon the administrator to
+  ensure that no mediated device in use by a guest has the APQN assigned to it,
+  lest the host be given access to the private data of the AP queue device,
+  such as a private key configured specifically for the guest.
+
+* Dynamically modifying the AP matrix for a running guest (which would amount to
+  hot(un)plug of AP devices for the guest) is currently not supported.
+
+* Live guest migration is not supported for guests using AP devices.
index 647f941..cd209f7 100644 (file)
@@ -123,6 +123,37 @@ memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the
 flag KVM_VM_MIPS_VZ.
 
 
+On arm64, the physical address size for a VM (IPA Size limit) is limited
+to 40 bits by default. The limit can be configured if the host supports the
+extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use
+KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type
+identifier, where IPA_Bits is the maximum width of any physical
+address used by the VM. IPA_Bits is encoded in bits[7-0] of the
+machine type identifier.
+
+e.g., to configure a guest to use a 48-bit physical address size:
+
+    vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48));
+
+The requested size (IPA_Bits) must be:
+  0 - Implies the default size, 40 bits (for backward compatibility)
+
+  or
+
+  N - Implies N bits, where N is a positive integer such that,
+      32 <= N <= Host_IPA_Limit
+
+Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and
+is dependent on the CPU capability and the kernel configuration. The limit can
+be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION
+ioctl() at run-time.
+
+Please note that configuring the IPA size does not affect the capability
+exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects
+the size of the address translated at stage 2 (guest physical to
+host physical address translations).
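+
+Putting the two together, a VMM might pick the IPA size roughly as follows
+(a sketch only; error handling is omitted):
+
+    #include <fcntl.h>
+    #include <sys/ioctl.h>
+    #include <linux/kvm.h>
+
+    int create_vm_with_ipa(int ipa_bits)
+    {
+        int dev_fd = open("/dev/kvm", O_RDWR);
+        int limit = ioctl(dev_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_VM_IPA_SIZE);
+
+        if (limit <= 0 || ipa_bits > limit)
+            ipa_bits = 0;       /* fall back to the default 40 bits */
+
+        return ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(ipa_bits));
+    }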
+
+
 4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST
 
 Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST
@@ -850,7 +881,7 @@ struct kvm_vcpu_events {
                __u8 injected;
                __u8 nr;
                __u8 has_error_code;
-               __u8 pad;
+               __u8 pending;
                __u32 error_code;
        } exception;
        struct {
@@ -873,15 +904,23 @@ struct kvm_vcpu_events {
                __u8 smm_inside_nmi;
                __u8 latched_init;
        } smi;
+       __u8 reserved[27];
+       __u8 exception_has_payload;
+       __u64 exception_payload;
 };
 
-Only two fields are defined in the flags field:
+The following bits are defined in the flags field:
 
-- KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
+- KVM_VCPUEVENT_VALID_SHADOW may be set to signal that
   interrupt.shadow contains a valid state.
 
-- KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that
-  smi contains a valid state.
+- KVM_VCPUEVENT_VALID_SMM may be set to signal that smi contains a
+  valid state.
+
+- KVM_VCPUEVENT_VALID_PAYLOAD may be set to signal that the
+  exception_has_payload, exception_payload, and exception.pending
+  fields contain a valid state. This bit will be set whenever
+  KVM_CAP_EXCEPTION_PAYLOAD is enabled.
 
 ARM/ARM64:
 
@@ -961,6 +1000,11 @@ shall be written into the VCPU.
 
 KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available.
 
+If KVM_CAP_EXCEPTION_PAYLOAD is enabled, KVM_VCPUEVENT_VALID_PAYLOAD
+can be set in the flags field to signal that the
+exception_has_payload, exception_payload, and exception.pending fields
+contain a valid state and shall be written into the VCPU.
+
 ARM/ARM64:
 
 Set the pending SError exception state for this VCPU. It is not possible to
@@ -1922,6 +1966,7 @@ registers, find a list below:
   PPC   | KVM_REG_PPC_TIDR              | 64
   PPC   | KVM_REG_PPC_PSSCR             | 64
   PPC   | KVM_REG_PPC_DEC_EXPIRY        | 64
+  PPC   | KVM_REG_PPC_PTCR              | 64
   PPC   | KVM_REG_PPC_TM_GPR0           | 64
           ...
   PPC   | KVM_REG_PPC_TM_GPR31          | 64
@@ -2269,6 +2314,10 @@ The supported flags are:
         The emulated MMU supports 1T segments in addition to the
         standard 256M ones.
 
+    - KVM_PPC_NO_HASH
+       This flag indicates that HPT guests are not supported by KVM,
+       thus all guests must use radix MMU mode.
+
 The "slb_size" field indicates how many SLB entries are supported
 
 The "sps" array contains 8 entries indicating the supported base
@@ -3676,6 +3725,34 @@ Returns: 0 on success, -1 on error
 This copies the vcpu's kvm_nested_state struct from userspace to the kernel.  For
 the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE.
 
+4.116 KVM_(UN)REGISTER_COALESCED_MMIO
+
+Capability: KVM_CAP_COALESCED_MMIO (for coalesced mmio)
+           KVM_CAP_COALESCED_PIO (for coalesced pio)
+Architectures: all
+Type: vm ioctl
+Parameters: struct kvm_coalesced_mmio_zone
+Returns: 0 on success, < 0 on error
+
+Coalesced I/O is a performance optimization that defers hardware
+register write emulation so that userspace exits are avoided.  It is
+typically used to reduce the overhead of emulating frequently accessed
+hardware registers.
+
+When a hardware register is configured for coalesced I/O, write accesses
+do not exit to userspace and their value is recorded in a ring buffer
+that is shared between kernel and userspace.
+
+Coalesced I/O is used if one or more write accesses to a hardware
+register can be deferred until a read or a write to another hardware
+register on the same device.  This last access will cause a vmexit and
+userspace will process accesses from the ring buffer before emulating
+it. That will avoid exiting to userspace on repeated writes.
+
+Coalesced pio is based on coalesced mmio. There is little difference
+between coalesced mmio and pio except that coalesced pio records accesses
+to I/O ports.
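+
+As an illustration, a coalesced zone covering an (arbitrarily chosen) I/O port
+could be registered as sketched below; this assumes the 'pio' field added to
+struct kvm_coalesced_mmio_zone by this series:
+
+    #include <string.h>
+    #include <sys/ioctl.h>
+    #include <linux/kvm.h>
+
+    int coalesce_port(int vm_fd, unsigned short port, unsigned int len)
+    {
+        struct kvm_coalesced_mmio_zone zone;
+
+        memset(&zone, 0, sizeof(zone));
+        zone.addr = port;  /* port number for pio, guest physical address for mmio */
+        zone.size = len;
+        zone.pio  = 1;     /* 0 registers a coalesced MMIO zone instead */
+
+        return ioctl(vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);
+    }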
+
 5. The kvm_run structure
 ------------------------
 
@@ -4522,7 +4599,7 @@ hpage module parameter is not set to 1, -EINVAL is returned.
 While it is generally possible to create a huge page backed VM without
 this capability, the VM will not be able to run.
 
-7.14 KVM_CAP_MSR_PLATFORM_INFO
+7.15 KVM_CAP_MSR_PLATFORM_INFO
 
 Architectures: x86
 Parameters: args[0] whether feature should be enabled or not
@@ -4531,6 +4608,45 @@ With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise,
 a #GP would be raised when the guest tries to access. Currently, this
 capability does not enable write permissions of this MSR for the guest.
 
+7.16 KVM_CAP_PPC_NESTED_HV
+
+Architectures: ppc
+Parameters: none
+Returns: 0 on success, -EINVAL when the implementation doesn't support
+        nested-HV virtualization.
+
+HV-KVM on POWER9 and later systems allows for "nested-HV"
+virtualization, which provides a way for a guest VM to run guests that
+can run using the CPU's supervisor mode (privileged non-hypervisor
+state).  Enabling this capability on a VM depends on the CPU having
+the necessary functionality and on the facility being enabled with a
+kvm-hv module parameter.
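+
+For example, a VMM could check for and enable the capability on a VM roughly
+as follows (a sketch; error handling is minimal):
+
+    #include <string.h>
+    #include <sys/ioctl.h>
+    #include <linux/kvm.h>
+
+    int enable_nested_hv(int vm_fd)
+    {
+        struct kvm_enable_cap cap;
+
+        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_NESTED_HV) <= 0)
+            return -1;  /* kernel, hardware or module parameter disallow it */
+
+        memset(&cap, 0, sizeof(cap));
+        cap.cap = KVM_CAP_PPC_NESTED_HV;
+        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
+    }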
+
+7.17 KVM_CAP_EXCEPTION_PAYLOAD
+
+Architectures: x86
+Parameters: args[0] whether feature should be enabled or not
+
+With this capability enabled, CR2 will not be modified prior to the
+emulated VM-exit when L1 intercepts a #PF exception that occurs in
+L2. Similarly, for kvm-intel only, DR6 will not be modified prior to
+the emulated VM-exit when L1 intercepts a #DB exception that occurs in
+L2. As a result, when KVM_GET_VCPU_EVENTS reports a pending #PF (or
+#DB) exception for L2, exception.has_payload will be set and the
+faulting address (or the new DR6 bits*) will be reported in the
+exception_payload field. Similarly, when userspace injects a #PF (or
+#DB) into L2 using KVM_SET_VCPU_EVENTS, it is expected to set
+exception.has_payload and to put the faulting address (or the new DR6
+bits*) in the exception_payload field.
+
+This capability also enables exception.pending in struct
+kvm_vcpu_events, which allows userspace to distinguish between pending
+and injected exceptions.
+
+
+* For the new DR6 bits, note that bit 16 is set iff the #DB exception
+  will clear DR6.RTM.
+
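+A sketch of how userspace might enable the capability and then retrieve the
+payload of a pending #PF (the function and variable names are illustrative):
+
+    #include <stdio.h>
+    #include <string.h>
+    #include <sys/ioctl.h>
+    #include <linux/kvm.h>
+
+    int dump_pending_pf(int vm_fd, int vcpu_fd)
+    {
+        struct kvm_enable_cap cap;
+        struct kvm_vcpu_events events;
+
+        memset(&cap, 0, sizeof(cap));
+        cap.cap = KVM_CAP_EXCEPTION_PAYLOAD;
+        cap.args[0] = 1;
+        if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
+            return -1;
+
+        if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
+            return -1;
+        if ((events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) &&
+            events.exception.pending && events.exception.nr == 14 /* #PF */)
+            printf("pending #PF, faulting address 0x%llx\n",
+                   (unsigned long long)events.exception_payload);
+        return 0;
+    }
+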
 8. Other capabilities.
 ----------------------
 
@@ -4772,3 +4888,10 @@ CPU when the exception is taken. If this virtual SError is taken to EL1 using
 AArch64, this value will be reported in the ISS field of ESR_ELx.
 
 See KVM_CAP_VCPU_EVENTS for more details.
+8.20 KVM_CAP_HYPERV_SEND_IPI
+
+Architectures: x86
+
+This capability indicates that KVM supports paravirtualized Hyper-V IPI send
+hypercalls:
+HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
index 8f22f6a..bd702ad 100644 (file)
@@ -12800,6 +12800,18 @@ W:     http://www.ibm.com/developerworks/linux/linux390/
 S:     Supported
 F:     drivers/s390/crypto/
 
+S390 VFIO AP DRIVER
+M:     Tony Krowiak <akrowiak@linux.ibm.com>
+M:     Pierre Morel <pmorel@linux.ibm.com>
+M:     Halil Pasic <pasic@linux.ibm.com>
+L:     linux-s390@vger.kernel.org
+W:     http://www.ibm.com/developerworks/linux/linux390/
+S:     Supported
+F:     drivers/s390/crypto/vfio_ap_drv.c
+F:     drivers/s390/crypto/vfio_ap_private.h
+F:     drivers/s390/crypto/vfio_ap_ops.c
+F:     Documentation/s390/vfio-ap.txt
+
 S390 ZFCP DRIVER
 M:     Steffen Maier <maier@linux.ibm.com>
 M:     Benjamin Block <bblock@linux.ibm.com>
index 2d43dca..b95f8d0 100644 (file)
  * space.
  */
 #define KVM_PHYS_SHIFT (40)
-#define KVM_PHYS_SIZE  (_AC(1, ULL) << KVM_PHYS_SHIFT)
-#define KVM_PHYS_MASK  (KVM_PHYS_SIZE - _AC(1, ULL))
+
 #define PTRS_PER_S2_PGD        (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30))
 
 /* Virtualization Translation Control Register (VTCR) bits */
index 3ad482d..5ca5d9a 100644 (file)
@@ -273,7 +273,7 @@ static inline void __cpu_init_stage2(void)
        kvm_call_hyp(__init_stage2_translation);
 }
 
-static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
+static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 {
        return 0;
 }
@@ -354,4 +354,15 @@ static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {}
 struct kvm *kvm_arch_alloc_vm(void);
 void kvm_arch_free_vm(struct kvm *kvm);
 
+static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
+{
+       /*
+        * On 32bit ARM, VMs get a static 40bit IPA stage2 setup,
+        * so any non-zero value used as type is illegal.
+        */
+       if (type)
+               return -EINVAL;
+       return 0;
+}
+
 #endif /* __ARM_KVM_HOST_H__ */
index 847f01f..1098ffc 100644 (file)
                addr;                                                   \
        })
 
-/*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
- */
-#define KVM_MMU_CACHE_MIN_PAGES        2
-
 #ifndef __ASSEMBLY__
 
 #include <linux/highmem.h>
 #include <asm/cacheflush.h>
 #include <asm/cputype.h>
+#include <asm/kvm_arm.h>
 #include <asm/kvm_hyp.h>
 #include <asm/pgalloc.h>
 #include <asm/stage2_pgtable.h>
 /* Ensure compatibility with arm64 */
 #define VA_BITS                        32
 
+#define kvm_phys_shift(kvm)            KVM_PHYS_SHIFT
+#define kvm_phys_size(kvm)             (1ULL << kvm_phys_shift(kvm))
+#define kvm_phys_mask(kvm)             (kvm_phys_size(kvm) - 1ULL)
+#define kvm_vttbr_baddr_mask(kvm)      VTTBR_BADDR_MASK
+
+#define stage2_pgd_size(kvm)           (PTRS_PER_S2_PGD * sizeof(pgd_t))
+
 int create_hyp_mappings(void *from, void *to, pgprot_t prot);
 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
                           void __iomem **kaddr,
@@ -355,6 +358,8 @@ static inline int hyp_map_aux_data(void)
 
 #define kvm_phys_to_vttbr(addr)                (addr)
 
+static inline void kvm_set_ipa_limit(void) {}
+
 static inline bool kvm_cpu_has_cnp(void)
 {
        return false;
index 460d616..f6a7ea8 100644 (file)
 #ifndef __ARM_S2_PGTABLE_H_
 #define __ARM_S2_PGTABLE_H_
 
-#define stage2_pgd_none(pgd)                   pgd_none(pgd)
-#define stage2_pgd_clear(pgd)                  pgd_clear(pgd)
-#define stage2_pgd_present(pgd)                        pgd_present(pgd)
-#define stage2_pgd_populate(pgd, pud)          pgd_populate(NULL, pgd, pud)
-#define stage2_pud_offset(pgd, address)                pud_offset(pgd, address)
-#define stage2_pud_free(pud)                   pud_free(NULL, pud)
-
-#define stage2_pud_none(pud)                   pud_none(pud)
-#define stage2_pud_clear(pud)                  pud_clear(pud)
-#define stage2_pud_present(pud)                        pud_present(pud)
-#define stage2_pud_populate(pud, pmd)          pud_populate(NULL, pud, pmd)
-#define stage2_pmd_offset(pud, address)                pmd_offset(pud, address)
-#define stage2_pmd_free(pmd)                   pmd_free(NULL, pmd)
-
-#define stage2_pud_huge(pud)                   pud_huge(pud)
+/*
+ * kvm_mmu_cache_min_pages() is the number of pages required
+ * to install a stage-2 translation. We pre-allocate the entry
+ * level table at VM creation. Since we have a 3 level page-table,
+ * we need only two pages to add a new mapping.
+ */
+#define kvm_mmu_cache_min_pages(kvm)   2
+
+#define stage2_pgd_none(kvm, pgd)              pgd_none(pgd)
+#define stage2_pgd_clear(kvm, pgd)             pgd_clear(pgd)
+#define stage2_pgd_present(kvm, pgd)           pgd_present(pgd)
+#define stage2_pgd_populate(kvm, pgd, pud)     pgd_populate(NULL, pgd, pud)
+#define stage2_pud_offset(kvm, pgd, address)   pud_offset(pgd, address)
+#define stage2_pud_free(kvm, pud)              pud_free(NULL, pud)
+
+#define stage2_pud_none(kvm, pud)              pud_none(pud)
+#define stage2_pud_clear(kvm, pud)             pud_clear(pud)
+#define stage2_pud_present(kvm, pud)           pud_present(pud)
+#define stage2_pud_populate(kvm, pud, pmd)     pud_populate(NULL, pud, pmd)
+#define stage2_pmd_offset(kvm, pud, address)   pmd_offset(pud, address)
+#define stage2_pmd_free(kvm, pmd)              pmd_free(NULL, pmd)
+
+#define stage2_pud_huge(kvm, pud)              pud_huge(pud)
 
 /* Open coded p*d_addr_end that can deal with 64bit addresses */
-static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 {
        phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK;
 
        return (boundary - 1 < end - 1) ? boundary : end;
 }
 
-#define stage2_pud_addr_end(addr, end)         (end)
+#define stage2_pud_addr_end(kvm, addr, end)    (end)
 
-static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 {
        phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK;
 
        return (boundary - 1 < end - 1) ? boundary : end;
 }
 
-#define stage2_pgd_index(addr)                         pgd_index(addr)
+#define stage2_pgd_index(kvm, addr)            pgd_index(addr)
 
-#define stage2_pte_table_empty(ptep)                   kvm_page_empty(ptep)
-#define stage2_pmd_table_empty(pmdp)                   kvm_page_empty(pmdp)
-#define stage2_pud_table_empty(pudp)                   false
+#define stage2_pte_table_empty(kvm, ptep)      kvm_page_empty(ptep)
+#define stage2_pmd_table_empty(kvm, pmdp)      kvm_page_empty(pmdp)
+#define stage2_pud_table_empty(kvm, pudp)      false
 
 #endif /* __ARM_S2_PGTABLE_H_ */
index 6db48d9..7e2ec64 100644 (file)
@@ -537,6 +537,27 @@ static inline void arm64_set_ssbd_mitigation(bool state) {}
 #endif
 
 extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
+
+static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
+{
+       switch (parange) {
+       case 0: return 32;
+       case 1: return 36;
+       case 2: return 40;
+       case 3: return 42;
+       case 4: return 44;
+       case 5: return 48;
+       case 6: return 52;
+       /*
+        * A future PE could use a value unknown to the kernel.
+        * However, by the "D10.1.4 Principles of the ID scheme
+        * for fields in ID registers", ARM DDI 0487C.a, any new
+        * value is guaranteed to be higher than what we know already.
+        * As a safe limit, we return the limit supported by the kernel.
+        */
+       default: return CONFIG_ARM64_PA_BITS;
+       }
+}
 #endif /* __ASSEMBLY__ */
 
 #endif
index b476bc4..6f602af 100644 (file)
 #define VTCR_EL2_RES1          (1 << 31)
 #define VTCR_EL2_HD            (1 << 22)
 #define VTCR_EL2_HA            (1 << 21)
+#define VTCR_EL2_PS_SHIFT      TCR_EL2_PS_SHIFT
 #define VTCR_EL2_PS_MASK       TCR_EL2_PS_MASK
 #define VTCR_EL2_TG0_MASK      TCR_TG0_MASK
 #define VTCR_EL2_TG0_4K                TCR_TG0_4K
 #define VTCR_EL2_IRGN0_WBWA    TCR_IRGN0_WBWA
 #define VTCR_EL2_SL0_SHIFT     6
 #define VTCR_EL2_SL0_MASK      (3 << VTCR_EL2_SL0_SHIFT)
-#define VTCR_EL2_SL0_LVL1      (1 << VTCR_EL2_SL0_SHIFT)
 #define VTCR_EL2_T0SZ_MASK     0x3f
-#define VTCR_EL2_T0SZ_40B      24
 #define VTCR_EL2_VS_SHIFT      19
 #define VTCR_EL2_VS_8BIT       (0 << VTCR_EL2_VS_SHIFT)
 #define VTCR_EL2_VS_16BIT      (1 << VTCR_EL2_VS_SHIFT)
 
+#define VTCR_EL2_T0SZ(x)       TCR_T0SZ(x)
+
 /*
  * We configure the Stage-2 page tables to always restrict the IPA space to be
  * 40 bits wide (T0SZ = 24).  Systems with a PARange smaller than 40 bits are
  * not known to exist and will break with this configuration.
  *
- * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time
- * (see hyp-init.S).
+ * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2().
  *
  * Note that when using 4K pages, we concatenate two first level page tables
  * together. With 16K pages, we concatenate 16 first level page tables.
  *
- * The magic numbers used for VTTBR_X in this patch can be found in Tables
- * D4-23 and D4-25 in ARM DDI 0487A.b.
  */
 
-#define VTCR_EL2_T0SZ_IPA      VTCR_EL2_T0SZ_40B
 #define VTCR_EL2_COMMON_BITS   (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
                                 VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1)
 
-#ifdef CONFIG_ARM64_64K_PAGES
 /*
- * Stage2 translation configuration:
- * 64kB pages (TG0 = 1)
- * 2 level page tables (SL = 1)
+ * VTCR_EL2:SL0 indicates the entry level for Stage2 translation.
+ * Interestingly, it depends on the page size.
+ * See D.10.2.121, VTCR_EL2, in ARM DDI 0487C.a
+ *
+ *     -----------------------------------------
+ *     | Entry level           |  4K  | 16K/64K |
+ *     ------------------------------------------
+ *     | Level: 0              |  2   |   -     |
+ *     ------------------------------------------
+ *     | Level: 1              |  1   |   2     |
+ *     ------------------------------------------
+ *     | Level: 2              |  0   |   1     |
+ *     ------------------------------------------
+ *     | Level: 3              |  -   |   0     |
+ *     ------------------------------------------
+ *
+ * The table roughly translates to :
+ *
+ *     SL0(PAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level
+ *
+ * Where TGRAN_SL0_BASE is a magic number depending on the page size:
+ *     TGRAN_SL0_BASE(4K) = 2
+ *     TGRAN_SL0_BASE(16K) = 3
+ *     TGRAN_SL0_BASE(64K) = 3
+ * provided we take care of ruling out the unsupported cases and
+ * Entry_Level = 4 - Number_of_levels.
+ *
  */
-#define VTCR_EL2_TGRAN_FLAGS           (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC            38
+#ifdef CONFIG_ARM64_64K_PAGES
+
+#define VTCR_EL2_TGRAN                 VTCR_EL2_TG0_64K
+#define VTCR_EL2_TGRAN_SL0_BASE                3UL
+
 #elif defined(CONFIG_ARM64_16K_PAGES)
-/*
- * Stage2 translation configuration:
- * 16kB pages (TG0 = 2)
- * 2 level page tables (SL = 1)
- */
-#define VTCR_EL2_TGRAN_FLAGS           (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC            42
+
+#define VTCR_EL2_TGRAN                 VTCR_EL2_TG0_16K
+#define VTCR_EL2_TGRAN_SL0_BASE                3UL
+
 #else  /* 4K */
-/*
- * Stage2 translation configuration:
- * 4kB pages (TG0 = 0)
- * 3 level page tables (SL = 1)
- */
-#define VTCR_EL2_TGRAN_FLAGS           (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC            37
+
+#define VTCR_EL2_TGRAN                 VTCR_EL2_TG0_4K
+#define VTCR_EL2_TGRAN_SL0_BASE                2UL
+
 #endif
 
-#define VTCR_EL2_FLAGS                 (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS)
-#define VTTBR_X                                (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA)
+#define VTCR_EL2_LVLS_TO_SL0(levels)   \
+       ((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT)
+#define VTCR_EL2_SL0_TO_LVLS(sl0)      \
+       ((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE)
+#define VTCR_EL2_LVLS(vtcr)            \
+       VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT)
+
+#define VTCR_EL2_FLAGS                 (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN)
+#define VTCR_EL2_IPA(vtcr)             (64 - ((vtcr) & VTCR_EL2_T0SZ_MASK))
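
Editorial worked example (not part of the diff): with 4K pages, TGRAN_SL0_BASE is 2, so a 3-level stage2 table encodes as SL0 = 2 - (4 - 3) = 1, and VTCR_EL2_SL0_TO_LVLS(1) = 1 + 4 - 2 = 3 recovers the level count. A standalone sketch using copies of the macros above, with the 4K value hard-wired as an assumption:

#include <stdio.h>

/* Copies of the macros above, fixed for 4K pages */
#define VTCR_EL2_SL0_SHIFT             6
#define VTCR_EL2_SL0_MASK              (3UL << VTCR_EL2_SL0_SHIFT)
#define VTCR_EL2_TGRAN_SL0_BASE        2UL
#define VTCR_EL2_LVLS_TO_SL0(levels) \
        ((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT)
#define VTCR_EL2_SL0_TO_LVLS(sl0)      ((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE)
#define VTCR_EL2_LVLS(vtcr) \
        VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT)

int main(void)
{
        unsigned long vtcr = VTCR_EL2_LVLS_TO_SL0(3);           /* SL0 field == 1 */

        printf("levels decoded back: %lu\n", VTCR_EL2_LVLS(vtcr));      /* 3 */
        return 0;
}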
+
+/*
+ * ARM VMSAv8-64 defines an algorithm for finding the translation table
+ * descriptors in section D4.2.8 in ARM DDI 0487C.a.
+ *
+ * The algorithm defines the expectations on the translation table
+ * addresses for each level, based on PAGE_SIZE, entry level
+ * and the translation table size (T0SZ). The variable "x" in the
+ * algorithm determines the alignment of a table base address at a given
+ * level and thus determines the alignment of VTTBR:BADDR for stage2
+ * page table entry level.
+ * Since the number of bits resolved at the entry level could vary
+ * depending on the T0SZ, the value of "x" is defined based on a
+ * Magic constant for a given PAGE_SIZE and Entry Level. The
+ * intermediate levels must always be aligned to the PAGE_SIZE (i.e.,
+ * x = PAGE_SHIFT).
+ *
+ * The value of "x" for entry level is calculated as :
+ *    x = Magic_N - T0SZ
+ *
+ * where Magic_N is an integer depending on the page size and the entry
+ * level of the page table as below:
+ *
+ *     --------------------------------------------
+ *     | Entry level           |  4K    16K   64K |
+ *     --------------------------------------------
+ *     | Level: 0 (4 levels)   | 28   |  -  |  -  |
+ *     --------------------------------------------
+ *     | Level: 1 (3 levels)   | 37   | 31  | 25  |
+ *     --------------------------------------------
+ *     | Level: 2 (2 levels)   | 46   | 42  | 38  |
+ *     --------------------------------------------
+ *     | Level: 3 (1 level)    | -    | 53  | 51  |
+ *     --------------------------------------------
+ *
+ * We have a magic formula for the Magic_N below:
+ *
+ *  Magic_N(PAGE_SIZE, Level) = 64 - ((PAGE_SHIFT - 3) * Number_of_levels)
+ *
+ * where Number_of_levels = (4 - Level). We are only interested in the
+ * value for Entry_Level for the stage2 page table.
+ *
+ * So, given that T0SZ = (64 - IPA_SHIFT), we can compute 'x' as follows:
+ *
+ *     x = (64 - ((PAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT)
+ *       = IPA_SHIFT - ((PAGE_SHIFT - 3) * Number of levels)
+ *
+ * Here is one way to explain the Magic Formula:
+ *
+ *  x = log2(Size_of_Entry_Level_Table)
+ *
+ * Since we can resolve (PAGE_SHIFT - 3) bits at each level, and another
+ * PAGE_SHIFT bits in the PTE, we have :
+ *
+ *  Bits_Entry_level = IPA_SHIFT - ((PAGE_SHIFT - 3) * (n - 1) + PAGE_SHIFT)
+ *                  = IPA_SHIFT - (PAGE_SHIFT - 3) * n - 3
+ *  where n = number of levels, and since each pointer is 8 bytes, we have:
+ *
+ *  x = Bits_Entry_Level + 3
+ *    = IPA_SHIFT - (PAGE_SHIFT - 3) * n
+ *
+ * The only constraint here is that we have to find the number of page table
+ * levels for a given IPA size (which we do, see stage2_pgtable_levels())
+ */
+#define ARM64_VTTBR_X(ipa, levels)     ((ipa) - ((levels) * (PAGE_SHIFT - 3)))
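
Editorial cross-check of the formula above: with 4K pages a 40-bit IPA uses 3 stage2 levels, so x = 40 - 3 * (12 - 3) = 13, matching the old fixed VTTBR_X = 37 - 24. A minimal sketch, assuming 4K pages:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumption: 4K pages */
#define ARM64_VTTBR_X(ipa, levels)     ((ipa) - ((levels) * (PAGE_SHIFT - 3)))

int main(void)
{
        /*
         * 40-bit IPA, 3 levels: the entry level table is 2^13 bytes, i.e.
         * two concatenated 4K pages, exactly what the old code assumed.
         */
        printf("x = %d\n", ARM64_VTTBR_X(40, 3));
        return 0;
}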
 
 #define VTTBR_CNP_BIT     (UL(1))
-#define VTTBR_BADDR_MASK  (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X)
 #define VTTBR_VMID_SHIFT  (UL(48))
 #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
 
 
 /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
 #define HPFAR_MASK     (~UL(0xf))
+/*
+ * We have
+ *     PAR     [PA_Shift - 1   : 12] = PA      [PA_Shift - 1 : 12]
+ *     HPFAR   [PA_Shift - 9   : 4]  = FIPA    [PA_Shift - 1 : 12]
+ */
+#define PAR_TO_HPFAR(par)              \
+       (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8)
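
Editorial check of the PAR/HPFAR relationship above, assuming the common 48-bit PHYS_MASK_SHIFT: only PA[47:12] of the PAR value survive the conversion and land in HPFAR[39:4].

#include <stdint.h>
#include <stdio.h>

#define PHYS_MASK_SHIFT 48      /* assumption: 48-bit PA configuration */
#define GENMASK_ULL(h, l)       (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))
#define PAR_TO_HPFAR(par) \
        (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8)

int main(void)
{
        uint64_t par = 0x00008123456789aaULL;   /* PA bits plus attribute bits */

        /* Prints 0x8123456780: attribute bits dropped, PA shifted into place */
        printf("HPFAR = %#llx\n", (unsigned long long)PAR_TO_HPFAR(par));
        return 0;
}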
 
 #define kvm_arm_exception_type \
        {0, "IRQ" },            \
index 102b5a5..aea01a0 100644 (file)
@@ -30,6 +30,7 @@
 #define ARM_EXCEPTION_IRQ        0
 #define ARM_EXCEPTION_EL1_SERROR  1
 #define ARM_EXCEPTION_TRAP       2
+#define ARM_EXCEPTION_IL         3
 /* The hyp-stub will return this for any kvm_call_hyp() call */
 #define ARM_EXCEPTION_HYP_GONE   HVC_STUB_ERR
 
@@ -72,8 +73,6 @@ extern void __vgic_v3_init_lrs(void);
 
 extern u32 __kvm_get_mdcr_el2(void);
 
-extern u32 __init_stage2_translation(void);
-
 /* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */
 #define __hyp_this_cpu_ptr(sym)                                                \
        ({                                                              \
index 2842bf1..52fbc82 100644 (file)
@@ -53,7 +53,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
 
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
-int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
+int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
 void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
 
 struct kvm_arch {
@@ -61,11 +61,13 @@ struct kvm_arch {
        u64    vmid_gen;
        u32    vmid;
 
-       /* 1-level 2nd stage table, protected by kvm->mmu_lock */
+       /* stage2 entry level table */
        pgd_t *pgd;
 
        /* VTTBR value associated with above pgd and vmid */
        u64    vttbr;
+       /* VTCR_EL2 value for this VM */
+       u64    vtcr;
 
        /* The last vcpu id that ran on each physical CPU */
        int __percpu *last_vcpu_ran;
@@ -451,13 +453,7 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
                               struct kvm_device_attr *attr);
 
-static inline void __cpu_init_stage2(void)
-{
-       u32 parange = kvm_call_hyp(__init_stage2_translation);
-
-       WARN_ONCE(parange < 40,
-                 "PARange is %d bits, unsupported configuration!", parange);
-}
+static inline void __cpu_init_stage2(void) {}
 
 /* Guest/host FPSIMD coordination helpers */
 int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
@@ -520,8 +516,12 @@ static inline int kvm_arm_have_ssbd(void)
 void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu);
 void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
 
+void kvm_set_ipa_limit(void);
+
 #define __KVM_HAVE_ARCH_VM_ALLOC
 struct kvm *kvm_arch_alloc_vm(void);
 void kvm_arch_free_vm(struct kvm *kvm);
 
+int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
+
 #endif /* __ARM64_KVM_HOST_H__ */
index 384c343..23aca66 100644 (file)
@@ -155,5 +155,15 @@ void deactivate_traps_vhe_put(void);
 u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
 void __noreturn __hyp_do_panic(unsigned long, ...);
 
+/*
+ * Must be called from hyp code running at EL2 with an updated VTTBR
+ * and interrupts disabled.
+ */
+static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm)
+{
+       write_sysreg(kvm->arch.vtcr, vtcr_el2);
+       write_sysreg(kvm->arch.vttbr, vttbr_el2);
+}
+
 #endif /* __ARM64_KVM_HYP_H__ */
 
index 64337af..6586573 100644 (file)
@@ -141,8 +141,16 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
  * We currently only support a 40bit IPA.
  */
 #define KVM_PHYS_SHIFT (40)
-#define KVM_PHYS_SIZE  (1UL << KVM_PHYS_SHIFT)
-#define KVM_PHYS_MASK  (KVM_PHYS_SIZE - 1UL)
+
+#define kvm_phys_shift(kvm)            VTCR_EL2_IPA(kvm->arch.vtcr)
+#define kvm_phys_size(kvm)             (_AC(1, ULL) << kvm_phys_shift(kvm))
+#define kvm_phys_mask(kvm)             (kvm_phys_size(kvm) - _AC(1, ULL))
+
+static inline bool kvm_page_empty(void *ptr)
+{
+       struct page *ptr_page = virt_to_page(ptr);
+       return page_count(ptr_page) == 1;
+}
 
 #include <asm/stage2_pgtable.h>
 
@@ -238,12 +246,6 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
        return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
 }
 
-static inline bool kvm_page_empty(void *ptr)
-{
-       struct page *ptr_page = virt_to_page(ptr);
-       return page_count(ptr_page) == 1;
-}
-
 #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
 
 #ifdef __PAGETABLE_PMD_FOLDED
@@ -517,6 +519,30 @@ static inline int hyp_map_aux_data(void)
 
 #define kvm_phys_to_vttbr(addr)                phys_to_ttbr(addr)
 
+/*
+ * Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
+ * With v8.2 LVA extensions, 'x' should be a minimum of 6 with
+ * 52bit IPS.
+ */
+static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
+{
+       int x = ARM64_VTTBR_X(ipa_shift, levels);
+
+       return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
+}
+
+static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
+{
+       unsigned int x = arm64_vttbr_x(ipa_shift, levels);
+
+       return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
+}
+
+static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
+{
+       return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
+}
+
 static inline bool kvm_cpu_has_cnp(void)
 {
        return system_supports_cnp();
index 6bc4388..fce22c4 100644 (file)
@@ -25,6 +25,9 @@
 #define CurrentEL_EL1          (1 << 2)
 #define CurrentEL_EL2          (2 << 2)
 
+/* Additional SPSR bits not exposed in the UABI */
+#define PSR_IL_BIT             (1 << 20)
+
 /* AArch32-specific ptrace requests */
 #define COMPAT_PTRACE_GETREGS          12
 #define COMPAT_PTRACE_SETREGS          13
diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h
deleted file mode 100644 (file)
index 2656a0f..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ARM64_S2_PGTABLE_NOPMD_H_
-#define __ARM64_S2_PGTABLE_NOPMD_H_
-
-#include <asm/stage2_pgtable-nopud.h>
-
-#define __S2_PGTABLE_PMD_FOLDED
-
-#define S2_PMD_SHIFT           S2_PUD_SHIFT
-#define S2_PTRS_PER_PMD                1
-#define S2_PMD_SIZE            (1UL << S2_PMD_SHIFT)
-#define S2_PMD_MASK            (~(S2_PMD_SIZE-1))
-
-#define stage2_pud_none(pud)                   (0)
-#define stage2_pud_present(pud)                        (1)
-#define stage2_pud_clear(pud)                  do { } while (0)
-#define stage2_pud_populate(pud, pmd)          do { } while (0)
-#define stage2_pmd_offset(pud, address)                ((pmd_t *)(pud))
-
-#define stage2_pmd_free(pmd)                   do { } while (0)
-
-#define stage2_pmd_addr_end(addr, end)         (end)
-
-#define stage2_pud_huge(pud)                   (0)
-#define stage2_pmd_table_empty(pmdp)           (0)
-
-#endif
diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h
deleted file mode 100644 (file)
index 5ee87b5..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ARM64_S2_PGTABLE_NOPUD_H_
-#define __ARM64_S2_PGTABLE_NOPUD_H_
-
-#define __S2_PGTABLE_PUD_FOLDED
-
-#define S2_PUD_SHIFT           S2_PGDIR_SHIFT
-#define S2_PTRS_PER_PUD                1
-#define S2_PUD_SIZE            (_AC(1, UL) << S2_PUD_SHIFT)
-#define S2_PUD_MASK            (~(S2_PUD_SIZE-1))
-
-#define stage2_pgd_none(pgd)                   (0)
-#define stage2_pgd_present(pgd)                        (1)
-#define stage2_pgd_clear(pgd)                  do { } while (0)
-#define stage2_pgd_populate(pgd, pud)  do { } while (0)
-
-#define stage2_pud_offset(pgd, address)                ((pud_t *)(pgd))
-
-#define stage2_pud_free(x)                     do { } while (0)
-
-#define stage2_pud_addr_end(addr, end)         (end)
-#define stage2_pud_table_empty(pmdp)           (0)
-
-#endif
index 8b68099..d352f6d 100644 (file)
 #ifndef __ARM64_S2_PGTABLE_H_
 #define __ARM64_S2_PGTABLE_H_
 
+#include <linux/hugetlb.h>
 #include <asm/pgtable.h>
 
 /*
+ * PGDIR_SHIFT determines the size a top-level page table entry can map
+ * and depends on the number of levels in the page table. Compute the
+ * PGDIR_SHIFT for a given number of levels.
+ */
+#define pt_levels_pgdir_shift(lvls)    ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls))
+
+/*
  * The hardware supports concatenation of up to 16 tables at stage2 entry level
  * and we use the feature whenever possible.
  *
  * On arm64, the smallest PAGE_SIZE supported is 4k, which means
  *             (PAGE_SHIFT - 3) > 4 holds for all page sizes.
  * This implies, the total number of page table levels at stage2 expected
- * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4)
+ * by the hardware is actually the number of levels required for (IPA_SHIFT - 4)
  * in normal translations(e.g, stage1), since we cannot have another level in
- * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4).
+ * the range (IPA_SHIFT, IPA_SHIFT - 4).
  */
-#define STAGE2_PGTABLE_LEVELS          ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4)
+#define stage2_pgtable_levels(ipa)     ARM64_HW_PGTABLE_LEVELS((ipa) - 4)
+#define kvm_stage2_levels(kvm)         VTCR_EL2_LVLS(kvm->arch.vtcr)
 
-/*
- * With all the supported VA_BITs and 40bit guest IPA, the following condition
- * is always true:
- *
- *       STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS
- *
- * We base our stage-2 page table walker helpers on this assumption and
- * fall back to using the host version of the helper wherever possible.
- * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back
- * to using the host version, since it is guaranteed it is not folded at host.
- *
- * If the condition breaks in the future, we can rearrange the host level
- * definitions and reuse them for stage2. Till then...
- */
-#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS
-#error "Unsupported combination of guest IPA and host VA_BITS."
-#endif
-
-/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */
-#define S2_PGDIR_SHIFT                 ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS)
-#define S2_PGDIR_SIZE                  (_AC(1, UL) << S2_PGDIR_SHIFT)
-#define S2_PGDIR_MASK                  (~(S2_PGDIR_SIZE - 1))
+/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */
+#define stage2_pgdir_shift(kvm)                pt_levels_pgdir_shift(kvm_stage2_levels(kvm))
+#define stage2_pgdir_size(kvm)         (1ULL << stage2_pgdir_shift(kvm))
+#define stage2_pgdir_mask(kvm)         ~(stage2_pgdir_size(kvm) - 1)
 
 /*
  * The number of PTRS across all concatenated stage2 tables given by the
  * number of bits resolved at the initial level.
+ * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA),
+ * in which case, stage2_pgd_ptrs will have one entry.
  */
-#define PTRS_PER_S2_PGD                        (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT))
+#define pgd_ptrs_shift(ipa, pgdir_shift)       \
+       ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)
+#define __s2_pgd_ptrs(ipa, lvls)               \
+       (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls))))
+#define __s2_pgd_size(ipa, lvls)       (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t))
+
+#define stage2_pgd_ptrs(kvm)           __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
+#define stage2_pgd_size(kvm)           __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
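
Editorial worked example for these helpers, using copies of the host 4K-page definitions from pgtable-hwdef.h (an assumption of this sketch): a 40-bit IPA gives 3 levels, a 30-bit pgdir shift and 1024 entry-level pointers, i.e. an 8K entry-level table made of two concatenated pages.

#include <stdio.h>

/* Assumed host definitions for 4K pages */
#define PAGE_SHIFT                      12
#define ARM64_HW_PGTABLE_LEVELS(va)     (((va) - 4) / (PAGE_SHIFT - 3))
#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n) ((PAGE_SHIFT - 3) * (4 - (n)) + 3)

/* Copies of the stage2 helpers above, taking the IPA size directly */
#define stage2_pgtable_levels(ipa)      ARM64_HW_PGTABLE_LEVELS((ipa) - 4)
#define pt_levels_pgdir_shift(lvls)     ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls))
#define pgd_ptrs_shift(ipa, pgdir_shift) \
        ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)

int main(void)
{
        int ipa = 40;                                   /* legacy default IPA */
        int lvls = stage2_pgtable_levels(ipa);          /* 3 */
        int shift = pt_levels_pgdir_shift(lvls);        /* 30 */
        int ptrs = 1 << pgd_ptrs_shift(ipa, shift);     /* 1024 */

        printf("levels=%d pgdir_shift=%d pgd_ptrs=%d pgd_size=%zu\n",
               lvls, shift, ptrs, ptrs * sizeof(unsigned long));
        return 0;
}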
 
 /*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
- * levels in addition to the PGD.
+ * kvm_mmu_cache_min_pages() is the number of pages required to install
+ * a stage-2 translation. We pre-allocate the entry level page table at
+ * VM creation.
  */
-#define KVM_MMU_CACHE_MIN_PAGES                (STAGE2_PGTABLE_LEVELS - 1)
+#define kvm_mmu_cache_min_pages(kvm)   (kvm_stage2_levels(kvm) - 1)
 
-
-#if STAGE2_PGTABLE_LEVELS > 3
+/* Stage2 PUD definitions when the level is present */
+static inline bool kvm_stage2_has_pud(struct kvm *kvm)
+{
+       return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3);
+}
 
 #define S2_PUD_SHIFT                   ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
-#define S2_PUD_SIZE                    (_AC(1, UL) << S2_PUD_SHIFT)
+#define S2_PUD_SIZE                    (1UL << S2_PUD_SHIFT)
 #define S2_PUD_MASK                    (~(S2_PUD_SIZE - 1))
 
-#define stage2_pgd_none(pgd)                           pgd_none(pgd)
-#define stage2_pgd_clear(pgd)                          pgd_clear(pgd)
-#define stage2_pgd_present(pgd)                                pgd_present(pgd)
-#define stage2_pgd_populate(pgd, pud)                  pgd_populate(NULL, pgd, pud)
-#define stage2_pud_offset(pgd, address)                        pud_offset(pgd, address)
-#define stage2_pud_free(pud)                           pud_free(NULL, pud)
+static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd)
+{
+       if (kvm_stage2_has_pud(kvm))
+               return pgd_none(pgd);
+       else
+               return 0;
+}
 
-#define stage2_pud_table_empty(pudp)                   kvm_page_empty(pudp)
+static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp)
+{
+       if (kvm_stage2_has_pud(kvm))
+               pgd_clear(pgdp);
+}
 
-static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd)
 {
-       phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
+       if (kvm_stage2_has_pud(kvm))
+               return pgd_present(pgd);
+       else
+               return 1;
+}
 
-       return (boundary - 1 < end - 1) ? boundary : end;
+static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud)
+{
+       if (kvm_stage2_has_pud(kvm))
+               pgd_populate(NULL, pgd, pud);
+}
+
+static inline pud_t *stage2_pud_offset(struct kvm *kvm,
+                                      pgd_t *pgd, unsigned long address)
+{
+       if (kvm_stage2_has_pud(kvm))
+               return pud_offset(pgd, address);
+       else
+               return (pud_t *)pgd;
 }
 
-#endif         /* STAGE2_PGTABLE_LEVELS > 3 */
+static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
+{
+       if (kvm_stage2_has_pud(kvm))
+               pud_free(NULL, pud);
+}
 
+static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
+{
+       if (kvm_stage2_has_pud(kvm))
+               return kvm_page_empty(pudp);
+       else
+               return false;
+}
 
-#if STAGE2_PGTABLE_LEVELS > 2
+static inline phys_addr_t
+stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+       if (kvm_stage2_has_pud(kvm)) {
+               phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
+
+               return (boundary - 1 < end - 1) ? boundary : end;
+       } else {
+               return end;
+       }
+}
+
+/* Stage2 PMD definitions when the level is present */
+static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
+{
+       return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2);
+}
 
 #define S2_PMD_SHIFT                   ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
-#define S2_PMD_SIZE                    (_AC(1, UL) << S2_PMD_SHIFT)
+#define S2_PMD_SIZE                    (1UL << S2_PMD_SHIFT)
 #define S2_PMD_MASK                    (~(S2_PMD_SIZE - 1))
 
-#define stage2_pud_none(pud)                           pud_none(pud)
-#define stage2_pud_clear(pud)                          pud_clear(pud)
-#define stage2_pud_present(pud)                                pud_present(pud)
-#define stage2_pud_populate(pud, pmd)                  pud_populate(NULL, pud, pmd)
-#define stage2_pmd_offset(pud, address)                        pmd_offset(pud, address)
-#define stage2_pmd_free(pmd)                           pmd_free(NULL, pmd)
+static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud)
+{
+       if (kvm_stage2_has_pmd(kvm))
+               return pud_none(pud);
+       else
+               return 0;
+}
+
+static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud)
+{
+       if (kvm_stage2_has_pmd(kvm))
+               pud_clear(pud);
+}
 
-#define stage2_pud_huge(pud)                           pud_huge(pud)
-#define stage2_pmd_table_empty(pmdp)                   kvm_page_empty(pmdp)
+static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud)
+{
+       if (kvm_stage2_has_pmd(kvm))
+               return pud_present(pud);
+       else
+               return 1;
+}
 
-static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd)
 {
-       phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
+       if (kvm_stage2_has_pmd(kvm))
+               pud_populate(NULL, pud, pmd);
+}
 
-       return (boundary - 1 < end - 1) ? boundary : end;
+static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
+                                      pud_t *pud, unsigned long address)
+{
+       if (kvm_stage2_has_pmd(kvm))
+               return pmd_offset(pud, address);
+       else
+               return (pmd_t *)pud;
 }
 
-#endif         /* STAGE2_PGTABLE_LEVELS > 2 */
+static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
+{
+       if (kvm_stage2_has_pmd(kvm))
+               pmd_free(NULL, pmd);
+}
+
+static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud)
+{
+       if (kvm_stage2_has_pmd(kvm))
+               return pud_huge(pud);
+       else
+               return 0;
+}
+
+static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
+{
+       if (kvm_stage2_has_pmd(kvm))
+               return kvm_page_empty(pmdp);
+       else
+               return 0;
+}
 
-#define stage2_pte_table_empty(ptep)                   kvm_page_empty(ptep)
+static inline phys_addr_t
+stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+       if (kvm_stage2_has_pmd(kvm)) {
+               phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
 
-#if STAGE2_PGTABLE_LEVELS == 2
-#include <asm/stage2_pgtable-nopmd.h>
-#elif STAGE2_PGTABLE_LEVELS == 3
-#include <asm/stage2_pgtable-nopud.h>
-#endif
+               return (boundary - 1 < end - 1) ? boundary : end;
+       } else {
+               return end;
+       }
+}
 
+static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep)
+{
+       return kvm_page_empty(ptep);
+}
 
-#define stage2_pgd_index(addr)                         (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
+static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
+{
+       return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1));
+}
 
-static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 {
-       phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK;
+       phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm);
 
        return (boundary - 1 < end - 1) ? boundary : end;
 }
index a6c9fba..dd436a5 100644 (file)
@@ -391,15 +391,15 @@ int __attribute_const__ kvm_target_cpu(void)
                        return KVM_ARM_TARGET_CORTEX_A53;
                case ARM_CPU_PART_CORTEX_A57:
                        return KVM_ARM_TARGET_CORTEX_A57;
-               };
+               }
                break;
        case ARM_CPU_IMP_APM:
                switch (part_number) {
                case APM_CPU_PART_POTENZA:
                        return KVM_ARM_TARGET_XGENE_POTENZA;
-               };
+               }
                break;
-       };
+       }
 
        /* Return a default generic target */
        return KVM_ARM_TARGET_GENERIC_V8;
index e5e741b..35a81be 100644 (file)
@@ -284,6 +284,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
                 */
                run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                return 0;
+       case ARM_EXCEPTION_IL:
+               /*
+                * We attempted an illegal exception return.  Guest state must
+                * have been corrupted somehow.  Give up.
+                */
+               run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+               return -EINVAL;
        default:
                kvm_pr_unimpl("Unsupported exception type: %d",
                              exception_index);
index 2fabc2d..82d1904 100644 (file)
@@ -19,7 +19,6 @@ obj-$(CONFIG_KVM_ARM_HOST) += switch.o
 obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
 obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
 obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
-obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o
 
 # KVM code is run at a different exception code with a different map, so
 # compiler instrumentation that inserts callbacks or checks into the code may
index 24b4fba..b1f14f7 100644 (file)
@@ -162,6 +162,20 @@ el1_error:
        mov     x0, #ARM_EXCEPTION_EL1_SERROR
        b       __guest_exit
 
+el2_sync:
+       /* Check for illegal exception return, otherwise panic */
+       mrs     x0, spsr_el2
+
+       /* if this was something else, then panic! */
+       tst     x0, #PSR_IL_BIT
+       b.eq    __hyp_panic
+
+       /* Let's attempt a recovery from the illegal exception return */
+       get_vcpu_ptr    x1, x0
+       mov     x0, #ARM_EXCEPTION_IL
+       b       __guest_exit
+
+
 el2_error:
        ldp     x0, x1, [sp], #16
 
@@ -240,7 +254,7 @@ ENTRY(__kvm_hyp_vector)
        invalid_vect    el2t_fiq_invalid        // FIQ EL2t
        invalid_vect    el2t_error_invalid      // Error EL2t
 
-       invalid_vect    el2h_sync_invalid       // Synchronous EL2h
+       valid_vect      el2_sync                // Synchronous EL2h
        invalid_vect    el2h_irq_invalid        // IRQ EL2h
        invalid_vect    el2h_fiq_invalid        // FIQ EL2h
        valid_vect      el2_error               // Error EL2h
diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c
deleted file mode 100644 (file)
index 603e1ee..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/types.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_hyp.h>
-
-u32 __hyp_text __init_stage2_translation(void)
-{
-       u64 val = VTCR_EL2_FLAGS;
-       u64 parange;
-       u64 tmp;
-
-       /*
-        * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS
-        * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, while
-        * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2...
-        */
-       parange = read_sysreg(id_aa64mmfr0_el1) & 7;
-       if (parange > ID_AA64MMFR0_PARANGE_MAX)
-               parange = ID_AA64MMFR0_PARANGE_MAX;
-       val |= parange << 16;
-
-       /* Compute the actual PARange... */
-       switch (parange) {
-       case 0:
-               parange = 32;
-               break;
-       case 1:
-               parange = 36;
-               break;
-       case 2:
-               parange = 40;
-               break;
-       case 3:
-               parange = 42;
-               break;
-       case 4:
-               parange = 44;
-               break;
-       case 5:
-       default:
-               parange = 48;
-               break;
-       }
-
-       /*
-        * ... and clamp it to 40 bits, unless we have some braindead
-        * HW that implements less than that. In all cases, we'll
-        * return that value for the rest of the kernel to decide what
-        * to do.
-        */
-       val |= 64 - (parange > 40 ? 40 : parange);
-
-       /*
-        * Check the availability of Hardware Access Flag / Dirty Bit
-        * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2.
-        */
-       tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf;
-       if (tmp)
-               val |= VTCR_EL2_HA;
-
-       /*
-        * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS
-        * bit in VTCR_EL2.
-        */
-       tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_VMIDBITS_SHIFT) & 0xf;
-       val |= (tmp == ID_AA64MMFR1_VMIDBITS_16) ?
-                       VTCR_EL2_VS_16BIT :
-                       VTCR_EL2_VS_8BIT;
-
-       write_sysreg(val, vtcr_el2);
-
-       return parange;
-}
index ca46153..7cc175c 100644 (file)
@@ -198,7 +198,7 @@ void deactivate_traps_vhe_put(void)
 
 static void __hyp_text __activate_vm(struct kvm *kvm)
 {
-       write_sysreg(kvm->arch.vttbr, vttbr_el2);
+       __load_guest_stage2(kvm);
 }
 
 static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
@@ -263,7 +263,7 @@ static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar)
                return false; /* Translation failed, back to guest */
 
        /* Convert PAR to HPFAR format */
-       *hpfar = ((tmp >> 12) & ((1UL << 36) - 1)) << 4;
+       *hpfar = PAR_TO_HPFAR(tmp);
        return true;
 }
 
index 76d016b..68d6f7c 100644 (file)
@@ -152,8 +152,25 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
 static void __hyp_text
 __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
 {
+       u64 pstate = ctxt->gp_regs.regs.pstate;
+       u64 mode = pstate & PSR_AA32_MODE_MASK;
+
+       /*
+        * Safety check to ensure we're setting the CPU up to enter the guest
+        * in a less privileged mode.
+        *
+        * If we are attempting a return to EL2 or higher in AArch64 state,
+        * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that
+        * we'll take an illegal exception state exception immediately after
+        * the ERET to the guest.  Attempts to return to AArch32 Hyp will
+        * result in an illegal exception return because EL2's execution state
+        * is determined by SCR_EL3.RW.
+        */
+       if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t)
+               pstate = PSR_MODE_EL2h | PSR_IL_BIT;
+
        write_sysreg_el2(ctxt->gp_regs.regs.pc,         elr);
-       write_sysreg_el2(ctxt->gp_regs.regs.pstate,     spsr);
+       write_sysreg_el2(pstate,                        spsr);
 
        if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
                write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2);
index 131c777..4dbd9c6 100644 (file)
@@ -30,7 +30,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm)
         * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so
         * let's flip TGE before executing the TLB operation.
         */
-       write_sysreg(kvm->arch.vttbr, vttbr_el2);
+       __load_guest_stage2(kvm);
        val = read_sysreg(hcr_el2);
        val &= ~HCR_TGE;
        write_sysreg(val, hcr_el2);
@@ -39,7 +39,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm)
 
 static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm)
 {
-       write_sysreg(kvm->arch.vttbr, vttbr_el2);
+       __load_guest_stage2(kvm);
        isb();
 }
 
index e37c78b..b72a3dd 100644 (file)
@@ -26,6 +26,7 @@
 
 #include <kvm/arm_arch_timer.h>
 
+#include <asm/cpufeature.h>
 #include <asm/cputype.h>
 #include <asm/ptrace.h>
 #include <asm/kvm_arm.h>
@@ -33,6 +34,9 @@
 #include <asm/kvm_coproc.h>
 #include <asm/kvm_mmu.h>
 
+/* Maximum phys_shift supported for any VM on this host */
+static u32 kvm_ipa_limit;
+
 /*
  * ARMv8 Reset Values
  */
@@ -55,12 +59,12 @@ static bool cpu_has_32bit_el1(void)
 }
 
 /**
- * kvm_arch_dev_ioctl_check_extension
+ * kvm_arch_vm_ioctl_check_extension
  *
  * We currently assume that the number of HW registers is uniform
  * across all CPUs (see cpuinfo_sanity_check).
  */
-int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
+int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 {
        int r;
 
@@ -82,9 +86,11 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
                break;
        case KVM_CAP_SET_GUEST_DEBUG:
        case KVM_CAP_VCPU_ATTRIBUTES:
-       case KVM_CAP_VCPU_EVENTS:
                r = 1;
                break;
+       case KVM_CAP_ARM_VM_IPA_SIZE:
+               r = kvm_ipa_limit;
+               break;
        default:
                r = 0;
        }
@@ -133,3 +139,99 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
        /* Reset timer */
        return kvm_timer_vcpu_reset(vcpu);
 }
+
+void kvm_set_ipa_limit(void)
+{
+       unsigned int ipa_max, pa_max, va_max, parange;
+
+       parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7;
+       pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
+
+       /* Clamp the IPA limit to the PA size supported by the kernel */
+       ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
+       /*
+        * Since our stage2 table is dependent on the stage1 page table code,
+        * we must always honor the following condition:
+        *
+        *  Number of levels in Stage1 >= Number of levels in Stage2.
+        *
+        * So clamp the ipa limit further down to limit the number of levels.
+        * Since we can concatenate up to 16 tables at entry level, we could
+        * go up to 4 bits above the maximum VA addressable with the current
+        * number of levels.
+        */
+       va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
+       va_max += 4;
+
+       if (va_max < ipa_max)
+               ipa_max = va_max;
+
+       /*
+        * If the final limit is lower than the real physical address
+        * limit of the CPUs, report the reason.
+        */
+       if (ipa_max < pa_max)
+               pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
+                       (va_max < pa_max) ? "Virtual" : "Physical");
+
+       WARN(ipa_max < KVM_PHYS_SHIFT,
+            "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
+       kvm_ipa_limit = ipa_max;
+       kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+}
+
+/*
+ * Configure the VTCR_EL2 for this VM. The VTCR value is common
+ * across all the physical CPUs on the system. We use system wide
+ * sanitised values to fill in different fields, except for Hardware
+ * Management of Access Flags. HA Flag is set unconditionally on
+ * all CPUs, as it is safe to run with or without the feature and
+ * the bit is RES0 on CPUs that don't support it.
+ */
+int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
+{
+       u64 vtcr = VTCR_EL2_FLAGS;
+       u32 parange, phys_shift;
+       u8 lvls;
+
+       if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+               return -EINVAL;
+
+       phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
+       if (phys_shift) {
+               if (phys_shift > kvm_ipa_limit ||
+                   phys_shift < 32)
+                       return -EINVAL;
+       } else {
+               phys_shift = KVM_PHYS_SHIFT;
+       }
+
+       parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7;
+       if (parange > ID_AA64MMFR0_PARANGE_MAX)
+               parange = ID_AA64MMFR0_PARANGE_MAX;
+       vtcr |= parange << VTCR_EL2_PS_SHIFT;
+
+       vtcr |= VTCR_EL2_T0SZ(phys_shift);
+       /*
+        * Use a minimum 2 level page table to prevent splitting
+        * host PMD huge pages at stage2.
+        */
+       lvls = stage2_pgtable_levels(phys_shift);
+       if (lvls < 2)
+               lvls = 2;
+       vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
+
+       /*
+        * Enable Hardware Access Flag management unconditionally on all
+        * CPUs. The feature is RES0 on CPUs without support and the bit
+        * is simply ignored by them.
+        */
+       vtcr |= VTCR_EL2_HA;
+
+       /* Set the vmid bits */
+       vtcr |= (kvm_get_vmid_bits() == 16) ?
+               VTCR_EL2_VS_16BIT :
+               VTCR_EL2_VS_8BIT;
+       kvm->arch.vtcr = vtcr;
+       return 0;
+}
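
Editorial sketch of the intended userspace flow (identifiers taken from this series' uapi additions; error handling elided): query KVM_CAP_ARM_VM_IPA_SIZE for the host limit, then pass the desired PA width in the machine type of KVM_CREATE_VM, or 0 for the legacy 40-bit default.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int create_vm_with_wide_ipa(void)
{
        int kvm = open("/dev/kvm", O_RDWR);
        /* 0 means the host cannot do IPA size selection at all */
        int ipa_max = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_ARM_VM_IPA_SIZE);
        unsigned long type = (ipa_max >= 48) ? KVM_VM_TYPE_ARM_IPA_SIZE(48) : 0;

        /* type == 0 falls back to the 40-bit KVM_PHYS_SHIFT default */
        return ioctl(kvm, KVM_CREATE_VM, type);
}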
index 1f4691c..c55ba3b 100644 (file)
@@ -150,4 +150,25 @@ extern s32 patch__memset_nocache, patch__memcpy_nocache;
 
 extern long flush_count_cache;
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
+void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
+#else
+static inline void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
+                                    bool preserve_nv) { }
+static inline void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
+                                       bool preserve_nv) { }
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+void kvmhv_save_host_pmu(void);
+void kvmhv_load_host_pmu(void);
+void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
+void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
+
+int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
+
+long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
+long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr,
+                       unsigned long dabrx);
+
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
index b3520b5..66db23e 100644 (file)
@@ -203,6 +203,18 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
        BUG();
 }
 
+static inline unsigned int ap_to_shift(unsigned long ap)
+{
+       int psize;
+
+       for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+               if (mmu_psize_defs[psize].ap == ap)
+                       return mmu_psize_defs[psize].shift;
+       }
+
+       return -1;
+}
+
 static inline unsigned long get_sllp_encoding(int psize)
 {
        unsigned long sllp;
index 1154a6d..671316f 100644 (file)
@@ -53,6 +53,7 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid,
                                        unsigned long addr,
                                        unsigned long page_size);
 extern void radix__flush_pwc_lpid(unsigned int lpid);
+extern void radix__flush_tlb_lpid(unsigned int lpid);
 extern void radix__local_flush_tlb_lpid(unsigned int lpid);
 extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
 
index a0b17f9..45e8789 100644 (file)
 #define H_GET_24X7_DATA                0xF07C
 #define H_GET_PERF_COUNTER_INFO        0xF080
 
+/* Platform-specific hcalls used for nested HV KVM */
+#define H_SET_PARTITION_TABLE  0xF800
+#define H_ENTER_NESTED         0xF804
+#define H_TLB_INVALIDATE       0xF808
+
 /* Values for 2nd argument to H_SET_MODE */
 #define H_SET_MODE_RESOURCE_SET_CIABR          1
 #define H_SET_MODE_RESOURCE_SET_DAWR           2
@@ -461,6 +466,42 @@ struct h_cpu_char_result {
        u64 behaviour;
 };
 
+/* Register state for entering a nested guest with H_ENTER_NESTED */
+struct hv_guest_state {
+       u64 version;            /* version of this structure layout */
+       u32 lpid;
+       u32 vcpu_token;
+       /* These registers are hypervisor privileged (at least for writing) */
+       u64 lpcr;
+       u64 pcr;
+       u64 amor;
+       u64 dpdes;
+       u64 hfscr;
+       s64 tb_offset;
+       u64 dawr0;
+       u64 dawrx0;
+       u64 ciabr;
+       u64 hdec_expiry;
+       u64 purr;
+       u64 spurr;
+       u64 ic;
+       u64 vtb;
+       u64 hdar;
+       u64 hdsisr;
+       u64 heir;
+       u64 asdr;
+       /* These are OS privileged but need to be set late in guest entry */
+       u64 srr0;
+       u64 srr1;
+       u64 sprg[4];
+       u64 pidr;
+       u64 cfar;
+       u64 ppr;
+};
+
+/* Latest version of hv_guest_state structure */
+#define HV_GUEST_STATE_VERSION 1
+
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_HVCALL_H */
index 3d4b88c..35db0cb 100644 (file)
@@ -126,7 +126,7 @@ struct iommu_table {
        int it_nid;
 };
 
-#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \
+#define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
                ((tbl)->it_ops->useraddrptr((tbl), (entry), false))
 #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
                ((tbl)->it_ops->useraddrptr((tbl), (entry), true))
index a790d5c..1f32191 100644 (file)
@@ -84,7 +84,6 @@
 #define BOOK3S_INTERRUPT_INST_STORAGE  0x400
 #define BOOK3S_INTERRUPT_INST_SEGMENT  0x480
 #define BOOK3S_INTERRUPT_EXTERNAL      0x500
-#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL        0x501
 #define BOOK3S_INTERRUPT_EXTERNAL_HV   0x502
 #define BOOK3S_INTERRUPT_ALIGNMENT     0x600
 #define BOOK3S_INTERRUPT_PROGRAM       0x700
 #define BOOK3S_IRQPRIO_EXTERNAL                        14
 #define BOOK3S_IRQPRIO_DECREMENTER             15
 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR     16
-#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL          17
-#define BOOK3S_IRQPRIO_MAX                     18
+#define BOOK3S_IRQPRIO_MAX                     17
 
 #define BOOK3S_HFLAG_DCBZ32                    0x1
 #define BOOK3S_HFLAG_SLB                       0x2
index 83a9aa3..09f8e9b 100644 (file)
@@ -188,14 +188,37 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
 extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
                        struct kvm_vcpu *vcpu,
                        unsigned long ea, unsigned long dsisr);
+extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                     struct kvmppc_pte *gpte, u64 root,
+                                     u64 *pte_ret_p);
+extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
+                       struct kvmppc_pte *gpte, u64 table,
+                       int table_index, u64 *pte_ret_p);
 extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                        struct kvmppc_pte *gpte, bool data, bool iswrite);
+extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
+                       unsigned int shift, struct kvm_memory_slot *memslot,
+                       unsigned int lpid);
+extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
+                                   bool writing, unsigned long gpa,
+                                   unsigned int lpid);
+extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
+                               unsigned long gpa,
+                               struct kvm_memory_slot *memslot,
+                               bool writing, bool kvm_ro,
+                               pte_t *inserted_pte, unsigned int *levelp);
 extern int kvmppc_init_vm_radix(struct kvm *kvm);
 extern void kvmppc_free_radix(struct kvm *kvm);
+extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
+                                     unsigned int lpid);
 extern int kvmppc_radix_init(void);
 extern void kvmppc_radix_exit(void);
 extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        unsigned long gfn);
+extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
+                            unsigned long gpa, unsigned int shift,
+                            struct kvm_memory_slot *memslot,
+                            unsigned int lpid);
 extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        unsigned long gfn);
 extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
@@ -271,6 +294,21 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 #endif
 
+long kvmhv_nested_init(void);
+void kvmhv_nested_exit(void);
+void kvmhv_vm_nested_init(struct kvm *kvm);
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
+void kvmhv_release_all_nested(struct kvm *kvm);
+long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
+long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
+int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu,
+                         u64 time_limit, unsigned long lpcr);
+void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
+void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
+                                  struct hv_guest_state *hr);
+long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
+
 void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 
 extern int kvm_irq_bypass;
@@ -301,12 +339,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
 
 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
 {
-       vcpu->arch.cr = val;
+       vcpu->arch.regs.ccr = val;
 }
 
 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 {
-       return vcpu->arch.cr;
+       return vcpu->arch.regs.ccr;
 }
 
 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
@@ -384,9 +422,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
 /* TO = 31 for unconditional trap */
 #define INS_TW                         0x7fe00008
 
-/* LPIDs we support with this build -- runtime limit may be lower */
-#define KVMPPC_NR_LPIDS                        (LPID_RSVD + 1)
-
 #define SPLIT_HACK_MASK                        0xff000000
 #define SPLIT_HACK_OFFS                        0xfb000000
 
index dc435a5..6d29814 100644 (file)
 #include <linux/string.h>
 #include <asm/bitops.h>
 #include <asm/book3s/64/mmu-hash.h>
+#include <asm/cpu_has_feature.h>
+#include <asm/ppc-opcode.h>
+
+#ifdef CONFIG_PPC_PSERIES
+static inline bool kvmhv_on_pseries(void)
+{
+       return !cpu_has_feature(CPU_FTR_HVMODE);
+}
+#else
+static inline bool kvmhv_on_pseries(void)
+{
+       return false;
+}
+#endif
+
+/*
+ * Structure for a nested guest, that is, for a guest that is managed by
+ * one of our guests.
+ */
+struct kvm_nested_guest {
+       struct kvm *l1_host;            /* L1 VM that owns this nested guest */
+       int l1_lpid;                    /* lpid L1 guest thinks this guest is */
+       int shadow_lpid;                /* real lpid of this nested guest */
+       pgd_t *shadow_pgtable;          /* our page table for this guest */
+       u64 l1_gr_to_hr;                /* L1's addr of part'n-scoped table */
+       u64 process_table;              /* process table entry for this guest */
+       long refcnt;                    /* number of pointers to this struct */
+       struct mutex tlb_lock;          /* serialize page faults and tlbies */
+       struct kvm_nested_guest *next;
+       cpumask_t need_tlb_flush;
+       cpumask_t cpu_in_guest;
+       short prev_cpu[NR_CPUS];
+};
+
+/*
+ * We define a nested rmap entry as a single 64-bit quantity
+ * 0xFFF0000000000000  12-bit lpid field
+ * 0x000FFFFFFFFFF000  40-bit guest 4k page frame number
+ * 0x0000000000000001  1-bit  single entry flag
+ */
+#define RMAP_NESTED_LPID_MASK          0xFFF0000000000000UL
+#define RMAP_NESTED_LPID_SHIFT         (52)
+#define RMAP_NESTED_GPA_MASK           0x000FFFFFFFFFF000UL
+#define RMAP_NESTED_IS_SINGLE_ENTRY    0x0000000000000001UL
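As a minimal sketch of the bit layout above (not part of the patch; the helper names are made up for illustration), a nested rmap value could be packed and unpacked like this:

static inline u64 nested_rmap_encode(unsigned int lpid, unsigned long gpa)
{
	/* lpid goes in bits 52-63, the guest 4k page frame number in bits 12-51 */
	return (((u64)lpid << RMAP_NESTED_LPID_SHIFT) & RMAP_NESTED_LPID_MASK) |
	       (gpa & RMAP_NESTED_GPA_MASK);
}

static inline unsigned int nested_rmap_lpid(u64 rmap)
{
	return (rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
}

static inline unsigned long nested_rmap_gpa(u64 rmap)
{
	return rmap & RMAP_NESTED_GPA_MASK;
}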
+
+/* Structure for a nested guest rmap entry */
+struct rmap_nested {
+       struct llist_node list;
+       u64 rmap;
+};
+
+/*
+ * for_each_nest_rmap_safe - iterate over the list of nested rmap entries
+ *                          safe against removal of the list entry or NULL list
+ * @pos:       a (struct rmap_nested *) to use as a loop cursor
+ * @node:      pointer to the first entry
+ *             NOTE: this can be NULL
+ * @rmapp:     an (unsigned long *) in which to return the rmap entries on each
+ *             iteration
+ *             NOTE: this must point to already allocated memory
+ *
+ * The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the
+ * rmap entry in the memslot. The list is always terminated by a "single entry"
+ * stored in the list element of the final entry of the llist. If there is ONLY
+ * a single entry then this is itself in the rmap entry of the memslot, not a
+ * llist head pointer.
+ *
+ * Note that the iterator below assumes that a nested rmap entry is always
+ * non-zero.  This is true for our usage because the LPID field is always
+ * non-zero (zero is reserved for the host).
+ *
+ * This should be used to iterate over the list of rmap_nested entries with
+ * processing done on the u64 rmap value given by each iteration. This is safe
+ * against removal of list entries and it is always safe to call free on (pos).
+ *
+ * e.g.
+ * struct rmap_nested *cursor;
+ * struct llist_node *first;
+ * unsigned long rmap;
+ * for_each_nest_rmap_safe(cursor, first, &rmap) {
+ *     do_something(rmap);
+ *     free(cursor);
+ * }
+ */
+#define for_each_nest_rmap_safe(pos, node, rmapp)                             \
+       for ((pos) = llist_entry((node), typeof(*(pos)), list);                \
+            (node) &&                                                         \
+            (*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?     \
+                         ((u64) (node)) : ((pos)->rmap))) &&                  \
+            (((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?      \
+                        ((struct llist_node *) ((pos) = NULL)) :              \
+                        (pos)->list.next)), true);                            \
+            (pos) = llist_entry((node), typeof(*(pos)), list))
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+                                         bool create);
+void kvmhv_put_nested(struct kvm_nested_guest *gp);
+int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid);
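A hypothetical usage sketch for the lookup helpers above (illustrative only, not part of the patch): take a reference on the shadow state for the lpid the L1 guest uses, serialize against page faults and tlbies through tlb_lock as the struct comment requires, then drop the reference:

static void example_touch_nested(struct kvm *kvm, int l1_lpid)
{
	struct kvm_nested_guest *gp;

	/* don't create the nested guest if it doesn't exist yet */
	gp = kvmhv_get_nested(kvm, l1_lpid, false);
	if (!gp)
		return;
	mutex_lock(&gp->tlb_lock);
	/* ... inspect or modify gp->shadow_pgtable here ... */
	mutex_unlock(&gp->tlb_lock);
	kvmhv_put_nested(gp);	/* drop the reference taken above */
}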
+
+/* Encoding of first parameter for H_TLB_INVALIDATE */
+#define H_TLBIE_P1_ENC(ric, prs, r)    (___PPC_RIC(ric) | ___PPC_PRS(prs) | \
+                                        ___PPC_R(r))
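The call sites later in this patch suggest how the three arguments are meant to be used; the wrappers below are hypothetical and only restate those call sites: ric = 0 invalidates a single TLB entry, ric = 1 the page-walk cache, prs = 0 keeps the invalidation partition-scoped, and r = 1 marks the target as a radix partition.

static inline long h_tlb_invalidate_page(unsigned int lpid, unsigned long rb)
{
	/* invalidate one partition-scoped radix TLB entry for this lpid */
	return plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
				  lpid, rb);
}

static inline long h_tlb_invalidate_pwc(unsigned int lpid)
{
	/* invalidate the partition-scoped page-walk cache for this lpid */
	return plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
				  lpid, TLBIEL_INVAL_SET_LPID);
}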
 
 /* Power architecture requires HPT is at least 256kiB, at most 64TiB */
 #define PPC_MIN_HPT_ORDER      18
@@ -435,6 +537,7 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
 }
 
 extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
+extern void kvmhv_radix_debugfs_init(struct kvm *kvm);
 
 extern void kvmhv_rm_send_ipi(int cpu);
 
@@ -482,7 +585,7 @@ static inline u64 sanitize_msr(u64 msr)
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.cr  = vcpu->arch.cr_tm;
+       vcpu->arch.regs.ccr  = vcpu->arch.cr_tm;
        vcpu->arch.regs.xer = vcpu->arch.xer_tm;
        vcpu->arch.regs.link  = vcpu->arch.lr_tm;
        vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
@@ -499,7 +602,7 @@ static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
 
 static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.cr_tm  = vcpu->arch.cr;
+       vcpu->arch.cr_tm  = vcpu->arch.regs.ccr;
        vcpu->arch.xer_tm = vcpu->arch.regs.xer;
        vcpu->arch.lr_tm  = vcpu->arch.regs.link;
        vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
@@ -515,6 +618,17 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
+extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
+                            unsigned long gpa, unsigned int level,
+                            unsigned long mmu_seq, unsigned int lpid,
+                            unsigned long *rmapp, struct rmap_nested **n_rmap);
+extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
+                                  struct rmap_nested **n_rmap);
+extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
+                               struct kvm_memory_slot *memslot,
+                               unsigned long gpa, unsigned long hpa,
+                               unsigned long nbytes);
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
index d978fdf..eb3ba63 100644 (file)
@@ -25,6 +25,9 @@
 #define XICS_MFRR              0xc
 #define XICS_IPI               2       /* interrupt source # for IPIs */
 
+/* LPIDs we support with this build -- runtime limit may be lower */
+#define KVMPPC_NR_LPIDS                        (LPID_RSVD + 1)
+
 /* Maximum number of threads per physical core */
 #define MAX_SMT_THREADS                8
 
index d513e3e..f0cef62 100644 (file)
@@ -46,12 +46,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
 
 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
 {
-       vcpu->arch.cr = val;
+       vcpu->arch.regs.ccr = val;
 }
 
 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 {
-       return vcpu->arch.cr;
+       return vcpu->arch.regs.ccr;
 }
 
 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
index 906bcbd..fac6f63 100644 (file)
@@ -46,6 +46,7 @@
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #include <asm/kvm_book3s_asm.h>                /* for MAX_SMT_THREADS */
 #define KVM_MAX_VCPU_ID                (MAX_SMT_THREADS * KVM_MAX_VCORES)
+#define KVM_MAX_NESTED_GUESTS  KVMPPC_NR_LPIDS
 
 #else
 #define KVM_MAX_VCPU_ID                KVM_MAX_VCPUS
@@ -94,6 +95,7 @@ struct dtl_entry;
 
 struct kvmppc_vcpu_book3s;
 struct kvmppc_book3s_shadow_vcpu;
+struct kvm_nested_guest;
 
 struct kvm_vm_stat {
        ulong remote_tlb_flush;
@@ -287,10 +289,12 @@ struct kvm_arch {
        u8 radix;
        u8 fwnmi_enabled;
        bool threads_indep;
+       bool nested_enable;
        pgd_t *pgtable;
        u64 process_table;
        struct dentry *debugfs_dir;
        struct dentry *htab_dentry;
+       struct dentry *radix_dentry;
        struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
@@ -311,6 +315,9 @@ struct kvm_arch {
 #endif
        struct kvmppc_ops *kvm_ops;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+       u64 l1_ptcr;
+       int max_nested_lpid;
+       struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
        /* This array can grow quite large, keep it at the end */
        struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 #endif
@@ -360,7 +367,9 @@ struct kvmppc_pte {
        bool may_write          : 1;
        bool may_execute        : 1;
        unsigned long wimg;
+       unsigned long rc;
        u8 page_size;           /* MMU_PAGE_xxx */
+       u8 page_shift;
 };
 
 struct kvmppc_mmu {
@@ -537,8 +546,6 @@ struct kvm_vcpu_arch {
        ulong tar;
 #endif
 
-       u32 cr;
-
 #ifdef CONFIG_PPC_BOOK3S
        ulong hflags;
        ulong guest_owned_ext;
@@ -707,6 +714,7 @@ struct kvm_vcpu_arch {
        u8 hcall_needed;
        u8 epr_flags; /* KVMPPC_EPR_xxx */
        u8 epr_needed;
+       u8 external_oneshot;    /* clear external irq after delivery */
 
        u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
@@ -781,6 +789,10 @@ struct kvm_vcpu_arch {
        u32 emul_inst;
 
        u32 online;
+
+       /* For support of nested guests */
+       struct kvm_nested_guest *nested;
+       u32 nested_vcpu_id;
 #endif
 
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
index e991821..9b89b19 100644 (file)
@@ -194,9 +194,7 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
                (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \
                                (stt)->size, (ioba), (npages)) ?        \
                                H_PARAMETER : H_SUCCESS)
-extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
-               unsigned long tce);
-extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
                unsigned long *ua, unsigned long **prmap);
 extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
                unsigned long idx, unsigned long tce);
@@ -327,6 +325,7 @@ struct kvmppc_ops {
        int (*set_smt_mode)(struct kvm *kvm, unsigned long mode,
                            unsigned long flags);
        void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr);
+       int (*enable_nested)(struct kvm *kvm);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
@@ -585,6 +584,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 
 extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
                               int level, bool line_status);
+extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
 #else
 static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
                                       u32 priority) { return -1; }
@@ -607,6 +607,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur
 
 static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
                                      int level, bool line_status) { return -ENODEV; }
+static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
 #endif /* CONFIG_KVM_XIVE */
 
 /*
@@ -652,6 +653,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
                     unsigned long mfrr);
 int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
 int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
+void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu);
 
 /*
  * Host-side operations we want to set up while running in real
index 665af14..6093bc8 100644 (file)
 #define OP_31_XOP_LHZUX     311
 #define OP_31_XOP_MSGSNDP   142
 #define OP_31_XOP_MSGCLRP   174
+#define OP_31_XOP_TLBIE     306
 #define OP_31_XOP_MFSPR     339
 #define OP_31_XOP_LWAX      341
 #define OP_31_XOP_LHAX      343
index e5b314e..c906989 100644 (file)
 #define   HFSCR_DSCR   __MASK(FSCR_DSCR_LG)
 #define   HFSCR_VECVSX __MASK(FSCR_VECVSX_LG)
 #define   HFSCR_FP     __MASK(FSCR_FP_LG)
+#define   HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56)     /* interrupt cause */
 #define SPRN_TAR       0x32f   /* Target Address Register */
 #define SPRN_LPCR      0x13E   /* LPAR Control Register */
 #define   LPCR_VPM0            ASM_CONST(0x8000000000000000)
 #define SPRN_HSRR0     0x13A   /* Save/Restore Register 0 */
 #define SPRN_HSRR1     0x13B   /* Save/Restore Register 1 */
 #define   HSRR1_DENORM         0x00100000 /* Denorm exception */
+#define   HSRR1_HISI_WRITE     0x00010000 /* HISI because mem couldn't be updated */
 
 #define SPRN_TBCTL     0x35f   /* PA6T Timebase control register */
 #define   TBCTL_FREEZE         0x0000000000000000ull /* Freeze all tbs */
index 1b32b56..8c876c1 100644 (file)
@@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char {
 
 #define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
 #define KVM_REG_PPC_ONLINE     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf)
+#define KVM_REG_PPC_PTCR       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0)
 
 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
index 041a115..d68b9ef 100644 (file)
@@ -438,7 +438,7 @@ int main(void)
 #ifdef CONFIG_PPC_BOOK3S
        OFFSET(VCPU_TAR, kvm_vcpu, arch.tar);
 #endif
-       OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
+       OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
        OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip);
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr);
@@ -503,6 +503,7 @@ int main(void)
        OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
        OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty);
        OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst);
+       OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested);
        OFFSET(VCPU_CPU, kvm_vcpu, cpu);
        OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu);
 #endif
@@ -695,7 +696,7 @@ int main(void)
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #else /* CONFIG_PPC_BOOK3S */
-       OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
+       OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
        OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer);
        OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link);
        OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr);
index 458b928..c317080 100644 (file)
@@ -147,8 +147,8 @@ __init_hvmode_206:
        rldicl. r0,r3,4,63
        bnelr
        ld      r5,CPU_SPEC_FEATURES(r4)
-       LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
-       xor     r5,r5,r6
+       LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST)
+       andc    r5,r5,r6
        std     r5,CPU_SPEC_FEATURES(r4)
        blr
 
index f872c04..e814f40 100644 (file)
@@ -75,7 +75,8 @@ kvm-hv-y += \
        book3s_hv.o \
        book3s_hv_interrupts.o \
        book3s_64_mmu_hv.o \
-       book3s_64_mmu_radix.o
+       book3s_64_mmu_radix.o \
+       book3s_hv_nested.o
 
 kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
        book3s_hv_tm.o
index 87348e4..fd9893b 100644 (file)
@@ -78,8 +78,11 @@ void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu)
 {
        if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) {
                ulong pc = kvmppc_get_pc(vcpu);
+               ulong lr = kvmppc_get_lr(vcpu);
                if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
                        kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK);
+               if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
+                       kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK);
                vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK;
        }
 }
@@ -150,7 +153,6 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
        case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE;         break;
        case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT;         break;
        case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL;             break;
-       case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL;       break;
        case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT;            break;
        case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM;              break;
        case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL;           break;
@@ -236,18 +238,35 @@ EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec);
 void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                 struct kvm_interrupt *irq)
 {
-       unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL;
-
-       if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
-               vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL;
+       /*
+        * This case (KVM_INTERRUPT_SET) should never actually arise for
+        * a pseries guest (because pseries guests expect their interrupt
+        * controllers to continue asserting an external interrupt request
+        * until it is acknowledged at the interrupt controller), but is
+        * included to avoid ABI breakage and potentially for other
+        * sorts of guest.
+        *
+        * There is a subtlety here: HV KVM does not test the
+        * external_oneshot flag in the code that synthesizes
+        * external interrupts for the guest just before entering
+        * the guest.  That is OK even if userspace did do a
+        * KVM_INTERRUPT_SET on a pseries guest vcpu, because the
+        * caller (kvm_vcpu_ioctl_interrupt) does a kvm_vcpu_kick()
+        * which ends up doing a smp_send_reschedule(), which will
+        * pull the guest all the way out to the host, meaning that
+        * we will call kvmppc_core_prepare_to_enter() before entering
+        * the guest again, and that will handle the external_oneshot
+        * flag correctly.
+        */
+       if (irq->irq == KVM_INTERRUPT_SET)
+               vcpu->arch.external_oneshot = 1;
 
-       kvmppc_book3s_queue_irqprio(vcpu, vec);
+       kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 }
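For context, the KVM_INTERRUPT_SET case discussed in the comment corresponds to a plain KVM_INTERRUPT ioctl from userspace; a hedged userspace sketch (error handling kept minimal, vcpu_fd is assumed to be an already-created vcpu file descriptor):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Assert an external interrupt on a vcpu.  With KVM_INTERRUPT_SET the
 * interrupt is delivered once and then cleared via external_oneshot;
 * KVM_INTERRUPT_SET_LEVEL leaves it pending until userspace clears it. */
static void assert_external_irq(int vcpu_fd)
{
	struct kvm_interrupt irq = { .irq = KVM_INTERRUPT_SET };

	if (ioctl(vcpu_fd, KVM_INTERRUPT, &irq) < 0)
		perror("KVM_INTERRUPT");
}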
 
 void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
 {
        kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
-       kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
 }
 
 void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
@@ -278,7 +297,6 @@ static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu,
                vec = BOOK3S_INTERRUPT_DECREMENTER;
                break;
        case BOOK3S_IRQPRIO_EXTERNAL:
-       case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
                deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
                vec = BOOK3S_INTERRUPT_EXTERNAL;
                break;
@@ -352,8 +370,16 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
                case BOOK3S_IRQPRIO_DECREMENTER:
                        /* DEC interrupts get cleared by mtdec */
                        return false;
-               case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
-                       /* External interrupts get cleared by userspace */
+               case BOOK3S_IRQPRIO_EXTERNAL:
+                       /*
+                        * External interrupts get cleared by userspace
+                        * except when set by the KVM_INTERRUPT ioctl with
+                        * KVM_INTERRUPT_SET (not KVM_INTERRUPT_SET_LEVEL).
+                        */
+                       if (vcpu->arch.external_oneshot) {
+                               vcpu->arch.external_oneshot = 0;
+                               return true;
+                       }
                        return false;
        }
 
index 68e14af..c615617 100644 (file)
@@ -268,14 +268,13 @@ int kvmppc_mmu_hv_init(void)
 {
        unsigned long host_lpid, rsvd_lpid;
 
-       if (!cpu_has_feature(CPU_FTR_HVMODE))
-               return -EINVAL;
-
        if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
                return -EINVAL;
 
        /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
-       host_lpid = mfspr(SPRN_LPID);
+       host_lpid = 0;
+       if (cpu_has_feature(CPU_FTR_HVMODE))
+               host_lpid = mfspr(SPRN_LPID);
        rsvd_lpid = LPID_RSVD;
 
        kvmppc_init_lpid(rsvd_lpid + 1);
index 998f8d0..d68162e 100644 (file)
@@ -10,6 +10,9 @@
 #include <linux/string.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/debugfs.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
  */
 static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
 
-int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
-                          struct kvmppc_pte *gpte, bool data, bool iswrite)
+int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
+                              struct kvmppc_pte *gpte, u64 root,
+                              u64 *pte_ret_p)
 {
        struct kvm *kvm = vcpu->kvm;
-       u32 pid;
        int ret, level, ps;
-       __be64 prte, rpte;
-       unsigned long ptbl;
-       unsigned long root, pte, index;
-       unsigned long rts, bits, offset;
-       unsigned long gpa;
-       unsigned long proc_tbl_size;
-
-       /* Work out effective PID */
-       switch (eaddr >> 62) {
-       case 0:
-               pid = vcpu->arch.pid;
-               break;
-       case 3:
-               pid = 0;
-               break;
-       default:
-               return -EINVAL;
-       }
-       proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
-       if (pid * 16 >= proc_tbl_size)
-               return -EINVAL;
-
-       /* Read partition table to find root of tree for effective PID */
-       ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16);
-       ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));
-       if (ret)
-               return ret;
+       unsigned long rts, bits, offset, index;
+       u64 pte, base, gpa;
+       __be64 rpte;
 
-       root = be64_to_cpu(prte);
        rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
                ((root & RTS2_MASK) >> RTS2_SHIFT);
        bits = root & RPDS_MASK;
-       root = root & RPDB_MASK;
+       base = root & RPDB_MASK;
 
        offset = rts + 31;
 
-       /* current implementations only support 52-bit space */
+       /* Current implementations only support 52-bit space */
        if (offset != 52)
                return -EINVAL;
 
+       /* Walk each level of the radix tree */
        for (level = 3; level >= 0; --level) {
+               u64 addr;
+               /* Check a valid size */
                if (level && bits != p9_supported_radix_bits[level])
                        return -EINVAL;
                if (level == 0 && !(bits == 5 || bits == 9))
                        return -EINVAL;
                offset -= bits;
                index = (eaddr >> offset) & ((1UL << bits) - 1);
-               /* check that low bits of page table base are zero */
-               if (root & ((1UL << (bits + 3)) - 1))
+               /* Check that low bits of page table base are zero */
+               if (base & ((1UL << (bits + 3)) - 1))
                        return -EINVAL;
-               ret = kvm_read_guest(kvm, root + index * 8,
-                                    &rpte, sizeof(rpte));
-               if (ret)
+               /* Read the entry from guest memory */
+               addr = base + (index * sizeof(rpte));
+               ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
+               if (ret) {
+                       if (pte_ret_p)
+                               *pte_ret_p = addr;
                        return ret;
+               }
                pte = __be64_to_cpu(rpte);
                if (!(pte & _PAGE_PRESENT))
                        return -ENOENT;
+               /* Check if a leaf entry */
                if (pte & _PAGE_PTE)
                        break;
-               bits = pte & 0x1f;
-               root = pte & 0x0fffffffffffff00ul;
+               /* Get ready to walk the next level */
+               base = pte & RPDB_MASK;
+               bits = pte & RPDS_MASK;
        }
-       /* need a leaf at lowest level; 512GB pages not supported */
+
+       /* Need a leaf at lowest level; 512GB pages not supported */
        if (level < 0 || level == 3)
                return -EINVAL;
 
-       /* offset is now log base 2 of the page size */
+       /* We found a valid leaf PTE */
+       /* Offset is now log base 2 of the page size */
        gpa = pte & 0x01fffffffffff000ul;
        if (gpa & ((1ul << offset) - 1))
                return -EINVAL;
-       gpa += eaddr & ((1ul << offset) - 1);
+       gpa |= eaddr & ((1ul << offset) - 1);
        for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
                if (offset == mmu_psize_defs[ps].shift)
                        break;
        gpte->page_size = ps;
+       gpte->page_shift = offset;
 
        gpte->eaddr = eaddr;
        gpte->raddr = gpa;
@@ -115,6 +105,77 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
        gpte->may_read = !!(pte & _PAGE_READ);
        gpte->may_write = !!(pte & _PAGE_WRITE);
        gpte->may_execute = !!(pte & _PAGE_EXEC);
+
+       gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
+
+       if (pte_ret_p)
+               *pte_ret_p = pte;
+
+       return 0;
+}
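As a worked example of the offset arithmetic in the walker above (illustrative only): with the supported 52-bit space, offset starts at rts + 31 = 52 and each level subtracts its index size, so the leaf page shift falls out of where the walk stops.

/*
 * Bookkeeping sketch mirroring the loop above for the supported POWER9
 * geometry (13/9/9 bits at the upper levels, 9 or 5 bits at level 0).
 */
static inline void example_radix_offsets(void)
{
	int offset = 52;	/* rts = 21, so rts + 31 = 52 */

	offset -= 13;		/* level 3: 39 (a 512GB leaf here is rejected) */
	offset -= 9;		/* level 2: 30, a leaf here is a 1GB page */
	offset -= 9;		/* level 1: 21, a leaf here is a 2MB page */
	offset -= 9;		/* level 0 with 9 bits: 12, i.e. 4K pages */
	/* with 5 bits at level 0 instead: 21 - 5 = 16, i.e. 64K pages */
	(void)offset;
}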
+
+/*
+ * Used to walk a partition or process table radix tree in guest memory
+ * Note: We exploit the fact that a partition table and a process
+ * table have the same layout, a partition-scoped page table and a
+ * process-scoped page table have the same layout, and the 2nd
+ * doubleword of a partition table entry has the same layout as
+ * the PTCR register.
+ */
+int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                    struct kvmppc_pte *gpte, u64 table,
+                                    int table_index, u64 *pte_ret_p)
+{
+       struct kvm *kvm = vcpu->kvm;
+       int ret;
+       unsigned long size, ptbl, root;
+       struct prtb_entry entry;
+
+       if ((table & PRTS_MASK) > 24)
+               return -EINVAL;
+       size = 1ul << ((table & PRTS_MASK) + 12);
+
+       /* Is the table big enough to contain this entry? */
+       if ((table_index * sizeof(entry)) >= size)
+               return -EINVAL;
+
+       /* Read the table to find the root of the radix tree */
+       ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
+       ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
+       if (ret)
+               return ret;
+
+       /* Root is stored in the first double word */
+       root = be64_to_cpu(entry.prtb0);
+
+       return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
+}
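Because a partition table entry shares the layout described in the comment above, the same helper presumably also serves partition-scoped lookups for nested guests; a hypothetical call (not taken from this patch) using the l1_ptcr field introduced in kvm_arch might look like:

static int example_find_l2_partition_root(struct kvm_vcpu *vcpu, gva_t ea,
					  struct kvmppc_pte *gpte, int l1_lpid)
{
	u64 pte_ret;

	/* walk the tree rooted at L1's partition table entry for l1_lpid */
	return kvmppc_mmu_radix_translate_table(vcpu, ea, gpte,
						vcpu->kvm->arch.l1_ptcr,
						l1_lpid, &pte_ret);
}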
+
+int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+                          struct kvmppc_pte *gpte, bool data, bool iswrite)
+{
+       u32 pid;
+       u64 pte;
+       int ret;
+
+       /* Work out effective PID */
+       switch (eaddr >> 62) {
+       case 0:
+               pid = vcpu->arch.pid;
+               break;
+       case 3:
+               pid = 0;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
+                               vcpu->kvm->arch.process_table, pid, &pte);
+       if (ret)
+               return ret;
+
+       /* Check privilege (applies only to process scoped translations) */
        if (kvmppc_get_msr(vcpu) & MSR_PR) {
                if (pte & _PAGE_PRIVILEGED) {
                        gpte->may_read = 0;
@@ -137,20 +198,46 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 }
 
 static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
-                                   unsigned int pshift)
+                                   unsigned int pshift, unsigned int lpid)
 {
        unsigned long psize = PAGE_SIZE;
+       int psi;
+       long rc;
+       unsigned long rb;
 
        if (pshift)
                psize = 1UL << pshift;
+       else
+               pshift = PAGE_SHIFT;
 
        addr &= ~(psize - 1);
-       radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize);
+
+       if (!kvmhv_on_pseries()) {
+               radix__flush_tlb_lpid_page(lpid, addr, psize);
+               return;
+       }
+
+       psi = shift_to_mmu_psize(pshift);
+       rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
+       rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
+                               lpid, rb);
+       if (rc)
+               pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
 }
 
-static void kvmppc_radix_flush_pwc(struct kvm *kvm)
+static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
 {
-       radix__flush_pwc_lpid(kvm->arch.lpid);
+       long rc;
+
+       if (!kvmhv_on_pseries()) {
+               radix__flush_pwc_lpid(lpid);
+               return;
+       }
+
+       rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
+                               lpid, TLBIEL_INVAL_SET_LPID);
+       if (rc)
+               pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
 }
 
 static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
@@ -195,23 +282,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
        kmem_cache_free(kvm_pmd_cache, pmdp);
 }
 
-static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
-                            unsigned long gpa, unsigned int shift)
+/* Called with kvm->mmu_lock held */
+void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
+                     unsigned int shift, struct kvm_memory_slot *memslot,
+                     unsigned int lpid)
 
 {
-       unsigned long page_size = 1ul << shift;
        unsigned long old;
+       unsigned long gfn = gpa >> PAGE_SHIFT;
+       unsigned long page_size = PAGE_SIZE;
+       unsigned long hpa;
 
        old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
-       kvmppc_radix_tlbie_page(kvm, gpa, shift);
-       if (old & _PAGE_DIRTY) {
-               unsigned long gfn = gpa >> PAGE_SHIFT;
-               struct kvm_memory_slot *memslot;
+       kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
+
+       /* The following only applies to L1 entries */
+       if (lpid != kvm->arch.lpid)
+               return;
 
+       if (!memslot) {
                memslot = gfn_to_memslot(kvm, gfn);
-               if (memslot && memslot->dirty_bitmap)
-                       kvmppc_update_dirty_map(memslot, gfn, page_size);
+               if (!memslot)
+                       return;
        }
+       if (shift)
+               page_size = 1ul << shift;
+
+       gpa &= ~(page_size - 1);
+       hpa = old & PTE_RPN_MASK;
+       kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
+
+       if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
+               kvmppc_update_dirty_map(memslot, gfn, page_size);
 }
 
 /*
@@ -224,7 +326,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
  * and emit a warning if encountered, but there may already be data
  * corruption due to the unexpected mappings.
  */
-static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
+static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
+                                 unsigned int lpid)
 {
        if (full) {
                memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
@@ -238,14 +341,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
                        WARN_ON_ONCE(1);
                        kvmppc_unmap_pte(kvm, p,
                                         pte_pfn(*p) << PAGE_SHIFT,
-                                        PAGE_SHIFT);
+                                        PAGE_SHIFT, NULL, lpid);
                }
        }
 
        kvmppc_pte_free(pte);
 }
 
-static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
+static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
+                                 unsigned int lpid)
 {
        unsigned long im;
        pmd_t *p = pmd;
@@ -260,20 +364,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
                                WARN_ON_ONCE(1);
                                kvmppc_unmap_pte(kvm, (pte_t *)p,
                                         pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
-                                        PMD_SHIFT);
+                                        PMD_SHIFT, NULL, lpid);
                        }
                } else {
                        pte_t *pte;
 
                        pte = pte_offset_map(p, 0);
-                       kvmppc_unmap_free_pte(kvm, pte, full);
+                       kvmppc_unmap_free_pte(kvm, pte, full, lpid);
                        pmd_clear(p);
                }
        }
        kvmppc_pmd_free(pmd);
 }
 
-static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
+static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
+                                 unsigned int lpid)
 {
        unsigned long iu;
        pud_t *p = pud;
@@ -287,36 +392,40 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
                        pmd_t *pmd;
 
                        pmd = pmd_offset(p, 0);
-                       kvmppc_unmap_free_pmd(kvm, pmd, true);
+                       kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
                        pud_clear(p);
                }
        }
        pud_free(kvm->mm, pud);
 }
 
-void kvmppc_free_radix(struct kvm *kvm)
+void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
 {
        unsigned long ig;
-       pgd_t *pgd;
 
-       if (!kvm->arch.pgtable)
-               return;
-       pgd = kvm->arch.pgtable;
        for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
                pud_t *pud;
 
                if (!pgd_present(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);
-               kvmppc_unmap_free_pud(kvm, pud);
+               kvmppc_unmap_free_pud(kvm, pud, lpid);
                pgd_clear(pgd);
        }
-       pgd_free(kvm->mm, kvm->arch.pgtable);
-       kvm->arch.pgtable = NULL;
+}
+
+void kvmppc_free_radix(struct kvm *kvm)
+{
+       if (kvm->arch.pgtable) {
+               kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
+                                         kvm->arch.lpid);
+               pgd_free(kvm->mm, kvm->arch.pgtable);
+               kvm->arch.pgtable = NULL;
+       }
 }
 
 static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
-                                             unsigned long gpa)
+                                       unsigned long gpa, unsigned int lpid)
 {
        pte_t *pte = pte_offset_kernel(pmd, 0);
 
@@ -326,13 +435,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
         * flushing the PWC again.
         */
        pmd_clear(pmd);
-       kvmppc_radix_flush_pwc(kvm);
+       kvmppc_radix_flush_pwc(kvm, lpid);
 
-       kvmppc_unmap_free_pte(kvm, pte, false);
+       kvmppc_unmap_free_pte(kvm, pte, false, lpid);
 }
 
 static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
-                                       unsigned long gpa)
+                                       unsigned long gpa, unsigned int lpid)
 {
        pmd_t *pmd = pmd_offset(pud, 0);
 
@@ -342,9 +451,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
         * so can be freed without flushing the PWC again.
         */
        pud_clear(pud);
-       kvmppc_radix_flush_pwc(kvm);
+       kvmppc_radix_flush_pwc(kvm, lpid);
 
-       kvmppc_unmap_free_pmd(kvm, pmd, false);
+       kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
 }
 
 /*
@@ -356,8 +465,10 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
  */
 #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
 
-static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
-                            unsigned int level, unsigned long mmu_seq)
+int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
+                     unsigned long gpa, unsigned int level,
+                     unsigned long mmu_seq, unsigned int lpid,
+                     unsigned long *rmapp, struct rmap_nested **n_rmap)
 {
        pgd_t *pgd;
        pud_t *pud, *new_pud = NULL;
@@ -366,7 +477,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
        int ret;
 
        /* Traverse the guest's 2nd-level tree, allocate new levels needed */
-       pgd = kvm->arch.pgtable + pgd_index(gpa);
+       pgd = pgtable + pgd_index(gpa);
        pud = NULL;
        if (pgd_present(*pgd))
                pud = pud_offset(pgd, gpa);
@@ -423,7 +534,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                        goto out_unlock;
                }
                /* Valid 1GB page here already, remove it */
-               kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT);
+               kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
+                                lpid);
        }
        if (level == 2) {
                if (!pud_none(*pud)) {
@@ -432,9 +544,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                         * install a large page, so remove and free the page
                         * table page.
                         */
-                       kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa);
+                       kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
+               if (rmapp && n_rmap)
+                       kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
                ret = 0;
                goto out_unlock;
        }
@@ -458,7 +572,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                        WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
                                                        PTE_BITS_MUST_MATCH);
                        kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
-                                             0, pte_val(pte), lgpa, PMD_SHIFT);
+                                       0, pte_val(pte), lgpa, PMD_SHIFT);
                        ret = 0;
                        goto out_unlock;
                }
@@ -472,7 +586,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                        goto out_unlock;
                }
                /* Valid 2MB page here already, remove it */
-               kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT);
+               kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
+                                lpid);
        }
        if (level == 1) {
                if (!pmd_none(*pmd)) {
@@ -481,9 +596,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                         * install a large page, so remove and free the page
                         * table page.
                         */
-                       kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa);
+                       kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+               if (rmapp && n_rmap)
+                       kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
                ret = 0;
                goto out_unlock;
        }
@@ -508,6 +625,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                goto out_unlock;
        }
        kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
+       if (rmapp && n_rmap)
+               kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
        ret = 0;
 
  out_unlock:
@@ -521,95 +640,49 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
        return ret;
 }
 
-int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                                  unsigned long ea, unsigned long dsisr)
+bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
+                            unsigned long gpa, unsigned int lpid)
+{
+       unsigned long pgflags;
+       unsigned int shift;
+       pte_t *ptep;
+
+       /*
+        * Need to set an R or C bit in the 2nd-level tables;
+        * since we are just helping out the hardware here,
+        * it is sufficient to do what the hardware does.
+        */
+       pgflags = _PAGE_ACCESSED;
+       if (writing)
+               pgflags |= _PAGE_DIRTY;
+       /*
+        * We are walking the secondary (partition-scoped) page table here.
+        * We can do this without disabling irq because the Linux MM
+        * subsystem doesn't do THP splits and collapses on this tree.
+        */
+       ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
+       if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
+               kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
+               return true;
+       }
+       return false;
+}
+
+int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
+                                  unsigned long gpa,
+                                  struct kvm_memory_slot *memslot,
+                                  bool writing, bool kvm_ro,
+                                  pte_t *inserted_pte, unsigned int *levelp)
 {
        struct kvm *kvm = vcpu->kvm;
-       unsigned long mmu_seq;
-       unsigned long gpa, gfn, hva;
-       struct kvm_memory_slot *memslot;
        struct page *page = NULL;
-       long ret;
-       bool writing;
+       unsigned long mmu_seq;
+       unsigned long hva, gfn = gpa >> PAGE_SHIFT;
        bool upgrade_write = false;
        bool *upgrade_p = &upgrade_write;
        pte_t pte, *ptep;
-       unsigned long pgflags;
        unsigned int shift, level;
-
-       /* Check for unusual errors */
-       if (dsisr & DSISR_UNSUPP_MMU) {
-               pr_err("KVM: Got unsupported MMU fault\n");
-               return -EFAULT;
-       }
-       if (dsisr & DSISR_BADACCESS) {
-               /* Reflect to the guest as DSI */
-               pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
-               kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
-               return RESUME_GUEST;
-       }
-
-       /* Translate the logical address and get the page */
-       gpa = vcpu->arch.fault_gpa & ~0xfffUL;
-       gpa &= ~0xF000000000000000ul;
-       gfn = gpa >> PAGE_SHIFT;
-       if (!(dsisr & DSISR_PRTABLE_FAULT))
-               gpa |= ea & 0xfff;
-       memslot = gfn_to_memslot(kvm, gfn);
-
-       /* No memslot means it's an emulated MMIO region */
-       if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
-               if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
-                            DSISR_SET_RC)) {
-                       /*
-                        * Bad address in guest page table tree, or other
-                        * unusual error - reflect it to the guest as DSI.
-                        */
-                       kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
-                       return RESUME_GUEST;
-               }
-               return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
-                                             dsisr & DSISR_ISSTORE);
-       }
-
-       writing = (dsisr & DSISR_ISSTORE) != 0;
-       if (memslot->flags & KVM_MEM_READONLY) {
-               if (writing) {
-                       /* give the guest a DSI */
-                       dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
-                       kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
-                       return RESUME_GUEST;
-               }
-               upgrade_p = NULL;
-       }
-
-       if (dsisr & DSISR_SET_RC) {
-               /*
-                * Need to set an R or C bit in the 2nd-level tables;
-                * since we are just helping out the hardware here,
-                * it is sufficient to do what the hardware does.
-                */
-               pgflags = _PAGE_ACCESSED;
-               if (writing)
-                       pgflags |= _PAGE_DIRTY;
-               /*
-                * We are walking the secondary page table here. We can do this
-                * without disabling irq.
-                */
-               spin_lock(&kvm->mmu_lock);
-               ptep = __find_linux_pte(kvm->arch.pgtable,
-                                       gpa, NULL, &shift);
-               if (ptep && pte_present(*ptep) &&
-                   (!writing || pte_write(*ptep))) {
-                       kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
-                                               gpa, shift);
-                       dsisr &= ~DSISR_SET_RC;
-               }
-               spin_unlock(&kvm->mmu_lock);
-               if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
-                              DSISR_PROTFAULT | DSISR_SET_RC)))
-                       return RESUME_GUEST;
-       }
+       int ret;
 
        /* used to check for invalidations in progress */
        mmu_seq = kvm->mmu_notifier_seq;
@@ -622,7 +695,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
         * is that the page is writable.
         */
        hva = gfn_to_hva_memslot(memslot, gfn);
-       if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
+       if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
                upgrade_write = true;
        } else {
                unsigned long pfn;
@@ -690,7 +763,12 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        }
 
        /* Allocate space in the tree and write the PTE */
-       ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+       ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
+                               mmu_seq, kvm->arch.lpid, NULL, NULL);
+       if (inserted_pte)
+               *inserted_pte = pte;
+       if (levelp)
+               *levelp = level;
 
        if (page) {
                if (!ret && (pte_val(pte) & _PAGE_WRITE))
@@ -698,6 +776,82 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                put_page(page);
        }
 
+       return ret;
+}
+
+int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                                  unsigned long ea, unsigned long dsisr)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long gpa, gfn;
+       struct kvm_memory_slot *memslot;
+       long ret;
+       bool writing = !!(dsisr & DSISR_ISSTORE);
+       bool kvm_ro = false;
+
+       /* Check for unusual errors */
+       if (dsisr & DSISR_UNSUPP_MMU) {
+               pr_err("KVM: Got unsupported MMU fault\n");
+               return -EFAULT;
+       }
+       if (dsisr & DSISR_BADACCESS) {
+               /* Reflect to the guest as DSI */
+               pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
+               kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+               return RESUME_GUEST;
+       }
+
+       /* Translate the logical address */
+       gpa = vcpu->arch.fault_gpa & ~0xfffUL;
+       gpa &= ~0xF000000000000000ul;
+       gfn = gpa >> PAGE_SHIFT;
+       if (!(dsisr & DSISR_PRTABLE_FAULT))
+               gpa |= ea & 0xfff;
+
+       /* Get the corresponding memslot */
+       memslot = gfn_to_memslot(kvm, gfn);
+
+       /* No memslot means it's an emulated MMIO region */
+       if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+               if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
+                            DSISR_SET_RC)) {
+                       /*
+                        * Bad address in guest page table tree, or other
+                        * unusual error - reflect it to the guest as DSI.
+                        */
+                       kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+                       return RESUME_GUEST;
+               }
+               return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
+       }
+
+       if (memslot->flags & KVM_MEM_READONLY) {
+               if (writing) {
+                       /* give the guest a DSI */
+                       kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
+                                                      DSISR_PROTFAULT);
+                       return RESUME_GUEST;
+               }
+               kvm_ro = true;
+       }
+
+       /* Failed to set the reference/change bits */
+       if (dsisr & DSISR_SET_RC) {
+               spin_lock(&kvm->mmu_lock);
+               if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
+                                           writing, gpa, kvm->arch.lpid))
+                       dsisr &= ~DSISR_SET_RC;
+               spin_unlock(&kvm->mmu_lock);
+
+               if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
+                              DSISR_PROTFAULT | DSISR_SET_RC)))
+                       return RESUME_GUEST;
+       }
+
+       /* Try to insert a pte */
+       ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
+                                            kvm_ro, NULL, NULL);
+
        if (ret == 0 || ret == -EAGAIN)
                ret = RESUME_GUEST;
        return ret;
@@ -710,20 +864,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
-       unsigned long old;
 
        ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
-       if (ptep && pte_present(*ptep)) {
-               old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0,
-                                             gpa, shift);
-               kvmppc_radix_tlbie_page(kvm, gpa, shift);
-               if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
-                       unsigned long psize = PAGE_SIZE;
-                       if (shift)
-                               psize = 1ul << shift;
-                       kvmppc_update_dirty_map(memslot, gfn, psize);
-               }
-       }
+       if (ptep && pte_present(*ptep))
+               kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
+                                kvm->arch.lpid);
        return 0;                               
 }
 
@@ -778,7 +923,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
                        ret = 1 << (shift - PAGE_SHIFT);
                kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
                                        gpa, shift);
-               kvmppc_radix_tlbie_page(kvm, gpa, shift);
+               kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
        }
        return ret;
 }
@@ -863,6 +1008,215 @@ static void pmd_ctor(void *addr)
        memset(addr, 0, RADIX_PMD_TABLE_SIZE);
 }
 
+struct debugfs_radix_state {
+       struct kvm      *kvm;
+       struct mutex    mutex;
+       unsigned long   gpa;
+       int             lpid;
+       int             chars_left;
+       int             buf_index;
+       char            buf[128];
+       u8              hdr;
+};
+
+static int debugfs_radix_open(struct inode *inode, struct file *file)
+{
+       struct kvm *kvm = inode->i_private;
+       struct debugfs_radix_state *p;
+
+       p = kzalloc(sizeof(*p), GFP_KERNEL);
+       if (!p)
+               return -ENOMEM;
+
+       kvm_get_kvm(kvm);
+       p->kvm = kvm;
+       mutex_init(&p->mutex);
+       file->private_data = p;
+
+       return nonseekable_open(inode, file);
+}
+
+static int debugfs_radix_release(struct inode *inode, struct file *file)
+{
+       struct debugfs_radix_state *p = file->private_data;
+
+       kvm_put_kvm(p->kvm);
+       kfree(p);
+       return 0;
+}
+
+static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
+                                size_t len, loff_t *ppos)
+{
+       struct debugfs_radix_state *p = file->private_data;
+       ssize_t ret, r;
+       unsigned long n;
+       struct kvm *kvm;
+       unsigned long gpa;
+       pgd_t *pgt;
+       struct kvm_nested_guest *nested;
+       pgd_t pgd, *pgdp;
+       pud_t pud, *pudp;
+       pmd_t pmd, *pmdp;
+       pte_t *ptep;
+       int shift;
+       unsigned long pte;
+
+       kvm = p->kvm;
+       if (!kvm_is_radix(kvm))
+               return 0;
+
+       ret = mutex_lock_interruptible(&p->mutex);
+       if (ret)
+               return ret;
+
+       if (p->chars_left) {
+               n = p->chars_left;
+               if (n > len)
+                       n = len;
+               r = copy_to_user(buf, p->buf + p->buf_index, n);
+               n -= r;
+               p->chars_left -= n;
+               p->buf_index += n;
+               buf += n;
+               len -= n;
+               ret = n;
+               if (r) {
+                       if (!n)
+                               ret = -EFAULT;
+                       goto out;
+               }
+       }
+
+       gpa = p->gpa;
+       nested = NULL;
+       pgt = NULL;
+       while (len != 0 && p->lpid >= 0) {
+               if (gpa >= RADIX_PGTABLE_RANGE) {
+                       gpa = 0;
+                       pgt = NULL;
+                       if (nested) {
+                               kvmhv_put_nested(nested);
+                               nested = NULL;
+                       }
+                       p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
+                       p->hdr = 0;
+                       if (p->lpid < 0)
+                               break;
+               }
+               if (!pgt) {
+                       if (p->lpid == 0) {
+                               pgt = kvm->arch.pgtable;
+                       } else {
+                               nested = kvmhv_get_nested(kvm, p->lpid, false);
+                               if (!nested) {
+                                       gpa = RADIX_PGTABLE_RANGE;
+                                       continue;
+                               }
+                               pgt = nested->shadow_pgtable;
+                       }
+               }
+               n = 0;
+               if (!p->hdr) {
+                       if (p->lpid > 0)
+                               n = scnprintf(p->buf, sizeof(p->buf),
+                                             "\nNested LPID %d: ", p->lpid);
+                       n += scnprintf(p->buf + n, sizeof(p->buf) - n,
+                                     "pgdir: %lx\n", (unsigned long)pgt);
+                       p->hdr = 1;
+                       goto copy;
+               }
+
+               pgdp = pgt + pgd_index(gpa);
+               pgd = READ_ONCE(*pgdp);
+               if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
+                       gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
+                       continue;
+               }
+
+               pudp = pud_offset(&pgd, gpa);
+               pud = READ_ONCE(*pudp);
+               if (!(pud_val(pud) & _PAGE_PRESENT)) {
+                       gpa = (gpa & PUD_MASK) + PUD_SIZE;
+                       continue;
+               }
+               if (pud_val(pud) & _PAGE_PTE) {
+                       pte = pud_val(pud);
+                       shift = PUD_SHIFT;
+                       goto leaf;
+               }
+
+               pmdp = pmd_offset(&pud, gpa);
+               pmd = READ_ONCE(*pmdp);
+               if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
+                       gpa = (gpa & PMD_MASK) + PMD_SIZE;
+                       continue;
+               }
+               if (pmd_val(pmd) & _PAGE_PTE) {
+                       pte = pmd_val(pmd);
+                       shift = PMD_SHIFT;
+                       goto leaf;
+               }
+
+               ptep = pte_offset_kernel(&pmd, gpa);
+               pte = pte_val(READ_ONCE(*ptep));
+               if (!(pte & _PAGE_PRESENT)) {
+                       gpa += PAGE_SIZE;
+                       continue;
+               }
+               shift = PAGE_SHIFT;
+       leaf:
+               n = scnprintf(p->buf, sizeof(p->buf),
+                             " %lx: %lx %d\n", gpa, pte, shift);
+               gpa += 1ul << shift;
+       copy:
+               p->chars_left = n;
+               if (n > len)
+                       n = len;
+               r = copy_to_user(buf, p->buf, n);
+               n -= r;
+               p->chars_left -= n;
+               p->buf_index = n;
+               buf += n;
+               len -= n;
+               ret += n;
+               if (r) {
+                       if (!ret)
+                               ret = -EFAULT;
+                       break;
+               }
+       }
+       p->gpa = gpa;
+       if (nested)
+               kvmhv_put_nested(nested);
+
+ out:
+       mutex_unlock(&p->mutex);
+       return ret;
+}
+
+static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
+                          size_t len, loff_t *ppos)
+{
+       return -EACCES;
+}
+
+static const struct file_operations debugfs_radix_fops = {
+       .owner   = THIS_MODULE,
+       .open    = debugfs_radix_open,
+       .release = debugfs_radix_release,
+       .read    = debugfs_radix_read,
+       .write   = debugfs_radix_write,
+       .llseek  = generic_file_llseek,
+};
+
+void kvmhv_radix_debugfs_init(struct kvm *kvm)
+{
+       kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
+                                                    kvm->arch.debugfs_dir, kvm,
+                                                    &debugfs_radix_fops);
+}
+
 int kvmppc_radix_init(void)
 {
        unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
index 9a3f264..62a8d03 100644 (file)
@@ -363,6 +363,40 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
        return ret;
 }
 
+static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
+               unsigned long tce)
+{
+       unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+       enum dma_data_direction dir = iommu_tce_direction(tce);
+       struct kvmppc_spapr_tce_iommu_table *stit;
+       unsigned long ua = 0;
+
+       /* Allow userspace to poison TCE table */
+       if (dir == DMA_NONE)
+               return H_SUCCESS;
+
+       if (iommu_tce_check_gpa(stt->page_shift, gpa))
+               return H_TOO_HARD;
+
+       if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
+               return H_TOO_HARD;
+
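+       /*
+        * Check that every attached IOMMU table can translate this TCE
+        * through preregistered memory before any real mapping is attempted.
+        */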
+       list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
+               unsigned long hpa = 0;
+               struct mm_iommu_table_group_mem_t *mem;
+               long shift = stit->tbl->it_page_shift;
+
+               mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift);
+               if (!mem)
+                       return H_TOO_HARD;
+
+               if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa))
+                       return H_TOO_HARD;
+       }
+
+       return H_SUCCESS;
+}
+
 static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
 {
        unsigned long hpa = 0;
@@ -376,11 +410,10 @@ static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
 {
        struct mm_iommu_table_group_mem_t *mem = NULL;
        const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-       __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+       __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 
        if (!pua)
-               /* it_userspace allocation might be delayed */
-               return H_TOO_HARD;
+               return H_SUCCESS;
 
        mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize);
        if (!mem)
@@ -401,7 +434,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
        long ret;
 
        if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
-               return H_HARDWARE;
+               return H_TOO_HARD;
 
        if (dir == DMA_NONE)
                return H_SUCCESS;
@@ -449,15 +482,15 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
                return H_TOO_HARD;
 
        if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
-               return H_HARDWARE;
+               return H_TOO_HARD;
 
        if (mm_iommu_mapped_inc(mem))
-               return H_CLOSED;
+               return H_TOO_HARD;
 
        ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
        if (WARN_ON_ONCE(ret)) {
                mm_iommu_mapped_dec(mem);
-               return H_HARDWARE;
+               return H_TOO_HARD;
        }
 
        if (dir != DMA_NONE)
@@ -517,8 +550,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
 
-       if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
-                       tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) {
+       if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
                ret = H_PARAMETER;
                goto unlock_exit;
        }
@@ -533,14 +565,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                        ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
                                        entry, ua, dir);
 
-               if (ret == H_SUCCESS)
-                       continue;
-
-               if (ret == H_TOO_HARD)
+               if (ret != H_SUCCESS) {
+                       kvmppc_clear_tce(stit->tbl, entry);
                        goto unlock_exit;
-
-               WARN_ON_ONCE(1);
-               kvmppc_clear_tce(stit->tbl, entry);
+               }
        }
 
        kvmppc_tce_put(stt, entry, tce);
@@ -583,7 +611,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                return ret;
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
-       if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
+       if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
                ret = H_TOO_HARD;
                goto unlock_exit;
        }
@@ -599,10 +627,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                ret = kvmppc_tce_validate(stt, tce);
                if (ret != H_SUCCESS)
                        goto unlock_exit;
+       }
+
+       for (i = 0; i < npages; ++i) {
+               /*
+                * This looks unsafe, because we validate, then regrab
+                * the TCE from userspace which could have been changed by
+                * another thread.
+                *
+                * But it actually is safe, because the relevant checks will be
+                * re-executed in the following code.  If userspace tries to
+                * change this dodgily, it will result in a messier failure mode
+                * but won't threaten the host.
+                */
+               if (get_user(tce, tces + i)) {
+                       ret = H_TOO_HARD;
+                       goto unlock_exit;
+               }
+               tce = be64_to_cpu(tce);
 
-               if (kvmppc_gpa_to_ua(vcpu->kvm,
-                               tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
-                               &ua, NULL))
+               if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
                        return H_PARAMETER;
 
                list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -610,14 +654,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                                        stit->tbl, entry + i, ua,
                                        iommu_tce_direction(tce));
 
-                       if (ret == H_SUCCESS)
-                               continue;
-
-                       if (ret == H_TOO_HARD)
+                       if (ret != H_SUCCESS) {
+                               kvmppc_clear_tce(stit->tbl, entry);
                                goto unlock_exit;
-
-                       WARN_ON_ONCE(1);
-                       kvmppc_clear_tce(stit->tbl, entry);
+                       }
                }
 
                kvmppc_tce_put(stt, entry + i, tce);
index 6821ead..2206bc7 100644 (file)
@@ -87,6 +87,7 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvmppc_find_table);
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 /*
  * Validates TCE address.
  * At the moment flags and page mask are validated.
@@ -94,14 +95,14 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table);
  * to the table and user space is supposed to process them), we can skip
  * checking other things (such as TCE is a guest RAM address or the page
  * was actually allocated).
- *
- * WARNING: This will be called in real-mode on HV KVM and virtual
- *          mode on PR KVM
  */
-long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
+               unsigned long tce)
 {
        unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
        enum dma_data_direction dir = iommu_tce_direction(tce);
+       struct kvmppc_spapr_tce_iommu_table *stit;
+       unsigned long ua = 0;
 
        /* Allow userspace to poison TCE table */
        if (dir == DMA_NONE)
@@ -110,9 +111,25 @@ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
        if (iommu_tce_check_gpa(stt->page_shift, gpa))
                return H_PARAMETER;
 
+       if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
+               return H_TOO_HARD;
+
+       list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+               unsigned long hpa = 0;
+               struct mm_iommu_table_group_mem_t *mem;
+               long shift = stit->tbl->it_page_shift;
+
+               mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift);
+               if (!mem)
+                       return H_TOO_HARD;
+
+               if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa))
+                       return H_TOO_HARD;
+       }
+
        return H_SUCCESS;
 }
-EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 /* Note on the use of page_address() in real mode,
  *
@@ -164,10 +181,10 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
 }
 EXPORT_SYMBOL_GPL(kvmppc_tce_put);
 
-long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
                unsigned long *ua, unsigned long **prmap)
 {
-       unsigned long gfn = gpa >> PAGE_SHIFT;
+       unsigned long gfn = tce >> PAGE_SHIFT;
        struct kvm_memory_slot *memslot;
 
        memslot = search_memslots(kvm_memslots(kvm), gfn);
@@ -175,7 +192,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
                return -EINVAL;
 
        *ua = __gfn_to_hva_memslot(memslot, gfn) |
-               (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+               (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        if (prmap)
@@ -184,7 +201,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
+EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua);
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
@@ -197,7 +214,7 @@ static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
 
        if (!ret && ((*direction == DMA_FROM_DEVICE) ||
                                (*direction == DMA_BIDIRECTIONAL))) {
-               __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+               __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
                /*
                 * kvmppc_rm_tce_iommu_do_map() updates the UA cache after
                 * calling this so we still get here a valid UA.
@@ -223,7 +240,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
 {
        struct mm_iommu_table_group_mem_t *mem = NULL;
        const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-       __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+       __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 
        if (!pua)
                /* it_userspace allocation might be delayed */
@@ -287,7 +304,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
 {
        long ret;
        unsigned long hpa = 0;
-       __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+       __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
        struct mm_iommu_table_group_mem_t *mem;
 
        if (!pua)
@@ -300,10 +317,10 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
 
        if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift,
                        &hpa)))
-               return H_HARDWARE;
+               return H_TOO_HARD;
 
        if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
-               return H_CLOSED;
+               return H_TOO_HARD;
 
        ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
        if (ret) {
@@ -368,13 +385,12 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
        if (ret != H_SUCCESS)
                return ret;
 
-       ret = kvmppc_tce_validate(stt, tce);
+       ret = kvmppc_rm_tce_validate(stt, tce);
        if (ret != H_SUCCESS)
                return ret;
 
        dir = iommu_tce_direction(tce);
-       if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
-                       tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
+       if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
                return H_PARAMETER;
 
        entry = ioba >> stt->page_shift;
@@ -387,14 +403,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                        ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
                                        stit->tbl, entry, ua, dir);
 
-               if (ret == H_SUCCESS)
-                       continue;
-
-               if (ret == H_TOO_HARD)
+               if (ret != H_SUCCESS) {
+                       kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
                        return ret;
-
-               WARN_ON_ONCE_RM(1);
-               kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
+               }
        }
 
        kvmppc_tce_put(stt, entry, tce);
@@ -480,7 +492,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                 */
                struct mm_iommu_table_group_mem_t *mem;
 
-               if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
+               if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL))
                        return H_TOO_HARD;
 
                mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
@@ -496,12 +508,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                 * We do not require memory to be preregistered in this case
                 * so lock rmap and do __find_linux_pte_or_hugepte().
                 */
-               if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+               if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
                        return H_TOO_HARD;
 
                rmap = (void *) vmalloc_to_phys(rmap);
                if (WARN_ON_ONCE_RM(!rmap))
-                       return H_HARDWARE;
+                       return H_TOO_HARD;
 
                /*
                 * Synchronize with the MMU notifier callbacks in
@@ -521,14 +533,16 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
        for (i = 0; i < npages; ++i) {
                unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
 
-               ret = kvmppc_tce_validate(stt, tce);
+               ret = kvmppc_rm_tce_validate(stt, tce);
                if (ret != H_SUCCESS)
                        goto unlock_exit;
+       }
+
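+       /* All TCEs validated above; now update the tables in a second pass. */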
+       for (i = 0; i < npages; ++i) {
+               unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
 
                ua = 0;
-               if (kvmppc_gpa_to_ua(vcpu->kvm,
-                               tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
-                               &ua, NULL))
+               if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
                        return H_PARAMETER;
 
                list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -536,14 +550,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                                        stit->tbl, entry + i, ua,
                                        iommu_tce_direction(tce));
 
-                       if (ret == H_SUCCESS)
-                               continue;
-
-                       if (ret == H_TOO_HARD)
+                       if (ret != H_SUCCESS) {
+                               kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl,
+                                               entry);
                                goto unlock_exit;
-
-                       WARN_ON_ONCE_RM(1);
-                       kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
+                       }
                }
 
                kvmppc_tce_put(stt, entry + i, tce);
index 36b11c5..8c7e933 100644 (file)
@@ -36,7 +36,6 @@
 #define OP_31_XOP_MTSR         210
 #define OP_31_XOP_MTSRIN       242
 #define OP_31_XOP_TLBIEL       274
-#define OP_31_XOP_TLBIE                306
 /* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
 #define OP_31_XOP_FAKE_SC1     308
 #define OP_31_XOP_SLBMTE       402
@@ -110,7 +109,7 @@ static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu)
        vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
        vcpu->arch.tar_tm = vcpu->arch.tar;
        vcpu->arch.lr_tm = vcpu->arch.regs.link;
-       vcpu->arch.cr_tm = vcpu->arch.cr;
+       vcpu->arch.cr_tm = vcpu->arch.regs.ccr;
        vcpu->arch.xer_tm = vcpu->arch.regs.xer;
        vcpu->arch.vrsave_tm = vcpu->arch.vrsave;
 }
@@ -129,7 +128,7 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu)
        vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
        vcpu->arch.tar = vcpu->arch.tar_tm;
        vcpu->arch.regs.link = vcpu->arch.lr_tm;
-       vcpu->arch.cr = vcpu->arch.cr_tm;
+       vcpu->arch.regs.ccr = vcpu->arch.cr_tm;
        vcpu->arch.regs.xer = vcpu->arch.xer_tm;
        vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
 }
@@ -141,7 +140,7 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val)
        uint64_t texasr;
 
        /* CR0 = 0 | MSR[TS] | 0 */
-       vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
+       vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
                (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
                 << CR0_SHIFT);
 
@@ -220,7 +219,7 @@ void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
        tm_abort(ra_val);
 
        /* CR0 = 0 | MSR[TS] | 0 */
-       vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
+       vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
                (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
                 << CR0_SHIFT);
 
@@ -494,8 +493,8 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
                        if (!(kvmppc_get_msr(vcpu) & MSR_PR)) {
                                preempt_disable();
-                               vcpu->arch.cr = (CR0_TBEGIN_FAILURE |
-                                 (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)));
+                               vcpu->arch.regs.ccr = (CR0_TBEGIN_FAILURE |
+                                 (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)));
 
                                vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT |
                                        (((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT))
index 3e3a715..bf8def2 100644 (file)
@@ -50,6 +50,7 @@
 #include <asm/reg.h>
 #include <asm/ppc-opcode.h>
 #include <asm/asm-prototypes.h>
+#include <asm/archrandom.h>
 #include <asm/debug.h>
 #include <asm/disassemble.h>
 #include <asm/cputable.h>
@@ -104,6 +105,10 @@ static bool indep_threads_mode = true;
 module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
 
+static bool one_vm_per_core;
+module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)");
+
 #ifdef CONFIG_KVM_XICS
 static struct kernel_param_ops module_param_ops = {
        .set = param_set_int,
@@ -117,6 +122,16 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644);
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 #endif
 
+/* If set, guests are allowed to create and control nested guests */
+static bool nested = true;
+module_param(nested, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
+
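+/* Nested guests can only be enabled for a radix VM. */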
+static inline bool nesting_enabled(struct kvm *kvm)
+{
+       return kvm->arch.nested_enable && kvm_is_radix(kvm);
+}
+
 /* If set, the threads on each CPU core have to be in the same MMU mode */
 static bool no_mixing_hpt_and_radix;
 
@@ -173,6 +188,10 @@ static bool kvmppc_ipi_thread(int cpu)
 {
        unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 
+       /* If we're a nested hypervisor, fall back to ordinary IPIs for now */
+       if (kvmhv_on_pseries())
+               return false;
+
        /* On POWER9 we can use msgsnd to IPI any cpu */
        if (cpu_has_feature(CPU_FTR_ARCH_300)) {
                msg |= get_hard_smp_processor_id(cpu);
@@ -410,8 +429,8 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
               vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
        pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
               vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
-       pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
-              vcpu->arch.cr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
+       pr_err("cr = %.8lx  xer = %.16lx  dsisr = %.8x\n",
+              vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
        pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
        pr_err("fault dar = %.16lx dsisr = %.8x\n",
               vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
@@ -730,8 +749,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
        /*
         * Ensure that the read of vcore->dpdes comes after the read
         * of vcpu->doorbell_request.  This barrier matches the
-        * lwsync in book3s_hv_rmhandlers.S just before the
-        * fast_guest_return label.
+        * smp_wmb() in kvmppc_guest_entry_inject().
         */
        smp_rmb();
        vc = vcpu->arch.vcore;
@@ -912,6 +930,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                        break;
                }
                return RESUME_HOST;
+       case H_SET_DABR:
+               ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
+               break;
+       case H_SET_XDABR:
+               ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5));
+               break;
+       case H_GET_TCE:
+               ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
        case H_PUT_TCE:
                ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
                                                kvmppc_get_gpr(vcpu, 5),
@@ -935,6 +966,32 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                if (ret == H_TOO_HARD)
                        return RESUME_HOST;
                break;
+       case H_RANDOM:
+               if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
+                       ret = H_HARDWARE;
+               break;
+
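+       /*
+        * The following hypercalls are used by a guest acting as a nested
+        * hypervisor; they are only honoured when nesting is enabled for
+        * this VM.
+        */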
+       case H_SET_PARTITION_TABLE:
+               ret = H_FUNCTION;
+               if (nesting_enabled(vcpu->kvm))
+                       ret = kvmhv_set_partition_table(vcpu);
+               break;
+       case H_ENTER_NESTED:
+               ret = H_FUNCTION;
+               if (!nesting_enabled(vcpu->kvm))
+                       break;
+               ret = kvmhv_enter_nested_guest(vcpu);
+               if (ret == H_INTERRUPT) {
+                       kvmppc_set_gpr(vcpu, 3, 0);
+                       return -EINTR;
+               }
+               break;
+       case H_TLB_INVALIDATE:
+               ret = H_FUNCTION;
+               if (nesting_enabled(vcpu->kvm))
+                       ret = kvmhv_do_nested_tlbie(vcpu);
+               break;
+
        default:
                return RESUME_HOST;
        }
@@ -943,6 +1000,24 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
        return RESUME_GUEST;
 }
 
+/*
+ * Handle H_CEDE in the nested virtualization case where we haven't
+ * called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
+ * This has to be done early, not in kvmppc_pseries_do_hcall(), so
+ * that the cede logic in kvmhv_run_single_vcpu() works properly.
+ */
+static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.shregs.msr |= MSR_EE;
+       vcpu->arch.ceded = 1;
+       smp_mb();
+       if (vcpu->arch.prodded) {
+               vcpu->arch.prodded = 0;
+               smp_mb();
+               vcpu->arch.ceded = 0;
+       }
+}
+
 static int kvmppc_hcall_impl_hv(unsigned long cmd)
 {
        switch (cmd) {
@@ -1085,7 +1160,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
        return RESUME_GUEST;
 }
 
-/* Called with vcpu->arch.vcore->lock held */
 static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                 struct task_struct *tsk)
 {
@@ -1190,7 +1264,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                break;
        case BOOK3S_INTERRUPT_H_INST_STORAGE:
                vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
-               vcpu->arch.fault_dsisr = 0;
+               vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
+                       DSISR_SRR1_MATCH_64S;
+               if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
+                       vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
                r = RESUME_PAGE_FAULT;
                break;
        /*
@@ -1206,10 +1283,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                swab32(vcpu->arch.emul_inst) :
                                vcpu->arch.emul_inst;
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
-                       /* Need vcore unlocked to call kvmppc_get_last_inst */
-                       spin_unlock(&vcpu->arch.vcore->lock);
                        r = kvmppc_emulate_debug_inst(run, vcpu);
-                       spin_lock(&vcpu->arch.vcore->lock);
                } else {
                        kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
                        r = RESUME_GUEST;
@@ -1225,12 +1299,8 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
        case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
                r = EMULATE_FAIL;
                if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
-                   cpu_has_feature(CPU_FTR_ARCH_300)) {
-                       /* Need vcore unlocked to call kvmppc_get_last_inst */
-                       spin_unlock(&vcpu->arch.vcore->lock);
+                   cpu_has_feature(CPU_FTR_ARCH_300))
                        r = kvmppc_emulate_doorbell_instr(vcpu);
-                       spin_lock(&vcpu->arch.vcore->lock);
-               }
                if (r == EMULATE_FAIL) {
                        kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
                        r = RESUME_GUEST;
@@ -1265,6 +1335,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
        return r;
 }
 
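+/*
+ * Handle an exit from a nested (L2) guest that is being run on behalf of
+ * its L1 hypervisor via H_ENTER_NESTED.
+ */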
+static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
+{
+       int r;
+       int srcu_idx;
+
+       vcpu->stat.sum_exits++;
+
+       /*
+        * This can happen if an interrupt occurs in the last stages
+        * of guest entry or the first stages of guest exit (i.e. after
+        * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
+        * and before setting it to KVM_GUEST_MODE_HOST_HV).
+        * That can happen due to a bug, or due to a machine check
+        * occurring at just the wrong time.
+        */
+       if (vcpu->arch.shregs.msr & MSR_HV) {
+               pr_emerg("KVM trap in HV mode while nested!\n");
+               pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+                        vcpu->arch.trap, kvmppc_get_pc(vcpu),
+                        vcpu->arch.shregs.msr);
+               kvmppc_dump_regs(vcpu);
+               return RESUME_HOST;
+       }
+       switch (vcpu->arch.trap) {
+       /* We're good on these - the host merely wanted to get our attention */
+       case BOOK3S_INTERRUPT_HV_DECREMENTER:
+               vcpu->stat.dec_exits++;
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_EXTERNAL:
+               vcpu->stat.ext_intr_exits++;
+               r = RESUME_HOST;
+               break;
+       case BOOK3S_INTERRUPT_H_DOORBELL:
+       case BOOK3S_INTERRUPT_H_VIRT:
+               vcpu->stat.ext_intr_exits++;
+               r = RESUME_GUEST;
+               break;
+       /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
+       case BOOK3S_INTERRUPT_HMI:
+       case BOOK3S_INTERRUPT_PERFMON:
+       case BOOK3S_INTERRUPT_SYSTEM_RESET:
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_MACHINE_CHECK:
+               /* Pass the machine check to the L1 guest */
+               r = RESUME_HOST;
+               /* Print the MCE event to host console. */
+               machine_check_print_event_info(&vcpu->arch.mce_evt, false);
+               break;
+       /*
+        * We get these next two if the guest accesses a page which it thinks
+        * it has mapped but which is not actually present, either because
+        * it is for an emulated I/O device or because the corresponding
+        * host page has been paged out.
+        */
+       case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+               srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+               r = kvmhv_nested_page_fault(vcpu);
+               srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+               break;
+       case BOOK3S_INTERRUPT_H_INST_STORAGE:
+               vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
+               vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
+                                        DSISR_SRR1_MATCH_64S;
+               if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
+                       vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
+               srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+               r = kvmhv_nested_page_fault(vcpu);
+               srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+               break;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       case BOOK3S_INTERRUPT_HV_SOFTPATCH:
+               /*
+                * This occurs for various TM-related instructions that
+                * we need to emulate on POWER9 DD2.2.  We have already
+                * handled the cases where the guest was in real-suspend
+                * mode and was transitioning to transactional state.
+                */
+               r = kvmhv_p9_tm_emulation(vcpu);
+               break;
+#endif
+
+       case BOOK3S_INTERRUPT_HV_RM_HARD:
+               vcpu->arch.trap = 0;
+               r = RESUME_GUEST;
+               if (!xive_enabled())
+                       kvmppc_xics_rm_complete(vcpu, 0);
+               break;
+       default:
+               r = RESUME_HOST;
+               break;
+       }
+
+       return r;
+}
+
 static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
                                            struct kvm_sregs *sregs)
 {
@@ -1555,6 +1723,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_ONLINE:
                *val = get_reg_val(id, vcpu->arch.online);
                break;
+       case KVM_REG_PPC_PTCR:
+               *val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
+               break;
        default:
                r = -EINVAL;
                break;
@@ -1786,6 +1957,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
                        atomic_dec(&vcpu->arch.vcore->online_count);
                vcpu->arch.online = i;
                break;
+       case KVM_REG_PPC_PTCR:
+               vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
+               break;
        default:
                r = -EINVAL;
                break;
@@ -2019,15 +2193,18 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
         * Set the default HFSCR for the guest from the host value.
         * This value is only used on POWER9.
         * On POWER9, we want to virtualize the doorbell facility, so we
-        * turn off the HFSCR bit, which causes those instructions to trap.
+        * don't set the HFSCR_MSGP bit; those instructions then trap
+        * and we emulate them.
         */
-       vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
-       if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+       vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
+               HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP;
+       if (cpu_has_feature(CPU_FTR_HVMODE)) {
+               vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
+               if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+                       vcpu->arch.hfscr |= HFSCR_TM;
+       }
+       if (cpu_has_feature(CPU_FTR_TM_COMP))
                vcpu->arch.hfscr |= HFSCR_TM;
-       else if (!cpu_has_feature(CPU_FTR_TM_COMP))
-               vcpu->arch.hfscr &= ~HFSCR_TM;
-       if (cpu_has_feature(CPU_FTR_ARCH_300))
-               vcpu->arch.hfscr &= ~HFSCR_MSGP;
 
        kvmppc_mmu_book3s_hv_init(vcpu);
 
@@ -2242,10 +2419,18 @@ static void kvmppc_release_hwthread(int cpu)
 
 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
 {
+       struct kvm_nested_guest *nested = vcpu->arch.nested;
+       cpumask_t *cpu_in_guest;
        int i;
 
        cpu = cpu_first_thread_sibling(cpu);
-       cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+       if (nested) {
+               cpumask_set_cpu(cpu, &nested->need_tlb_flush);
+               cpu_in_guest = &nested->cpu_in_guest;
+       } else {
+               cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+               cpu_in_guest = &kvm->arch.cpu_in_guest;
+       }
        /*
         * Make sure setting of bit in need_tlb_flush precedes
         * testing of cpu_in_guest bits.  The matching barrier on
@@ -2253,13 +2438,23 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
         */
        smp_mb();
        for (i = 0; i < threads_per_core; ++i)
-               if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
+               if (cpumask_test_cpu(cpu + i, cpu_in_guest))
                        smp_call_function_single(cpu + i, do_nothing, NULL, 1);
 }
 
 static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
 {
+       struct kvm_nested_guest *nested = vcpu->arch.nested;
        struct kvm *kvm = vcpu->kvm;
+       int prev_cpu;
+
+       if (!cpu_has_feature(CPU_FTR_HVMODE))
+               return;
+
+       if (nested)
+               prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
+       else
+               prev_cpu = vcpu->arch.prev_cpu;
 
        /*
         * With radix, the guest can do TLB invalidations itself,
@@ -2273,12 +2468,46 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
         * ran to flush the TLB.  The TLB is shared between threads,
         * so we use a single bit in .need_tlb_flush for all 4 threads.
         */
-       if (vcpu->arch.prev_cpu != pcpu) {
-               if (vcpu->arch.prev_cpu >= 0 &&
-                   cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
+       if (prev_cpu != pcpu) {
+               if (prev_cpu >= 0 &&
+                   cpu_first_thread_sibling(prev_cpu) !=
                    cpu_first_thread_sibling(pcpu))
-                       radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
-               vcpu->arch.prev_cpu = pcpu;
+                       radix_flush_cpu(kvm, prev_cpu, vcpu);
+               if (nested)
+                       nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
+               else
+                       vcpu->arch.prev_cpu = pcpu;
+       }
+}
+
+static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu,
+                                             struct kvm_nested_guest *nested)
+{
+       cpumask_t *need_tlb_flush;
+       int lpid;
+
+       if (!cpu_has_feature(CPU_FTR_HVMODE))
+               return;
+
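+       /*
+        * The TLB is shared by all threads of a core on POWER9, so the
+        * need_tlb_flush bit is tracked per core (first thread number).
+        */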
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               pcpu &= ~0x3UL;
+
+       if (nested) {
+               lpid = nested->shadow_lpid;
+               need_tlb_flush = &nested->need_tlb_flush;
+       } else {
+               lpid = kvm->arch.lpid;
+               need_tlb_flush = &kvm->arch.need_tlb_flush;
+       }
+
+       mtspr(SPRN_LPID, lpid);
+       isync();
+       smp_mb();
+
+       if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
+               radix__local_flush_tlb_lpid_guest(lpid);
+               /* Clear the bit after the TLB flush */
+               cpumask_clear_cpu(pcpu, need_tlb_flush);
        }
 }
 
@@ -2493,6 +2722,10 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
        if (!cpu_has_feature(CPU_FTR_ARCH_207S))
                return false;
 
+       /* In one_vm_per_core mode, require all vcores to be from the same vm */
+       if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
+               return false;
+
        /* Some POWER9 chips require all threads to be in the same MMU mode */
        if (no_mixing_hpt_and_radix &&
            kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
@@ -2600,6 +2833,14 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
        spin_lock(&vc->lock);
        now = get_tb();
        for_each_runnable_thread(i, vcpu, vc) {
+               /*
+                * It's safe to unlock the vcore in the loop here, because
+                * for_each_runnable_thread() is safe against removal of
+                * the vcpu, and the vcore state is VCORE_EXITING here,
+                * so any vcpus becoming runnable will have their arch.trap
+                * set to zero and can't actually run in the guest.
+                */
+               spin_unlock(&vc->lock);
                /* cancel pending dec exception if dec is positive */
                if (now < vcpu->arch.dec_expires &&
                    kvmppc_core_pending_dec(vcpu))
@@ -2615,6 +2856,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
                vcpu->arch.ret = ret;
                vcpu->arch.trap = 0;
 
+               spin_lock(&vc->lock);
                if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
                        if (vcpu->arch.pending_exceptions)
                                kvmppc_core_prepare_to_enter(vcpu);
@@ -2963,8 +3205,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                spin_unlock(&core_info.vc[sub]->lock);
 
        if (kvm_is_radix(vc->kvm)) {
-               int tmp = pcpu;
-
                /*
                 * Do we need to flush the process scoped TLB for the LPAR?
                 *
@@ -2975,17 +3215,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                 *
                 * Hash must be flushed in realmode in order to use tlbiel.
                 */
-               mtspr(SPRN_LPID, vc->kvm->arch.lpid);
-               isync();
-
-               if (cpu_has_feature(CPU_FTR_ARCH_300))
-                       tmp &= ~0x3UL;
-
-               if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
-                       radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
-                       /* Clear the bit after the TLB flush */
-                       cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
-               }
+               kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL);
        }
 
        /*
@@ -3080,6 +3310,300 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 }
 
 /*
+ * Load up hypervisor-mode registers on P9.
+ */
+static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
+                                    unsigned long lpcr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       s64 hdec;
+       u64 tb, purr, spurr;
+       int trap;
+       unsigned long host_hfscr = mfspr(SPRN_HFSCR);
+       unsigned long host_ciabr = mfspr(SPRN_CIABR);
+       unsigned long host_dawr = mfspr(SPRN_DAWR);
+       unsigned long host_dawrx = mfspr(SPRN_DAWRX);
+       unsigned long host_psscr = mfspr(SPRN_PSSCR);
+       unsigned long host_pidr = mfspr(SPRN_PID);
+
+       hdec = time_limit - mftb();
+       if (hdec < 0)
+               return BOOK3S_INTERRUPT_HV_DECREMENTER;
+       mtspr(SPRN_HDEC, hdec);
+
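+       /*
+        * Apply the guest timebase offset.  TBU40 only sets the upper 40
+        * bits of the timebase; if the low 24 bits ended up behind the
+        * intended value, bump the upper bits by one so the timebase does
+        * not lag the requested offset.
+        */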
+       if (vc->tb_offset) {
+               u64 new_tb = mftb() + vc->tb_offset;
+               mtspr(SPRN_TBU40, new_tb);
+               tb = mftb();
+               if ((tb & 0xffffff) < (new_tb & 0xffffff))
+                       mtspr(SPRN_TBU40, new_tb + 0x1000000);
+               vc->tb_offset_applied = vc->tb_offset;
+       }
+
+       if (vc->pcr)
+               mtspr(SPRN_PCR, vc->pcr);
+       mtspr(SPRN_DPDES, vc->dpdes);
+       mtspr(SPRN_VTB, vc->vtb);
+
+       local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
+       local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
+       mtspr(SPRN_PURR, vcpu->arch.purr);
+       mtspr(SPRN_SPURR, vcpu->arch.spurr);
+
+       if (cpu_has_feature(CPU_FTR_DAWR)) {
+               mtspr(SPRN_DAWR, vcpu->arch.dawr);
+               mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
+       }
+       mtspr(SPRN_CIABR, vcpu->arch.ciabr);
+       mtspr(SPRN_IC, vcpu->arch.ic);
+       mtspr(SPRN_PID, vcpu->arch.pid);
+
+       mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
+             (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+
+       mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
+
+       mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
+       mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
+       mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
+       mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
+
+       mtspr(SPRN_AMOR, ~0UL);
+
+       mtspr(SPRN_LPCR, lpcr);
+       isync();
+
+       kvmppc_xive_push_vcpu(vcpu);
+
+       mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
+       mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
+
+       trap = __kvmhv_vcpu_entry_p9(vcpu);
+
+       /* Advance host PURR/SPURR by the amount used by guest */
+       purr = mfspr(SPRN_PURR);
+       spurr = mfspr(SPRN_SPURR);
+       mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
+             purr - vcpu->arch.purr);
+       mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
+             spurr - vcpu->arch.spurr);
+       vcpu->arch.purr = purr;
+       vcpu->arch.spurr = spurr;
+
+       vcpu->arch.ic = mfspr(SPRN_IC);
+       vcpu->arch.pid = mfspr(SPRN_PID);
+       vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
+
+       vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
+       vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
+       vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
+       vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
+
+       mtspr(SPRN_PSSCR, host_psscr);
+       mtspr(SPRN_HFSCR, host_hfscr);
+       mtspr(SPRN_CIABR, host_ciabr);
+       mtspr(SPRN_DAWR, host_dawr);
+       mtspr(SPRN_DAWRX, host_dawrx);
+       mtspr(SPRN_PID, host_pidr);
+
+       /*
+        * Since this is radix, do an eieio; tlbsync; ptesync sequence in
+        * case we interrupted the guest between a tlbie and a ptesync.
+        */
+       asm volatile("eieio; tlbsync; ptesync");
+
+       mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid);    /* restore host LPID */
+       isync();
+
+       vc->dpdes = mfspr(SPRN_DPDES);
+       vc->vtb = mfspr(SPRN_VTB);
+       mtspr(SPRN_DPDES, 0);
+       if (vc->pcr)
+               mtspr(SPRN_PCR, 0);
+
+       if (vc->tb_offset_applied) {
+               u64 new_tb = mftb() - vc->tb_offset_applied;
+               mtspr(SPRN_TBU40, new_tb);
+               tb = mftb();
+               if ((tb & 0xffffff) < (new_tb & 0xffffff))
+                       mtspr(SPRN_TBU40, new_tb + 0x1000000);
+               vc->tb_offset_applied = 0;
+       }
+
+       mtspr(SPRN_HDEC, 0x7fffffff);
+       mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
+
+       return trap;
+}
+
+/*
+ * Virtual-mode guest entry for POWER9 and later when the host and
+ * guest are both using the radix MMU.  The LPIDR has already been set.
+ */
+int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
+                        unsigned long lpcr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       unsigned long host_dscr = mfspr(SPRN_DSCR);
+       unsigned long host_tidr = mfspr(SPRN_TIDR);
+       unsigned long host_iamr = mfspr(SPRN_IAMR);
+       s64 dec;
+       u64 tb;
+       int trap, save_pmu;
+
+       dec = mfspr(SPRN_DEC);
+       tb = mftb();
+       if (dec < 512)
+               return BOOK3S_INTERRUPT_HV_DECREMENTER;
+       local_paca->kvm_hstate.dec_expires = dec + tb;
+       if (local_paca->kvm_hstate.dec_expires < time_limit)
+               time_limit = local_paca->kvm_hstate.dec_expires;
+
+       vcpu->arch.ceded = 0;
+
+       kvmhv_save_host_pmu();          /* saves it to PACA kvm_hstate */
+
+       kvmppc_subcore_enter_guest();
+
+       vc->entry_exit_map = 1;
+       vc->in_guest = 1;
+
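+       /*
+        * Bump the yield count in the guest's VPA on the way in (and again
+        * on the way out) so the guest can tell when this vcpu has been
+        * preempted or redispatched.
+        */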
+       if (vcpu->arch.vpa.pinned_addr) {
+               struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+               u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
+               lp->yield_count = cpu_to_be32(yield_count);
+               vcpu->arch.vpa.dirty = 1;
+       }
+
+       if (cpu_has_feature(CPU_FTR_TM) ||
+           cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+               kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+
+       kvmhv_load_guest_pmu(vcpu);
+
+       msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
+       load_fp_state(&vcpu->arch.fp);
+#ifdef CONFIG_ALTIVEC
+       load_vr_state(&vcpu->arch.vr);
+#endif
+
+       mtspr(SPRN_DSCR, vcpu->arch.dscr);
+       mtspr(SPRN_IAMR, vcpu->arch.iamr);
+       mtspr(SPRN_PSPB, vcpu->arch.pspb);
+       mtspr(SPRN_FSCR, vcpu->arch.fscr);
+       mtspr(SPRN_TAR, vcpu->arch.tar);
+       mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
+       mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
+       mtspr(SPRN_BESCR, vcpu->arch.bescr);
+       mtspr(SPRN_WORT, vcpu->arch.wort);
+       mtspr(SPRN_TIDR, vcpu->arch.tid);
+       mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
+       mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
+       mtspr(SPRN_AMR, vcpu->arch.amr);
+       mtspr(SPRN_UAMOR, vcpu->arch.uamor);
+
+       if (!(vcpu->arch.ctrl & 1))
+               mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
+
+       mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
+
+       if (kvmhv_on_pseries()) {
+               /* call our hypervisor to load up HV regs and go */
+               struct hv_guest_state hvregs;
+
+               kvmhv_save_hv_regs(vcpu, &hvregs);
+               hvregs.lpcr = lpcr;
+               vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+               hvregs.version = HV_GUEST_STATE_VERSION;
+               if (vcpu->arch.nested) {
+                       hvregs.lpid = vcpu->arch.nested->shadow_lpid;
+                       hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
+               } else {
+                       hvregs.lpid = vcpu->kvm->arch.lpid;
+                       hvregs.vcpu_token = vcpu->vcpu_id;
+               }
+               hvregs.hdec_expiry = time_limit;
+               trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
+                                         __pa(&vcpu->arch.regs));
+               kvmhv_restore_hv_return_state(vcpu, &hvregs);
+               vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
+               vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
+               vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
+
+               /* H_CEDE has to be handled now, not later */
+               if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
+                   kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
+                       kvmppc_nested_cede(vcpu);
+                       trap = 0;
+               }
+       } else {
+               trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
+       }
+
+       vcpu->arch.slb_max = 0;
+       dec = mfspr(SPRN_DEC);
+       tb = mftb();
+       vcpu->arch.dec_expires = dec + tb;
+       vcpu->cpu = -1;
+       vcpu->arch.thread_cpu = -1;
+       vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
+
+       vcpu->arch.iamr = mfspr(SPRN_IAMR);
+       vcpu->arch.pspb = mfspr(SPRN_PSPB);
+       vcpu->arch.fscr = mfspr(SPRN_FSCR);
+       vcpu->arch.tar = mfspr(SPRN_TAR);
+       vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
+       vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
+       vcpu->arch.bescr = mfspr(SPRN_BESCR);
+       vcpu->arch.wort = mfspr(SPRN_WORT);
+       vcpu->arch.tid = mfspr(SPRN_TIDR);
+       vcpu->arch.amr = mfspr(SPRN_AMR);
+       vcpu->arch.uamor = mfspr(SPRN_UAMOR);
+       vcpu->arch.dscr = mfspr(SPRN_DSCR);
+
+       mtspr(SPRN_PSPB, 0);
+       mtspr(SPRN_WORT, 0);
+       mtspr(SPRN_AMR, 0);
+       mtspr(SPRN_UAMOR, 0);
+       mtspr(SPRN_DSCR, host_dscr);
+       mtspr(SPRN_TIDR, host_tidr);
+       mtspr(SPRN_IAMR, host_iamr);
+       mtspr(SPRN_PSPB, 0);
+
+       msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
+       store_fp_state(&vcpu->arch.fp);
+#ifdef CONFIG_ALTIVEC
+       store_vr_state(&vcpu->arch.vr);
+#endif
+
+       if (cpu_has_feature(CPU_FTR_TM) ||
+           cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+               kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+
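+       /*
+        * If the guest has registered a VPA, honour its pmcregs_in_use flag
+        * to decide whether the guest PMU state actually needs saving.
+        */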
+       save_pmu = 1;
+       if (vcpu->arch.vpa.pinned_addr) {
+               struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+               u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
+               lp->yield_count = cpu_to_be32(yield_count);
+               vcpu->arch.vpa.dirty = 1;
+               save_pmu = lp->pmcregs_in_use;
+       }
+
+       kvmhv_save_guest_pmu(vcpu, save_pmu);
+
+       vc->entry_exit_map = 0x101;
+       vc->in_guest = 0;
+
+       mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
+
+       kvmhv_load_host_pmu();
+
+       kvmppc_subcore_exit_guest();
+
+       return trap;
+}
+
+/*
  * Wait for some other vcpu thread to execute us, and
  * wake us up when we need to handle something in the host.
  */
@@ -3256,6 +3780,11 @@ out:
        trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
 }
 
+/*
+ * This never fails for a radix guest, as none of the operations it does
+ * for a radix guest can fail or have a way to report failure.
+ * kvmhv_run_single_vcpu() relies on this fact.
+ */
 static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
 {
        int r = 0;
@@ -3405,6 +3934,171 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        return vcpu->arch.ret;
 }
 
+int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
+                         struct kvm_vcpu *vcpu, u64 time_limit,
+                         unsigned long lpcr)
+{
+       int trap, r, pcpu;
+       int srcu_idx;
+       struct kvmppc_vcore *vc;
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_nested_guest *nested = vcpu->arch.nested;
+
+       trace_kvmppc_run_vcpu_enter(vcpu);
+
+       kvm_run->exit_reason = 0;
+       vcpu->arch.ret = RESUME_GUEST;
+       vcpu->arch.trap = 0;
+
+       vc = vcpu->arch.vcore;
+       vcpu->arch.ceded = 0;
+       vcpu->arch.run_task = current;
+       vcpu->arch.kvm_run = kvm_run;
+       vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
+       vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+       vcpu->arch.busy_preempt = TB_NIL;
+       vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
+       vc->runnable_threads[0] = vcpu;
+       vc->n_runnable = 1;
+       vc->runner = vcpu;
+
+       /* See if the MMU is ready to go */
+       if (!kvm->arch.mmu_ready)
+               kvmhv_setup_mmu(vcpu);
+
+       if (need_resched())
+               cond_resched();
+
+       kvmppc_update_vpas(vcpu);
+
+       init_vcore_to_run(vc);
+       vc->preempt_tb = TB_NIL;
+
+       preempt_disable();
+       pcpu = smp_processor_id();
+       vc->pcpu = pcpu;
+       kvmppc_prepare_radix_vcpu(vcpu, pcpu);
+
+       local_irq_disable();
+       hard_irq_disable();
+       if (signal_pending(current))
+               goto sigpend;
+       if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
+               goto out;
+
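+       /*
+        * For a vcpu of this (L1) guest, deliver pending doorbells and
+        * external interrupts here.  For a nested (L2) vcpu, anything
+        * pending has to be delivered by the L1 hypervisor, so go back out.
+        */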
+       if (!nested) {
+               kvmppc_core_prepare_to_enter(vcpu);
+               if (vcpu->arch.doorbell_request) {
+                       vc->dpdes = 1;
+                       smp_wmb();
+                       vcpu->arch.doorbell_request = 0;
+               }
+               if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
+                            &vcpu->arch.pending_exceptions))
+                       lpcr |= LPCR_MER;
+       } else if (vcpu->arch.pending_exceptions ||
+                  vcpu->arch.doorbell_request ||
+                  xive_interrupt_pending(vcpu)) {
+               vcpu->arch.ret = RESUME_HOST;
+               goto out;
+       }
+
+       kvmppc_clear_host_core(pcpu);
+
+       local_paca->kvm_hstate.tid = 0;
+       local_paca->kvm_hstate.napping = 0;
+       local_paca->kvm_hstate.kvm_split_mode = NULL;
+       kvmppc_start_thread(vcpu, vc);
+       kvmppc_create_dtl_entry(vcpu, vc);
+       trace_kvm_guest_enter(vcpu);
+
+       vc->vcore_state = VCORE_RUNNING;
+       trace_kvmppc_run_core(vc, 0);
+
+       if (cpu_has_feature(CPU_FTR_HVMODE))
+               kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested);
+
+       trace_hardirqs_on();
+       guest_enter_irqoff();
+
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+
+       this_cpu_disable_ftrace();
+
+       trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
+       vcpu->arch.trap = trap;
+
+       this_cpu_enable_ftrace();
+
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+       if (cpu_has_feature(CPU_FTR_HVMODE)) {
+               mtspr(SPRN_LPID, kvm->arch.host_lpid);
+               isync();
+       }
+
+       trace_hardirqs_off();
+       set_irq_happened(trap);
+
+       kvmppc_set_host_core(pcpu);
+
+       local_irq_enable();
+       guest_exit();
+
+       cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
+
+       preempt_enable();
+
+       /* cancel pending decrementer exception if DEC is now positive */
+       if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
+               kvmppc_core_dequeue_dec(vcpu);
+
+       trace_kvm_guest_exit(vcpu);
+       r = RESUME_GUEST;
+       if (trap) {
+               if (!nested)
+                       r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
+               else
+                       r = kvmppc_handle_nested_exit(vcpu);
+       }
+       vcpu->arch.ret = r;
+
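+       /*
+        * If the guest ceded and nothing has woken it, arm a timer for the
+        * guest decrementer and sleep on the vcore until it is woken or a
+        * signal arrives.
+        */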
+       if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
+           !kvmppc_vcpu_woken(vcpu)) {
+               kvmppc_set_timer(vcpu);
+               while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
+                       if (signal_pending(current)) {
+                               vcpu->stat.signal_exits++;
+                               kvm_run->exit_reason = KVM_EXIT_INTR;
+                               vcpu->arch.ret = -EINTR;
+                               break;
+                       }
+                       spin_lock(&vc->lock);
+                       kvmppc_vcore_blocked(vc);
+                       spin_unlock(&vc->lock);
+               }
+       }
+       vcpu->arch.ceded = 0;
+
+       vc->vcore_state = VCORE_INACTIVE;
+       trace_kvmppc_run_core(vc, 1);
+
+ done:
+       kvmppc_remove_runnable(vc, vcpu);
+       trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
+
+       return vcpu->arch.ret;
+
+ sigpend:
+       vcpu->stat.signal_exits++;
+       kvm_run->exit_reason = KVM_EXIT_INTR;
+       vcpu->arch.ret = -EINTR;
+ out:
+       local_irq_enable();
+       preempt_enable();
+       goto done;
+}
+
 static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
        int r;
@@ -3480,7 +4174,20 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
 
        do {
-               r = kvmppc_run_vcpu(run, vcpu);
+               /*
+                * The early POWER9 chips that can't mix radix and HPT threads
+                * on the same core also need the workaround for the problem
+                * where the TLB would prefetch entries in the guest exit path
+                * for radix guests using the guest PIDR value and LPID 0.
+                * The workaround is in the old path (kvmppc_run_vcpu())
+                * but not the new path (kvmhv_run_single_vcpu()).
+                */
+               if (kvm->arch.threads_indep && kvm_is_radix(kvm) &&
+                   !no_mixing_hpt_and_radix)
+                       r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0,
+                                                 vcpu->arch.vcore->lpcr);
+               else
+                       r = kvmppc_run_vcpu(run, vcpu);
 
                if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
                    !(vcpu->arch.shregs.msr & MSR_PR)) {
@@ -3559,6 +4266,10 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
        kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
        kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
 
+       /* If running as a nested hypervisor, we don't support HPT guests */
+       if (kvmhv_on_pseries())
+               info->flags |= KVM_PPC_NO_HASH;
+
        return 0;
 }
 
@@ -3723,8 +4434,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm)
                        __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
                dw1 = PATB_GR | kvm->arch.process_table;
        }
-
-       mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
+       kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
 }
 
 /*
@@ -3820,6 +4530,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
 int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 {
+       if (nesting_enabled(kvm))
+               kvmhv_release_all_nested(kvm);
        kvmppc_free_radix(kvm);
        kvmppc_update_lpcr(kvm, LPCR_VPM1,
                           LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
@@ -3841,6 +4553,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
        kvmppc_free_hpt(&kvm->arch.hpt);
        kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
                           LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+       kvmppc_rmap_reset(kvm);
        kvm->arch.radix = 1;
        return 0;
 }
@@ -3940,6 +4653,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 
        kvmppc_alloc_host_rm_ops();
 
+       kvmhv_vm_nested_init(kvm);
+
        /*
         * Since we don't flush the TLB when tearing down a VM,
         * and this lpid might have previously been used,
@@ -3958,9 +4673,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
                kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
        /* Init LPCR for virtual RMA mode */
-       kvm->arch.host_lpid = mfspr(SPRN_LPID);
-       kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
-       lpcr &= LPCR_PECE | LPCR_LPES;
+       if (cpu_has_feature(CPU_FTR_HVMODE)) {
+               kvm->arch.host_lpid = mfspr(SPRN_LPID);
+               kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
+               lpcr &= LPCR_PECE | LPCR_LPES;
+       } else {
+               lpcr = 0;
+       }
        lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
                LPCR_VPM0 | LPCR_VPM1;
        kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
@@ -4027,8 +4746,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
         * On POWER9, we only need to do this if the "indep_threads_mode"
         * module parameter has been set to N.
         */
-       if (cpu_has_feature(CPU_FTR_ARCH_300))
-               kvm->arch.threads_indep = indep_threads_mode;
+       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+               if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
+                       pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
+                       kvm->arch.threads_indep = true;
+               } else {
+                       kvm->arch.threads_indep = indep_threads_mode;
+               }
+       }
        if (!kvm->arch.threads_indep)
                kvm_hv_vm_activated();
 
@@ -4051,6 +4776,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
        snprintf(buf, sizeof(buf), "vm%d", current->pid);
        kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
        kvmppc_mmu_debugfs_init(kvm);
+       if (radix_enabled())
+               kvmhv_radix_debugfs_init(kvm);
 
        return 0;
 }
@@ -4073,13 +4800,21 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 
        kvmppc_free_vcores(kvm);
 
-       kvmppc_free_lpid(kvm->arch.lpid);
 
        if (kvm_is_radix(kvm))
                kvmppc_free_radix(kvm);
        else
                kvmppc_free_hpt(&kvm->arch.hpt);
 
+       /* Perform global invalidation and return lpid to the pool */
+       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+               if (nesting_enabled(kvm))
+                       kvmhv_release_all_nested(kvm);
+               kvm->arch.process_table = 0;
+               kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
+       }
+       kvmppc_free_lpid(kvm->arch.lpid);
+
        kvmppc_free_pimap(kvm);
 }
 
@@ -4104,11 +4839,15 @@ static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
 
 static int kvmppc_core_check_processor_compat_hv(void)
 {
-       if (!cpu_has_feature(CPU_FTR_HVMODE) ||
-           !cpu_has_feature(CPU_FTR_ARCH_206))
-               return -EIO;
+       if (cpu_has_feature(CPU_FTR_HVMODE) &&
+           cpu_has_feature(CPU_FTR_ARCH_206))
+               return 0;
 
-       return 0;
+       /* POWER9 in radix mode is capable of being a nested hypervisor. */
+       if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
+               return 0;
+
+       return -EIO;
 }
 
 #ifdef CONFIG_KVM_XICS
@@ -4426,6 +5165,10 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
        if (radix && !radix_enabled())
                return -EINVAL;
 
+       /* If we're a nested hypervisor, we currently only support radix */
+       if (kvmhv_on_pseries() && !radix)
+               return -EINVAL;
+
        mutex_lock(&kvm->lock);
        if (radix != kvm_is_radix(kvm)) {
                if (kvm->arch.mmu_ready) {
@@ -4458,6 +5201,19 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
        return err;
 }
 
+static int kvmhv_enable_nested(struct kvm *kvm)
+{
+       if (!nested)
+               return -EPERM;
+       if (!cpu_has_feature(CPU_FTR_ARCH_300) || no_mixing_hpt_and_radix)
+               return -ENODEV;
+
+       /* kvm == NULL means the caller is testing if the capability exists */
+       if (kvm)
+               kvm->arch.nested_enable = true;
+       return 0;
+}
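For context (an editorial sketch, not part of the patch): userspace opts into nested HV with KVM_ENABLE_CAP on the VM fd. The capability name used below, KVM_CAP_PPC_NESTED_HV, is assumed to be the one this series wires to .enable_nested; treat the snippet as a hypothetical VMM helper rather than a documented API example.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical VMM helper: request nested-HV for a VM.  Returns 0 on
 * success; expect errno EPERM when the module was loaded with nested=0
 * and ENODEV on hardware that cannot support it, matching the error
 * cases in kvmhv_enable_nested() above.
 */
static int enable_nested_hv(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_NESTED_HV };

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}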
+
 static struct kvmppc_ops kvm_ops_hv = {
        .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
        .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -4497,6 +5253,7 @@ static struct kvmppc_ops kvm_ops_hv = {
        .configure_mmu = kvmhv_configure_mmu,
        .get_rmmu_info = kvmhv_get_rmmu_info,
        .set_smt_mode = kvmhv_set_smt_mode,
+       .enable_nested = kvmhv_enable_nested,
 };
 
 static int kvm_init_subcore_bitmap(void)
@@ -4547,6 +5304,10 @@ static int kvmppc_book3s_init_hv(void)
        if (r < 0)
                return -ENODEV;
 
+       r = kvmhv_nested_init();
+       if (r)
+               return r;
+
        r = kvm_init_subcore_bitmap();
        if (r)
                return r;
@@ -4557,7 +5318,8 @@ static int kvmppc_book3s_init_hv(void)
         * indirectly, via OPAL.
         */
 #ifdef CONFIG_SMP
-       if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) {
+       if (!xive_enabled() && !kvmhv_on_pseries() &&
+           !local_paca->kvm_hstate.xics_phys) {
                struct device_node *np;
 
                np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
@@ -4605,6 +5367,7 @@ static void kvmppc_book3s_exit_hv(void)
        if (kvmppc_radix_possible())
                kvmppc_radix_exit();
        kvmppc_hv_ops = NULL;
+       kvmhv_nested_exit();
 }
 
 module_init(kvmppc_book3s_init_hv);
index fc6bb96..a71e2fc 100644 (file)
@@ -231,6 +231,15 @@ void kvmhv_rm_send_ipi(int cpu)
        void __iomem *xics_phys;
        unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 
+       /* For a nested hypervisor, use the XICS via hcall */
+       if (kvmhv_on_pseries()) {
+               unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+               plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu),
+                               IPI_PRIORITY);
+               return;
+       }
+
        /* On POWER9 we can use msgsnd for any destination cpu. */
        if (cpu_has_feature(CPU_FTR_ARCH_300)) {
                msg |= get_hard_smp_processor_id(cpu);
@@ -460,12 +469,19 @@ static long kvmppc_read_one_intr(bool *again)
                return 1;
 
        /* Now read the interrupt from the ICP */
-       xics_phys = local_paca->kvm_hstate.xics_phys;
-       rc = 0;
-       if (!xics_phys)
-               rc = opal_int_get_xirr(&xirr, false);
-       else
-               xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
+       if (kvmhv_on_pseries()) {
+               unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+               rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF);
+               xirr = cpu_to_be32(retbuf[0]);
+       } else {
+               xics_phys = local_paca->kvm_hstate.xics_phys;
+               rc = 0;
+               if (!xics_phys)
+                       rc = opal_int_get_xirr(&xirr, false);
+               else
+                       xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
+       }
        if (rc < 0)
                return 1;
 
@@ -494,7 +510,13 @@ static long kvmppc_read_one_intr(bool *again)
         */
        if (xisr == XICS_IPI) {
                rc = 0;
-               if (xics_phys) {
+               if (kvmhv_on_pseries()) {
+                       unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+                       plpar_hcall_raw(H_IPI, retbuf,
+                                       hard_smp_processor_id(), 0xff);
+                       plpar_hcall_raw(H_EOI, retbuf, h_xirr);
+               } else if (xics_phys) {
                        __raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
                        __raw_rm_writel(xirr, xics_phys + XICS_XIRR);
                } else {
@@ -520,7 +542,13 @@ static long kvmppc_read_one_intr(bool *again)
                        /* We raced with the host,
                         * we need to resend that IPI, bummer
                         */
-                       if (xics_phys)
+                       if (kvmhv_on_pseries()) {
+                               unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+                               plpar_hcall_raw(H_IPI, retbuf,
+                                               hard_smp_processor_id(),
+                                               IPI_PRIORITY);
+                       } else if (xics_phys)
                                __raw_rm_writeb(IPI_PRIORITY,
                                                xics_phys + XICS_MFRR);
                        else
@@ -729,3 +757,51 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
        smp_mb();
        local_paca->kvm_hstate.kvm_split_mode = NULL;
 }
+
+/*
+ * Is there a PRIV_DOORBELL pending for the guest (on POWER9)?
+ * Can we inject a Decrementer or an External interrupt?
+ */
+void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
+{
+       int ext;
+       unsigned long vec = 0;
+       unsigned long lpcr;
+
+       /* Insert EXTERNAL bit into LPCR at the MER bit position */
+       ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
+       lpcr = mfspr(SPRN_LPCR);
+       lpcr |= ext << LPCR_MER_SH;
+       mtspr(SPRN_LPCR, lpcr);
+       isync();
+
+       if (vcpu->arch.shregs.msr & MSR_EE) {
+               if (ext) {
+                       vec = BOOK3S_INTERRUPT_EXTERNAL;
+               } else {
+                       long int dec = mfspr(SPRN_DEC);
+                       if (!(lpcr & LPCR_LD))
+                               dec = (int) dec;
+                       if (dec < 0)
+                               vec = BOOK3S_INTERRUPT_DECREMENTER;
+               }
+       }
+       if (vec) {
+               unsigned long msr, old_msr = vcpu->arch.shregs.msr;
+
+               kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
+               kvmppc_set_srr1(vcpu, old_msr);
+               kvmppc_set_pc(vcpu, vec);
+               msr = vcpu->arch.intr_msr;
+               if (MSR_TM_ACTIVE(old_msr))
+                       msr |= MSR_TS_S;
+               vcpu->arch.shregs.msr = msr;
+       }
+
+       if (vcpu->arch.doorbell_request) {
+               mtspr(SPRN_DPDES, 1);
+               vcpu->arch.vcore->dpdes = 1;
+               smp_wmb();
+               vcpu->arch.doorbell_request = 0;
+       }
+}
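One subtlety in the decrementer check above: with the large decrementer disabled (LPCR_LD clear), DEC is a 32-bit signed quantity, so the raw 64-bit mfspr value can look positive even though the counter has expired; the (int) cast performs the required sign extension. A minimal illustration, not part of the patch:

/* Illustration only: why the (int) cast matters when LPCR_LD is clear */
static long dec_sign_extend_demo(void)
{
	unsigned long raw = 0xFFFFFFFFUL;  /* expired 32-bit DEC reads back as -1 */
	long wide = (long)raw;             /* 4294967295: looks positive          */
	long narrow = (int)raw;            /* -1: correctly signals expiry        */

	return wide > 0 && narrow < 0;     /* true: only the cast catches it      */
}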
index 666b91c..a6d1001 100644 (file)
@@ -64,52 +64,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
        /* Save host PMU registers */
-BEGIN_FTR_SECTION
-       /* Work around P8 PMAE bug */
-       li      r3, -1
-       clrrdi  r3, r3, 10
-       mfspr   r8, SPRN_MMCR2
-       mtspr   SPRN_MMCR2, r3          /* freeze all counters using MMCR2 */
-       isync
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-       li      r3, 1
-       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
-       mfspr   r7, SPRN_MMCR0          /* save MMCR0 */
-       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable interrupts */
-       mfspr   r6, SPRN_MMCRA
-       /* Clear MMCRA in order to disable SDAR updates */
-       li      r5, 0
-       mtspr   SPRN_MMCRA, r5
-       isync
-       lbz     r5, PACA_PMCINUSE(r13)  /* is the host using the PMU? */
-       cmpwi   r5, 0
-       beq     31f                     /* skip if not */
-       mfspr   r5, SPRN_MMCR1
-       mfspr   r9, SPRN_SIAR
-       mfspr   r10, SPRN_SDAR
-       std     r7, HSTATE_MMCR0(r13)
-       std     r5, HSTATE_MMCR1(r13)
-       std     r6, HSTATE_MMCRA(r13)
-       std     r9, HSTATE_SIAR(r13)
-       std     r10, HSTATE_SDAR(r13)
-BEGIN_FTR_SECTION
-       mfspr   r9, SPRN_SIER
-       std     r8, HSTATE_MMCR2(r13)
-       std     r9, HSTATE_SIER(r13)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-       mfspr   r3, SPRN_PMC1
-       mfspr   r5, SPRN_PMC2
-       mfspr   r6, SPRN_PMC3
-       mfspr   r7, SPRN_PMC4
-       mfspr   r8, SPRN_PMC5
-       mfspr   r9, SPRN_PMC6
-       stw     r3, HSTATE_PMC1(r13)
-       stw     r5, HSTATE_PMC2(r13)
-       stw     r6, HSTATE_PMC3(r13)
-       stw     r7, HSTATE_PMC4(r13)
-       stw     r8, HSTATE_PMC5(r13)
-       stw     r9, HSTATE_PMC6(r13)
-31:
+       bl      kvmhv_save_host_pmu
 
        /*
         * Put whatever is in the decrementer into the
@@ -161,3 +116,51 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
        ld      r0, PPC_LR_STKOFF(r1)
        mtlr    r0
        blr
+
+_GLOBAL(kvmhv_save_host_pmu)
+BEGIN_FTR_SECTION
+       /* Work around P8 PMAE bug */
+       li      r3, -1
+       clrrdi  r3, r3, 10
+       mfspr   r8, SPRN_MMCR2
+       mtspr   SPRN_MMCR2, r3          /* freeze all counters using MMCR2 */
+       isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       li      r3, 1
+       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
+       mfspr   r7, SPRN_MMCR0          /* save MMCR0 */
+       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable interrupts */
+       mfspr   r6, SPRN_MMCRA
+       /* Clear MMCRA in order to disable SDAR updates */
+       li      r5, 0
+       mtspr   SPRN_MMCRA, r5
+       isync
+       lbz     r5, PACA_PMCINUSE(r13)  /* is the host using the PMU? */
+       cmpwi   r5, 0
+       beq     31f                     /* skip if not */
+       mfspr   r5, SPRN_MMCR1
+       mfspr   r9, SPRN_SIAR
+       mfspr   r10, SPRN_SDAR
+       std     r7, HSTATE_MMCR0(r13)
+       std     r5, HSTATE_MMCR1(r13)
+       std     r6, HSTATE_MMCRA(r13)
+       std     r9, HSTATE_SIAR(r13)
+       std     r10, HSTATE_SDAR(r13)
+BEGIN_FTR_SECTION
+       mfspr   r9, SPRN_SIER
+       std     r8, HSTATE_MMCR2(r13)
+       std     r9, HSTATE_SIER(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       mfspr   r3, SPRN_PMC1
+       mfspr   r5, SPRN_PMC2
+       mfspr   r6, SPRN_PMC3
+       mfspr   r7, SPRN_PMC4
+       mfspr   r8, SPRN_PMC5
+       mfspr   r9, SPRN_PMC6
+       stw     r3, HSTATE_PMC1(r13)
+       stw     r5, HSTATE_PMC2(r13)
+       stw     r6, HSTATE_PMC3(r13)
+       stw     r7, HSTATE_PMC4(r13)
+       stw     r8, HSTATE_PMC5(r13)
+       stw     r9, HSTATE_PMC6(r13)
+31:    blr
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
new file mode 100644 (file)
index 0000000..401d2ec
--- /dev/null
@@ -0,0 +1,1291 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2018
+ * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
+ *        Paul Mackerras <paulus@ozlabs.org>
+ *
+ * Description: KVM functions specific to running nested KVM-HV guests
+ * on Book3S processors (specifically POWER9 and later).
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/llist.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/pte-walk.h>
+#include <asm/reg.h>
+
+static struct patb_entry *pseries_partition_tb;
+
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
+
+void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       hr->pcr = vc->pcr;
+       hr->dpdes = vc->dpdes;
+       hr->hfscr = vcpu->arch.hfscr;
+       hr->tb_offset = vc->tb_offset;
+       hr->dawr0 = vcpu->arch.dawr;
+       hr->dawrx0 = vcpu->arch.dawrx;
+       hr->ciabr = vcpu->arch.ciabr;
+       hr->purr = vcpu->arch.purr;
+       hr->spurr = vcpu->arch.spurr;
+       hr->ic = vcpu->arch.ic;
+       hr->vtb = vc->vtb;
+       hr->srr0 = vcpu->arch.shregs.srr0;
+       hr->srr1 = vcpu->arch.shregs.srr1;
+       hr->sprg[0] = vcpu->arch.shregs.sprg0;
+       hr->sprg[1] = vcpu->arch.shregs.sprg1;
+       hr->sprg[2] = vcpu->arch.shregs.sprg2;
+       hr->sprg[3] = vcpu->arch.shregs.sprg3;
+       hr->pidr = vcpu->arch.pid;
+       hr->cfar = vcpu->arch.cfar;
+       hr->ppr = vcpu->arch.ppr;
+}
+
+static void byteswap_pt_regs(struct pt_regs *regs)
+{
+       unsigned long *addr = (unsigned long *) regs;
+
+       for (; addr < ((unsigned long *) (regs + 1)); addr++)
+               *addr = swab64(*addr);
+}
+
+static void byteswap_hv_regs(struct hv_guest_state *hr)
+{
+       hr->version = swab64(hr->version);
+       hr->lpid = swab32(hr->lpid);
+       hr->vcpu_token = swab32(hr->vcpu_token);
+       hr->lpcr = swab64(hr->lpcr);
+       hr->pcr = swab64(hr->pcr);
+       hr->amor = swab64(hr->amor);
+       hr->dpdes = swab64(hr->dpdes);
+       hr->hfscr = swab64(hr->hfscr);
+       hr->tb_offset = swab64(hr->tb_offset);
+       hr->dawr0 = swab64(hr->dawr0);
+       hr->dawrx0 = swab64(hr->dawrx0);
+       hr->ciabr = swab64(hr->ciabr);
+       hr->hdec_expiry = swab64(hr->hdec_expiry);
+       hr->purr = swab64(hr->purr);
+       hr->spurr = swab64(hr->spurr);
+       hr->ic = swab64(hr->ic);
+       hr->vtb = swab64(hr->vtb);
+       hr->hdar = swab64(hr->hdar);
+       hr->hdsisr = swab64(hr->hdsisr);
+       hr->heir = swab64(hr->heir);
+       hr->asdr = swab64(hr->asdr);
+       hr->srr0 = swab64(hr->srr0);
+       hr->srr1 = swab64(hr->srr1);
+       hr->sprg[0] = swab64(hr->sprg[0]);
+       hr->sprg[1] = swab64(hr->sprg[1]);
+       hr->sprg[2] = swab64(hr->sprg[2]);
+       hr->sprg[3] = swab64(hr->sprg[3]);
+       hr->pidr = swab64(hr->pidr);
+       hr->cfar = swab64(hr->cfar);
+       hr->ppr = swab64(hr->ppr);
+}
+
+static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
+                                struct hv_guest_state *hr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       hr->dpdes = vc->dpdes;
+       hr->hfscr = vcpu->arch.hfscr;
+       hr->purr = vcpu->arch.purr;
+       hr->spurr = vcpu->arch.spurr;
+       hr->ic = vcpu->arch.ic;
+       hr->vtb = vc->vtb;
+       hr->srr0 = vcpu->arch.shregs.srr0;
+       hr->srr1 = vcpu->arch.shregs.srr1;
+       hr->sprg[0] = vcpu->arch.shregs.sprg0;
+       hr->sprg[1] = vcpu->arch.shregs.sprg1;
+       hr->sprg[2] = vcpu->arch.shregs.sprg2;
+       hr->sprg[3] = vcpu->arch.shregs.sprg3;
+       hr->pidr = vcpu->arch.pid;
+       hr->cfar = vcpu->arch.cfar;
+       hr->ppr = vcpu->arch.ppr;
+       switch (trap) {
+       case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+               hr->hdar = vcpu->arch.fault_dar;
+               hr->hdsisr = vcpu->arch.fault_dsisr;
+               hr->asdr = vcpu->arch.fault_gpa;
+               break;
+       case BOOK3S_INTERRUPT_H_INST_STORAGE:
+               hr->asdr = vcpu->arch.fault_gpa;
+               break;
+       case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+               hr->heir = vcpu->arch.emul_inst;
+               break;
+       }
+}
+
+static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+       /*
+        * Don't let L1 enable features for L2 which we've disabled for L1,
+        * but preserve the interrupt cause field.
+        */
+       hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
+
+       /* Don't let data address watchpoint match in hypervisor state */
+       hr->dawrx0 &= ~DAWRX_HYP;
+
+       /* Don't let completed instruction address breakpt match in HV state */
+       if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
+               hr->ciabr &= ~CIABR_PRIV;
+}
+
+static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       vc->pcr = hr->pcr;
+       vc->dpdes = hr->dpdes;
+       vcpu->arch.hfscr = hr->hfscr;
+       vcpu->arch.dawr = hr->dawr0;
+       vcpu->arch.dawrx = hr->dawrx0;
+       vcpu->arch.ciabr = hr->ciabr;
+       vcpu->arch.purr = hr->purr;
+       vcpu->arch.spurr = hr->spurr;
+       vcpu->arch.ic = hr->ic;
+       vc->vtb = hr->vtb;
+       vcpu->arch.shregs.srr0 = hr->srr0;
+       vcpu->arch.shregs.srr1 = hr->srr1;
+       vcpu->arch.shregs.sprg0 = hr->sprg[0];
+       vcpu->arch.shregs.sprg1 = hr->sprg[1];
+       vcpu->arch.shregs.sprg2 = hr->sprg[2];
+       vcpu->arch.shregs.sprg3 = hr->sprg[3];
+       vcpu->arch.pid = hr->pidr;
+       vcpu->arch.cfar = hr->cfar;
+       vcpu->arch.ppr = hr->ppr;
+}
+
+void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
+                                  struct hv_guest_state *hr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       vc->dpdes = hr->dpdes;
+       vcpu->arch.hfscr = hr->hfscr;
+       vcpu->arch.purr = hr->purr;
+       vcpu->arch.spurr = hr->spurr;
+       vcpu->arch.ic = hr->ic;
+       vc->vtb = hr->vtb;
+       vcpu->arch.fault_dar = hr->hdar;
+       vcpu->arch.fault_dsisr = hr->hdsisr;
+       vcpu->arch.fault_gpa = hr->asdr;
+       vcpu->arch.emul_inst = hr->heir;
+       vcpu->arch.shregs.srr0 = hr->srr0;
+       vcpu->arch.shregs.srr1 = hr->srr1;
+       vcpu->arch.shregs.sprg0 = hr->sprg[0];
+       vcpu->arch.shregs.sprg1 = hr->sprg[1];
+       vcpu->arch.shregs.sprg2 = hr->sprg[2];
+       vcpu->arch.shregs.sprg3 = hr->sprg[3];
+       vcpu->arch.pid = hr->pidr;
+       vcpu->arch.cfar = hr->cfar;
+       vcpu->arch.ppr = hr->ppr;
+}
+
+long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
+{
+       long int err, r;
+       struct kvm_nested_guest *l2;
+       struct pt_regs l2_regs, saved_l1_regs;
+       struct hv_guest_state l2_hv, saved_l1_hv;
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       u64 hv_ptr, regs_ptr;
+       u64 hdec_exp;
+       s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
+       u64 mask;
+       unsigned long lpcr;
+
+       if (vcpu->kvm->arch.l1_ptcr == 0)
+               return H_NOT_AVAILABLE;
+
+       /* copy parameters in */
+       hv_ptr = kvmppc_get_gpr(vcpu, 4);
+       err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
+                                 sizeof(struct hv_guest_state));
+       if (err)
+               return H_PARAMETER;
+       if (kvmppc_need_byteswap(vcpu))
+               byteswap_hv_regs(&l2_hv);
+       if (l2_hv.version != HV_GUEST_STATE_VERSION)
+               return H_P2;
+
+       regs_ptr = kvmppc_get_gpr(vcpu, 5);
+       err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
+                                 sizeof(struct pt_regs));
+       if (err)
+               return H_PARAMETER;
+       if (kvmppc_need_byteswap(vcpu))
+               byteswap_pt_regs(&l2_regs);
+       if (l2_hv.vcpu_token >= NR_CPUS)
+               return H_PARAMETER;
+
+       /* translate lpid */
+       l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
+       if (!l2)
+               return H_PARAMETER;
+       if (!l2->l1_gr_to_hr) {
+               mutex_lock(&l2->tlb_lock);
+               kvmhv_update_ptbl_cache(l2);
+               mutex_unlock(&l2->tlb_lock);
+       }
+
+       /* save l1 values of things */
+       vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+       saved_l1_regs = vcpu->arch.regs;
+       kvmhv_save_hv_regs(vcpu, &saved_l1_hv);
+
+       /* convert TB values/offsets to host (L0) values */
+       hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
+       vc->tb_offset += l2_hv.tb_offset;
+
+       /* set L1 state to L2 state */
+       vcpu->arch.nested = l2;
+       vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
+       vcpu->arch.regs = l2_regs;
+       vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
+       mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
+               LPCR_LPES | LPCR_MER;
+       lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
+       sanitise_hv_regs(vcpu, &l2_hv);
+       restore_hv_regs(vcpu, &l2_hv);
+
+       vcpu->arch.ret = RESUME_GUEST;
+       vcpu->arch.trap = 0;
+       do {
+               if (mftb() >= hdec_exp) {
+                       vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
+                       r = RESUME_HOST;
+                       break;
+               }
+               r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp,
+                                         lpcr);
+       } while (is_kvmppc_resume_guest(r));
+
+       /* save L2 state for return */
+       l2_regs = vcpu->arch.regs;
+       l2_regs.msr = vcpu->arch.shregs.msr;
+       delta_purr = vcpu->arch.purr - l2_hv.purr;
+       delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
+       delta_ic = vcpu->arch.ic - l2_hv.ic;
+       delta_vtb = vc->vtb - l2_hv.vtb;
+       save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
+
+       /* restore L1 state */
+       vcpu->arch.nested = NULL;
+       vcpu->arch.regs = saved_l1_regs;
+       vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
+       /* set L1 MSR TS field according to L2 transaction state */
+       if (l2_regs.msr & MSR_TS_MASK)
+               vcpu->arch.shregs.msr |= MSR_TS_S;
+       vc->tb_offset = saved_l1_hv.tb_offset;
+       restore_hv_regs(vcpu, &saved_l1_hv);
+       vcpu->arch.purr += delta_purr;
+       vcpu->arch.spurr += delta_spurr;
+       vcpu->arch.ic += delta_ic;
+       vc->vtb += delta_vtb;
+
+       kvmhv_put_nested(l2);
+
+       /* copy l2_hv_state and regs back to guest */
+       if (kvmppc_need_byteswap(vcpu)) {
+               byteswap_hv_regs(&l2_hv);
+               byteswap_pt_regs(&l2_regs);
+       }
+       err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
+                                  sizeof(struct hv_guest_state));
+       if (err)
+               return H_AUTHORITY;
+       err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
+                                  sizeof(struct pt_regs));
+       if (err)
+               return H_AUTHORITY;
+
+       if (r == -EINTR)
+               return H_INTERRUPT;
+
+       return vcpu->arch.trap;
+}
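A worked example of the timebase bookkeeping above, with illustrative numbers rather than anything from the patch:

/*
 * Suppose L0's timebase reads 1000 and L1 runs with tb_offset = +100
 * (so L1 sees 1100).  If L2 requests tb_offset = +10 on top of that,
 * the vcore carries tb_offset = 110 while L2 runs, and an L1-relative
 * HDEC expiry of 1150 converts to 1150 - 100 = 1050 in L0 terms, which
 * is the value compared against mftb() in the loop above.
 */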
+
+long kvmhv_nested_init(void)
+{
+       long int ptb_order;
+       unsigned long ptcr;
+       long rc;
+
+       if (!kvmhv_on_pseries())
+               return 0;
+       if (!radix_enabled())
+               return -ENODEV;
+
+       /* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
+       ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
+       if (ptb_order < 8)
+               ptb_order = 8;
+       pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
+                                      GFP_KERNEL);
+       if (!pseries_partition_tb) {
+               pr_err("kvm-hv: failed to allocate nested partition table\n");
+               return -ENOMEM;
+       }
+
+       ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
+       rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
+       if (rc != H_SUCCESS) {
+               pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
+                      rc);
+               kfree(pseries_partition_tb);
+               pseries_partition_tb = NULL;
+               return -ENODEV;
+       }
+
+       return 0;
+}
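The size-field arithmetic above is worth spelling out (a sketch, assuming sizeof(struct patb_entry) == 16, which the shifted allocation implies):

/*
 * A table of 2^ptb_order entries occupies 2^(ptb_order + 4) bytes.  The
 * PTCR encodes log2(table size in bytes) - 12, i.e. ptb_order - 8, which
 * is why ptb_order is clamped to a minimum of 8: a 4 KB table with a
 * size field of 0.
 */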
+
+void kvmhv_nested_exit(void)
+{
+       /*
+        * N.B. the kvmhv_on_pseries() test is there because it enables
+        * the compiler to remove the call to plpar_hcall_norets()
+        * when CONFIG_PPC_PSERIES=n.
+        */
+       if (kvmhv_on_pseries() && pseries_partition_tb) {
+               plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
+               kfree(pseries_partition_tb);
+               pseries_partition_tb = NULL;
+       }
+}
+
+static void kvmhv_flush_lpid(unsigned int lpid)
+{
+       long rc;
+
+       if (!kvmhv_on_pseries()) {
+               radix__flush_tlb_lpid(lpid);
+               return;
+       }
+
+       rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
+                               lpid, TLBIEL_INVAL_SET_LPID);
+       if (rc)
+               pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
+}
+
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
+{
+       if (!kvmhv_on_pseries()) {
+               mmu_partition_table_set_entry(lpid, dw0, dw1);
+               return;
+       }
+
+       pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+       pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+       /* L0 will do the necessary barriers */
+       kvmhv_flush_lpid(lpid);
+}
+
+static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
+{
+       unsigned long dw0;
+
+       dw0 = PATB_HR | radix__get_tree_size() |
+               __pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
+       kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
+}
+
+void kvmhv_vm_nested_init(struct kvm *kvm)
+{
+       kvm->arch.max_nested_lpid = -1;
+}
+
+/*
+ * Handle the H_SET_PARTITION_TABLE hcall.
+ * r4 = guest real address of partition table + log_2(size) - 12
+ * (formatted as for the PTCR).
+ */
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
+       int srcu_idx;
+       long ret = H_SUCCESS;
+
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+       /*
+        * Limit the partition table to 4096 entries (because that's what
+        * hardware supports), and check the base address.
+        */
+       if ((ptcr & PRTS_MASK) > 12 - 8 ||
+           !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
+               ret = H_PARAMETER;
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
+       if (ret == H_SUCCESS)
+               kvm->arch.l1_ptcr = ptcr;
+       return ret;
+}
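As a quick check on the bound above (editorial note, not part of the patch): PRTS encodes log2(table size in bytes) - 12, so limiting it to 12 - 8 = 4 caps the table at 2^(4+12) = 64 KiB, which at 16 bytes per entry is exactly the 4096 entries the comment promises.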
+
+/*
+ * Reload the partition table entry for a guest.
+ * Caller must hold gp->tlb_lock.
+ */
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
+{
+       int ret;
+       struct patb_entry ptbl_entry;
+       unsigned long ptbl_addr;
+       struct kvm *kvm = gp->l1_host;
+
+       ret = -EFAULT;
+       ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
+       if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8)))
+               ret = kvm_read_guest(kvm, ptbl_addr,
+                                    &ptbl_entry, sizeof(ptbl_entry));
+       if (ret) {
+               gp->l1_gr_to_hr = 0;
+               gp->process_table = 0;
+       } else {
+               gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
+               gp->process_table = be64_to_cpu(ptbl_entry.patb1);
+       }
+       kvmhv_set_nested_ptbl(gp);
+}
+
+struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
+{
+       struct kvm_nested_guest *gp;
+       long shadow_lpid;
+
+       gp = kzalloc(sizeof(*gp), GFP_KERNEL);
+       if (!gp)
+               return NULL;
+       gp->l1_host = kvm;
+       gp->l1_lpid = lpid;
+       mutex_init(&gp->tlb_lock);
+       gp->shadow_pgtable = pgd_alloc(kvm->mm);
+       if (!gp->shadow_pgtable)
+               goto out_free;
+       shadow_lpid = kvmppc_alloc_lpid();
+       if (shadow_lpid < 0)
+               goto out_free2;
+       gp->shadow_lpid = shadow_lpid;
+
+       memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
+
+       return gp;
+
+ out_free2:
+       pgd_free(kvm->mm, gp->shadow_pgtable);
+ out_free:
+       kfree(gp);
+       return NULL;
+}
+
+/*
+ * Free up any resources allocated for a nested guest.
+ */
+static void kvmhv_release_nested(struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = gp->l1_host;
+
+       if (gp->shadow_pgtable) {
+               /*
+                * No vcpu is using this struct and no call to
+                * kvmhv_get_nested can find this struct,
+                * so we don't need to hold kvm->mmu_lock.
+                */
+               kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
+                                         gp->shadow_lpid);
+               pgd_free(kvm->mm, gp->shadow_pgtable);
+       }
+       kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
+       kvmppc_free_lpid(gp->shadow_lpid);
+       kfree(gp);
+}
+
+static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = gp->l1_host;
+       int lpid = gp->l1_lpid;
+       long ref;
+
+       spin_lock(&kvm->mmu_lock);
+       if (gp == kvm->arch.nested_guests[lpid]) {
+               kvm->arch.nested_guests[lpid] = NULL;
+               if (lpid == kvm->arch.max_nested_lpid) {
+                       while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
+                               ;
+                       kvm->arch.max_nested_lpid = lpid;
+               }
+               --gp->refcnt;
+       }
+       ref = gp->refcnt;
+       spin_unlock(&kvm->mmu_lock);
+       if (ref == 0)
+               kvmhv_release_nested(gp);
+}
+
+/*
+ * Free up all nested resources allocated for this guest.
+ * This is called with no vcpus of the guest running, when
+ * switching the guest to HPT mode or when destroying the
+ * guest.
+ */
+void kvmhv_release_all_nested(struct kvm *kvm)
+{
+       int i;
+       struct kvm_nested_guest *gp;
+       struct kvm_nested_guest *freelist = NULL;
+       struct kvm_memory_slot *memslot;
+       int srcu_idx;
+
+       spin_lock(&kvm->mmu_lock);
+       for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
+               gp = kvm->arch.nested_guests[i];
+               if (!gp)
+                       continue;
+               kvm->arch.nested_guests[i] = NULL;
+               if (--gp->refcnt == 0) {
+                       gp->next = freelist;
+                       freelist = gp;
+               }
+       }
+       kvm->arch.max_nested_lpid = -1;
+       spin_unlock(&kvm->mmu_lock);
+       while ((gp = freelist) != NULL) {
+               freelist = gp->next;
+               kvmhv_release_nested(gp);
+       }
+
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+       kvm_for_each_memslot(memslot, kvm_memslots(kvm))
+               kvmhv_free_memslot_nest_rmap(memslot);
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+/* caller must hold gp->tlb_lock */
+static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = gp->l1_host;
+
+       spin_lock(&kvm->mmu_lock);
+       kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid);
+       spin_unlock(&kvm->mmu_lock);
+       kvmhv_flush_lpid(gp->shadow_lpid);
+       kvmhv_update_ptbl_cache(gp);
+       if (gp->l1_gr_to_hr == 0)
+               kvmhv_remove_nested(gp);
+}
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+                                         bool create)
+{
+       struct kvm_nested_guest *gp, *newgp;
+
+       if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
+           l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
+               return NULL;
+
+       spin_lock(&kvm->mmu_lock);
+       gp = kvm->arch.nested_guests[l1_lpid];
+       if (gp)
+               ++gp->refcnt;
+       spin_unlock(&kvm->mmu_lock);
+
+       if (gp || !create)
+               return gp;
+
+       newgp = kvmhv_alloc_nested(kvm, l1_lpid);
+       if (!newgp)
+               return NULL;
+       spin_lock(&kvm->mmu_lock);
+       if (kvm->arch.nested_guests[l1_lpid]) {
+               /* someone else beat us to it */
+               gp = kvm->arch.nested_guests[l1_lpid];
+       } else {
+               kvm->arch.nested_guests[l1_lpid] = newgp;
+               ++newgp->refcnt;
+               gp = newgp;
+               newgp = NULL;
+               if (l1_lpid > kvm->arch.max_nested_lpid)
+                       kvm->arch.max_nested_lpid = l1_lpid;
+       }
+       ++gp->refcnt;
+       spin_unlock(&kvm->mmu_lock);
+
+       if (newgp)
+               kvmhv_release_nested(newgp);
+
+       return gp;
+}
+
+void kvmhv_put_nested(struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = gp->l1_host;
+       long ref;
+
+       spin_lock(&kvm->mmu_lock);
+       ref = --gp->refcnt;
+       spin_unlock(&kvm->mmu_lock);
+       if (ref == 0)
+               kvmhv_release_nested(gp);
+}
+
+static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
+{
+       if (lpid > kvm->arch.max_nested_lpid)
+               return NULL;
+       return kvm->arch.nested_guests[lpid];
+}
+
+static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
+{
+       return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
+                                      RMAP_NESTED_GPA_MASK));
+}
+
+void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
+                           struct rmap_nested **n_rmap)
+{
+       struct llist_node *entry = ((struct llist_head *) rmapp)->first;
+       struct rmap_nested *cursor;
+       u64 rmap, new_rmap = (*n_rmap)->rmap;
+
+       /* Are there any existing entries? */
+       if (!(*rmapp)) {
+               /* No -> use the rmap as a single entry */
+               *rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
+               return;
+       }
+
+       /* Do any entries match what we're trying to insert? */
+       for_each_nest_rmap_safe(cursor, entry, &rmap) {
+               if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
+                       return;
+       }
+
+       /* Do we need to create a list or just add the new entry? */
+       rmap = *rmapp;
+       if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
+               *rmapp = 0UL;
+       llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
+       if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
+               (*n_rmap)->list.next = (struct llist_node *) rmap;
+
+       /* Set NULL so not freed by caller */
+       *n_rmap = NULL;
+}
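The encoding used here avoids a heap allocation for the common single-mapping case. A sketch of the three states a guest page's rmap word can be in (illustration, not part of the patch):

/*
 *   *rmapp == 0                                    no nested mappings
 *   *rmapp == rmap | RMAP_NESTED_IS_SINGLE_ENTRY   one mapping, stored inline
 *   *rmapp == llist head pointer                   two or more, heap-allocated
 *
 * On the second distinct insertion the inline value is re-threaded as
 * the tail node of the new list, which is what the ->list.next fixup
 * above does.
 */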
+
+static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
+                                  unsigned long hpa, unsigned long mask)
+{
+       struct kvm_nested_guest *gp;
+       unsigned long gpa;
+       unsigned int shift, lpid;
+       pte_t *ptep;
+
+       gpa = n_rmap & RMAP_NESTED_GPA_MASK;
+       lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
+       gp = kvmhv_find_nested(kvm, lpid);
+       if (!gp)
+               return;
+
+       /* Find and invalidate the pte */
+       ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+       /* Don't spuriously invalidate ptes if the pfn has changed */
+       if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
+               kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
+}
+
+static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
+                                       unsigned long hpa, unsigned long mask)
+{
+       struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
+       struct rmap_nested *cursor;
+       unsigned long rmap;
+
+       for_each_nest_rmap_safe(cursor, entry, &rmap) {
+               kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
+               kfree(cursor);
+       }
+}
+
+/* called with kvm->mmu_lock held */
+void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
+                                 struct kvm_memory_slot *memslot,
+                                 unsigned long gpa, unsigned long hpa,
+                                 unsigned long nbytes)
+{
+       unsigned long gfn, end_gfn;
+       unsigned long addr_mask;
+
+       if (!memslot)
+               return;
+       gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
+       end_gfn = gfn + (nbytes >> PAGE_SHIFT);
+
+       addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
+       hpa &= addr_mask;
+
+       for (; gfn < end_gfn; gfn++) {
+               unsigned long *rmap = &memslot->arch.rmap[gfn];
+               kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
+       }
+}
+
+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
+{
+       unsigned long page;
+
+       for (page = 0; page < free->npages; page++) {
+               unsigned long rmap, *rmapp = &free->arch.rmap[page];
+               struct rmap_nested *cursor;
+               struct llist_node *entry;
+
+               entry = llist_del_all((struct llist_head *) rmapp);
+               for_each_nest_rmap_safe(cursor, entry, &rmap)
+                       kfree(cursor);
+       }
+}
+
+static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
+                                       struct kvm_nested_guest *gp,
+                                       long gpa, int *shift_ret)
+{
+       struct kvm *kvm = vcpu->kvm;
+       bool ret = false;
+       pte_t *ptep;
+       int shift;
+
+       spin_lock(&kvm->mmu_lock);
+       ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+       if (!shift)
+               shift = PAGE_SHIFT;
+       if (ptep && pte_present(*ptep)) {
+               kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
+               ret = true;
+       }
+       spin_unlock(&kvm->mmu_lock);
+
+       if (shift_ret)
+               *shift_ret = shift;
+       return ret;
+}
+
+static inline int get_ric(unsigned int instr)
+{
+       return (instr >> 18) & 0x3;
+}
+
+static inline int get_prs(unsigned int instr)
+{
+       return (instr >> 17) & 0x1;
+}
+
+static inline int get_r(unsigned int instr)
+{
+       return (instr >> 16) & 0x1;
+}
+
+static inline int get_lpid(unsigned long r_val)
+{
+       return r_val & 0xffffffff;
+}
+
+static inline int get_is(unsigned long r_val)
+{
+       return (r_val >> 10) & 0x3;
+}
+
+static inline int get_ap(unsigned long r_val)
+{
+       return (r_val >> 5) & 0x7;
+}
+
+static inline long get_epn(unsigned long r_val)
+{
+       return r_val >> 12;
+}
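A decode sketch for the helpers above, with illustrative values rather than anything taken from the patch:

/*
 * tlbie with R=1, PRS=0, RIC=0; RS = 4; RB = (0x12345 << 12) | (5 << 5),
 * i.e. EA 0x12345000, AP = 5, IS = 0, targeting LPID 4:
 *
 *   get_ric()  -> 0     get_prs() -> 0     get_r()  -> 1
 *   get_lpid() -> 4     get_is()  -> 0     get_ap() -> 5
 *   get_epn()  -> 0x12345
 *
 * With is == 0 and ric == 0, kvmhv_emulate_priv_tlbie() routes this to
 * kvmhv_emulate_tlbie_tlb_addr() for that single address.
 */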
+
+static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid,
+                                       int ap, long epn)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_nested_guest *gp;
+       long npages;
+       int shift, shadow_shift;
+       unsigned long addr;
+
+       shift = ap_to_shift(ap);
+       addr = epn << 12;
+       if (shift < 0)
+               /* Invalid ap encoding */
+               return -EINVAL;
+
+       addr &= ~((1UL << shift) - 1);
+       npages = 1UL << (shift - PAGE_SHIFT);
+
+       gp = kvmhv_get_nested(kvm, lpid, false);
+       if (!gp) /* No such guest -> nothing to do */
+               return 0;
+       mutex_lock(&gp->tlb_lock);
+
+       /* There may be more than one host page backing this single guest pte */
+       do {
+               kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift);
+
+               npages -= 1UL << (shadow_shift - PAGE_SHIFT);
+               addr += 1UL << shadow_shift;
+       } while (npages > 0);
+
+       mutex_unlock(&gp->tlb_lock);
+       kvmhv_put_nested(gp);
+       return 0;
+}
+
+static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
+                                    struct kvm_nested_guest *gp, int ric)
+{
+       struct kvm *kvm = vcpu->kvm;
+
+       mutex_lock(&gp->tlb_lock);
+       switch (ric) {
+       case 0:
+               /* Invalidate TLB */
+               spin_lock(&kvm->mmu_lock);
+               kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
+                                         gp->shadow_lpid);
+               kvmhv_flush_lpid(gp->shadow_lpid);
+               spin_unlock(&kvm->mmu_lock);
+               break;
+       case 1:
+               /*
+                * Invalidate PWC
+                * We don't cache this -> nothing to do
+                */
+               break;
+       case 2:
+               /* Invalidate TLB, PWC and caching of partition table entries */
+               kvmhv_flush_nested(gp);
+               break;
+       default:
+               break;
+       }
+       mutex_unlock(&gp->tlb_lock);
+}
+
+static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_nested_guest *gp;
+       int i;
+
+       spin_lock(&kvm->mmu_lock);
+       for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
+               gp = kvm->arch.nested_guests[i];
+               if (gp) {
+                       spin_unlock(&kvm->mmu_lock);
+                       kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+                       spin_lock(&kvm->mmu_lock);
+               }
+       }
+       spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
+                                   unsigned long rsval, unsigned long rbval)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_nested_guest *gp;
+       int r, ric, prs, is, ap;
+       int lpid;
+       long epn;
+       int ret = 0;
+
+       ric = get_ric(instr);
+       prs = get_prs(instr);
+       r = get_r(instr);
+       lpid = get_lpid(rsval);
+       is = get_is(rbval);
+
+       /*
+        * These cases are invalid and are not handled:
+        * r   != 1 -> Only radix supported
+        * prs == 1 -> Not HV privileged
+        * ric == 3 -> No cluster bombs for radix
+        * is  == 1 -> Partition scoped translations not associated with pid
+        * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA
+        */
+       if ((!r) || (prs) || (ric == 3) || (is == 1) ||
+           ((!is) && (ric == 1 || ric == 2)))
+               return -EINVAL;
+
+       switch (is) {
+       case 0:
+               /*
+                * We know ric == 0
+                * Invalidate TLB for a given target address
+                */
+               epn = get_epn(rbval);
+               ap = get_ap(rbval);
+               ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn);
+               break;
+       case 2:
+               /* Invalidate matching LPID */
+               gp = kvmhv_get_nested(kvm, lpid, false);
+               if (gp) {
+                       kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+                       kvmhv_put_nested(gp);
+               }
+               break;
+       case 3:
+               /* Invalidate ALL LPIDs */
+               kvmhv_emulate_tlbie_all_lpid(vcpu, ric);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       return ret;
+}
+
+/*
+ * This handles the H_TLB_INVALIDATE hcall.
+ * Parameters are (r4) tlbie instruction code, (r5) rS contents,
+ * (r6) rB contents.
+ */
+long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
+{
+       int ret;
+
+       ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4),
+                       kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6));
+       if (ret)
+               return H_PARAMETER;
+       return H_SUCCESS;
+}
+
+/* Used to convert a nested guest real address to an L1 guest real address */
+static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
+                                      struct kvm_nested_guest *gp,
+                                      unsigned long n_gpa, unsigned long dsisr,
+                                      struct kvmppc_pte *gpte_p)
+{
+       u64 fault_addr, flags = dsisr & DSISR_ISSTORE;
+       int ret;
+
+       ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr,
+                                        &fault_addr);
+
+       if (ret) {
+               /* We didn't find a pte */
+               if (ret == -EINVAL) {
+                       /* Unsupported mmu config */
+                       flags |= DSISR_UNSUPP_MMU;
+               } else if (ret == -ENOENT) {
+                       /* No translation found */
+                       flags |= DSISR_NOHPTE;
+               } else if (ret == -EFAULT) {
+                       /* Couldn't access L1 real address */
+                       flags |= DSISR_PRTABLE_FAULT;
+                       vcpu->arch.fault_gpa = fault_addr;
+               } else {
+                       /* Unknown error */
+                       return ret;
+               }
+               goto forward_to_l1;
+       } else {
+               /* We found a pte -> check permissions */
+               if (dsisr & DSISR_ISSTORE) {
+                       /* Can we write? */
+                       if (!gpte_p->may_write) {
+                               flags |= DSISR_PROTFAULT;
+                               goto forward_to_l1;
+                       }
+               } else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+                       /* Can we execute? */
+                       if (!gpte_p->may_execute) {
+                               flags |= SRR1_ISI_N_OR_G;
+                               goto forward_to_l1;
+                       }
+               } else {
+                       /* Can we read? */
+                       if (!gpte_p->may_read && !gpte_p->may_write) {
+                               flags |= DSISR_PROTFAULT;
+                               goto forward_to_l1;
+                       }
+               }
+       }
+
+       return 0;
+
+forward_to_l1:
+       vcpu->arch.fault_dsisr = flags;
+       if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+               vcpu->arch.shregs.msr &= ~0x783f0000ul;
+               vcpu->arch.shregs.msr |= flags;
+       }
+       return RESUME_HOST;
+}
+
+static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
+                                      struct kvm_nested_guest *gp,
+                                      unsigned long n_gpa,
+                                      struct kvmppc_pte gpte,
+                                      unsigned long dsisr)
+{
+       struct kvm *kvm = vcpu->kvm;
+       bool writing = !!(dsisr & DSISR_ISSTORE);
+       u64 pgflags;
+       bool ret;
+
+       /* Are the rc bits set in the L1 partition scoped pte? */
+       pgflags = _PAGE_ACCESSED;
+       if (writing)
+               pgflags |= _PAGE_DIRTY;
+       if (pgflags & ~gpte.rc)
+               return RESUME_HOST;
+
+       spin_lock(&kvm->mmu_lock);
+       /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
+       ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing,
+                                    gpte.raddr, kvm->arch.lpid);
+       spin_unlock(&kvm->mmu_lock);
+       if (!ret)
+               return -EINVAL;
+
+       /* Set the rc bit in the pte of the shadow_pgtable for the nested guest */
+       ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa,
+                                     gp->shadow_lpid);
+       if (!ret)
+               return -EINVAL;
+       return 0;
+}
+
+static inline int kvmppc_radix_level_to_shift(int level)
+{
+       switch (level) {
+       case 2:
+               return PUD_SHIFT;
+       case 1:
+               return PMD_SHIFT;
+       default:
+               return PAGE_SHIFT;
+       }
+}
+
+static inline int kvmppc_radix_shift_to_level(int shift)
+{
+       if (shift == PUD_SHIFT)
+               return 2;
+       if (shift == PMD_SHIFT)
+               return 1;
+       if (shift == PAGE_SHIFT)
+               return 0;
+       WARN_ON_ONCE(1);
+       return 0;
+}
+
+/* called with gp->tlb_lock held */
+static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
+                                         struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_memory_slot *memslot;
+       struct rmap_nested *n_rmap;
+       struct kvmppc_pte gpte;
+       pte_t pte, *pte_p;
+       unsigned long mmu_seq;
+       unsigned long dsisr = vcpu->arch.fault_dsisr;
+       unsigned long ea = vcpu->arch.fault_dar;
+       unsigned long *rmapp;
+       unsigned long n_gpa, gpa, gfn, perm = 0UL;
+       unsigned int shift, l1_shift, level;
+       bool writing = !!(dsisr & DSISR_ISSTORE);
+       bool kvm_ro = false;
+       long int ret;
+
+       if (!gp->l1_gr_to_hr) {
+               kvmhv_update_ptbl_cache(gp);
+               if (!gp->l1_gr_to_hr)
+                       return RESUME_HOST;
+       }
+
+       /* Convert the nested guest real address into an L1 guest real address */
+
+       n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL;
+       if (!(dsisr & DSISR_PRTABLE_FAULT))
+               n_gpa |= ea & 0xFFF;
+       ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte);
+
+       /*
+        * If the hardware found a translation but we don't now have a usable
+        * translation in the l1 partition-scoped tree, remove the shadow pte
+        * and let the guest retry.
+        */
+       if (ret == RESUME_HOST &&
+           (dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G |
+                     DSISR_BAD_COPYPASTE)))
+               goto inval;
+       if (ret)
+               return ret;
+
+       /* Failed to set the reference/change bits */
+       if (dsisr & DSISR_SET_RC) {
+               ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr);
+               if (ret == RESUME_HOST)
+                       return ret;
+               if (ret)
+                       goto inval;
+               dsisr &= ~DSISR_SET_RC;
+               if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
+                              DSISR_PROTFAULT)))
+                       return RESUME_GUEST;
+       }
+
+       /*
+        * We took an HISI or HDSI while we were running a nested guest which
+        * means we have no partition scoped translation for that. This means
+        * we need to insert a pte for the mapping into our shadow_pgtable.
+        */
+
+       l1_shift = gpte.page_shift;
+       if (l1_shift < PAGE_SHIFT) {
+               /* We don't support l1 using a page size smaller than our own */
+               pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n",
+                       l1_shift, PAGE_SHIFT);
+               return -EINVAL;
+       }
+       gpa = gpte.raddr;
+       gfn = gpa >> PAGE_SHIFT;
+
+       /* 1. Get the corresponding host memslot */
+
+       memslot = gfn_to_memslot(kvm, gfn);
+       if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+               if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) {
+                       /* unusual error -> reflect to the guest as a DSI */
+                       kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+                       return RESUME_GUEST;
+               }
+               /* passthrough of emulated MMIO case... */
+               pr_err("emulated MMIO passthrough?\n");
+               return -EINVAL;
+       }
+       if (memslot->flags & KVM_MEM_READONLY) {
+               if (writing) {
+                       /* Give the guest a DSI */
+                       kvmppc_core_queue_data_storage(vcpu, ea,
+                                       DSISR_ISSTORE | DSISR_PROTFAULT);
+                       return RESUME_GUEST;
+               }
+               kvm_ro = true;
+       }
+
+       /* 2. Find the host pte for this L1 guest real address */
+
+       /* Used to check for invalidations in progress */
+       mmu_seq = kvm->mmu_notifier_seq;
+       smp_rmb();
+
+       /* See if we can find a translation in our partition-scoped tables for L1 */
+       pte = __pte(0);
+       spin_lock(&kvm->mmu_lock);
+       pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+       if (!shift)
+               shift = PAGE_SHIFT;
+       if (pte_p)
+               pte = *pte_p;
+       spin_unlock(&kvm->mmu_lock);
+
+       if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) {
+               /* No suitable pte found -> try to insert a mapping */
+               ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot,
+                                       writing, kvm_ro, &pte, &level);
+               if (ret == -EAGAIN)
+                       return RESUME_GUEST;
+               else if (ret)
+                       return ret;
+               shift = kvmppc_radix_level_to_shift(level);
+       }
+
+       /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */
+
+       /* The permission bits are the intersection of the host and L1 guest ptes */
+       perm |= gpte.may_read ? 0UL : _PAGE_READ;
+       perm |= gpte.may_write ? 0UL : _PAGE_WRITE;
+       perm |= gpte.may_execute ? 0UL : _PAGE_EXEC;
+       pte = __pte(pte_val(pte) & ~perm);
+
+       /* What size pte can we insert? */
+       if (shift > l1_shift) {
+               u64 mask;
+               unsigned int actual_shift = PAGE_SHIFT;
+               if (PMD_SHIFT < l1_shift)
+                       actual_shift = PMD_SHIFT;
+               mask = (1UL << shift) - (1UL << actual_shift);
+               pte = __pte(pte_val(pte) | (gpa & mask));
+               shift = actual_shift;
+       }
+       level = kvmppc_radix_shift_to_level(shift);
+       n_gpa &= ~((1UL << shift) - 1);
+
+       /* 4. Insert the pte into our shadow_pgtable */
+
+       n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
+       if (!n_rmap)
+               return RESUME_GUEST; /* Let the guest try again */
+       n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
+               (((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
+       rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+       ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
+                               mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
+       if (n_rmap)
+               kfree(n_rmap);
+       if (ret == -EAGAIN)
+               ret = RESUME_GUEST;     /* Let the guest try again */
+
+       return ret;
+
+ inval:
+       kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL);
+       return RESUME_GUEST;
+}
+
+long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
+{
+       struct kvm_nested_guest *gp = vcpu->arch.nested;
+       long int ret;
+
+       mutex_lock(&gp->tlb_lock);
+       ret = __kvmhv_nested_page_fault(vcpu, gp);
+       mutex_unlock(&gp->tlb_lock);
+       return ret;
+}
+
+int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid)
+{
+       int ret = -1;
+
+       spin_lock(&kvm->mmu_lock);
+       while (++lpid <= kvm->arch.max_nested_lpid) {
+               if (kvm->arch.nested_guests[lpid]) {
+                       ret = lpid;
+                       break;
+               }
+       }
+       spin_unlock(&kvm->mmu_lock);
+       return ret;
+}
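
The permission-combining and page-size clamping in step 3 above can be read as a standalone calculation. The sketch below is illustrative only: it leans on the kernel's _PAGE_* permission bits, PAGE_SHIFT/PMD_SHIFT and bool definitions, and the function name is invented for the example.

    /* Illustrative sketch of step 3: a shadow PTE may only grant what both
     * the host PTE and the L1 guest PTE grant, and may not map a larger
     * page than L1 did.  Not the kernel implementation. */
    static unsigned long shadow_pte_for_nested(unsigned long host_pte,
                                               unsigned int host_shift,
                                               unsigned long gpa,
                                               unsigned int l1_shift,
                                               bool may_read, bool may_write,
                                               bool may_execute)
    {
            unsigned long deny = 0;

            /* Drop any permission the L1 guest PTE does not grant. */
            if (!may_read)
                    deny |= _PAGE_READ;
            if (!may_write)
                    deny |= _PAGE_WRITE;
            if (!may_execute)
                    deny |= _PAGE_EXEC;
            host_pte &= ~deny;

            /* If the host page is larger than L1's page, shrink the mapping
             * (to PMD size if L1's page is at least that big, otherwise to
             * the base page size) and fold the sub-page offset of the L1
             * real address into the address bits of the PTE. */
            if (host_shift > l1_shift) {
                    unsigned int new_shift =
                            (PMD_SHIFT < l1_shift) ? PMD_SHIFT : PAGE_SHIFT;
                    unsigned long mask =
                            (1UL << host_shift) - (1UL << new_shift);
                    host_pte |= gpa & mask;
            }
            return host_pte;
    }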
index b11043b..0787f12 100644 (file)
@@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void)
 
        local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
 }
+EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest);
 
 void kvmppc_subcore_exit_guest(void)
 {
@@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void)
 
        local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
 }
+EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest);
 
 static bool kvmppc_tb_resync_required(void)
 {
@@ -331,5 +333,13 @@ long kvmppc_realmode_hmi_handler(void)
        } else {
                wait_for_tb_resync();
        }
+
+       /*
+        * Reset tb_offset_applied so the guest exit code won't try
+        * to subtract the previous timebase offset from the timebase.
+        */
+       if (local_paca->kvm_hstate.kvm_vcore)
+               local_paca->kvm_hstate.kvm_vcore->tb_offset_applied = 0;
+
        return 0;
 }
index 758d1d2..b3f5786 100644 (file)
@@ -136,7 +136,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 
        /* Mark the target VCPU as having an interrupt pending */
        vcpu->stat.queue_intr++;
-       set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
+       set_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
 
        /* Kick self ? Just set MER and return */
        if (vcpu == this_vcpu) {
@@ -170,8 +170,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
 {
        /* Note: Only called on self ! */
-       clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
-                 &vcpu->arch.pending_exceptions);
+       clear_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
        mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
 }
 
@@ -768,6 +767,14 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
        void __iomem *xics_phys;
        int64_t rc;
 
+       if (kvmhv_on_pseries()) {
+               unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+               iosync();
+               plpar_hcall_raw(H_EOI, retbuf, hwirq);
+               return;
+       }
+
        rc = pnv_opal_pci_msi_eoi(c, hwirq);
 
        if (rc)
index 1d14046..9b8d50a 100644 (file)
@@ -28,6 +28,7 @@
 #include <asm/exception-64s.h>
 #include <asm/kvm_book3s_asm.h>
 #include <asm/book3s/64/mmu-hash.h>
+#include <asm/export.h>
 #include <asm/tm.h>
 #include <asm/opal.h>
 #include <asm/xive-regs.h>
@@ -46,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 #define NAPPING_NOVCPU 2
 
 /* Stack frame offsets for kvmppc_hv_entry */
-#define SFS                    160
+#define SFS                    208
 #define STACK_SLOT_TRAP                (SFS-4)
+#define STACK_SLOT_SHORT_PATH  (SFS-8)
 #define STACK_SLOT_TID         (SFS-16)
 #define STACK_SLOT_PSSCR       (SFS-24)
 #define STACK_SLOT_PID         (SFS-32)
@@ -56,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 #define STACK_SLOT_DAWR                (SFS-56)
 #define STACK_SLOT_DAWRX       (SFS-64)
 #define STACK_SLOT_HFSCR       (SFS-72)
+/* the following is used by the P9 short path */
+#define STACK_SLOT_NVGPRS      (SFS-152)       /* 18 gprs */
 
 /*
  * Call kvmppc_hv_entry in real mode.
@@ -113,45 +117,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        mtspr   SPRN_SPRG_VDSO_WRITE,r3
 
        /* Reload the host's PMU registers */
-       lbz     r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
-       cmpwi   r4, 0
-       beq     23f                     /* skip if not */
-BEGIN_FTR_SECTION
-       ld      r3, HSTATE_MMCR0(r13)
-       andi.   r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
-       cmpwi   r4, MMCR0_PMAO
-       beql    kvmppc_fix_pmao
-END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
-       lwz     r3, HSTATE_PMC1(r13)
-       lwz     r4, HSTATE_PMC2(r13)
-       lwz     r5, HSTATE_PMC3(r13)
-       lwz     r6, HSTATE_PMC4(r13)
-       lwz     r8, HSTATE_PMC5(r13)
-       lwz     r9, HSTATE_PMC6(r13)
-       mtspr   SPRN_PMC1, r3
-       mtspr   SPRN_PMC2, r4
-       mtspr   SPRN_PMC3, r5
-       mtspr   SPRN_PMC4, r6
-       mtspr   SPRN_PMC5, r8
-       mtspr   SPRN_PMC6, r9
-       ld      r3, HSTATE_MMCR0(r13)
-       ld      r4, HSTATE_MMCR1(r13)
-       ld      r5, HSTATE_MMCRA(r13)
-       ld      r6, HSTATE_SIAR(r13)
-       ld      r7, HSTATE_SDAR(r13)
-       mtspr   SPRN_MMCR1, r4
-       mtspr   SPRN_MMCRA, r5
-       mtspr   SPRN_SIAR, r6
-       mtspr   SPRN_SDAR, r7
-BEGIN_FTR_SECTION
-       ld      r8, HSTATE_MMCR2(r13)
-       ld      r9, HSTATE_SIER(r13)
-       mtspr   SPRN_MMCR2, r8
-       mtspr   SPRN_SIER, r9
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-       mtspr   SPRN_MMCR0, r3
-       isync
-23:
+       bl      kvmhv_load_host_pmu
 
        /*
         * Reload DEC.  HDEC interrupts were disabled when
@@ -796,66 +762,23 @@ BEGIN_FTR_SECTION
        b       91f
 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
        /*
-        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
         */
        mr      r3, r4
        ld      r4, VCPU_MSR(r3)
+       li      r5, 0                   /* don't preserve non-vol regs */
        bl      kvmppc_restore_tm_hv
+       nop
        ld      r4, HSTATE_KVM_VCPU(r13)
 91:
 #endif
 
-       /* Load guest PMU registers */
-       /* R4 is live here (vcpu pointer) */
-       li      r3, 1
-       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
-       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
-       isync
-BEGIN_FTR_SECTION
-       ld      r3, VCPU_MMCR(r4)
-       andi.   r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
-       cmpwi   r5, MMCR0_PMAO
-       beql    kvmppc_fix_pmao
-END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
-       lwz     r3, VCPU_PMC(r4)        /* always load up guest PMU registers */
-       lwz     r5, VCPU_PMC + 4(r4)    /* to prevent information leak */
-       lwz     r6, VCPU_PMC + 8(r4)
-       lwz     r7, VCPU_PMC + 12(r4)
-       lwz     r8, VCPU_PMC + 16(r4)
-       lwz     r9, VCPU_PMC + 20(r4)
-       mtspr   SPRN_PMC1, r3
-       mtspr   SPRN_PMC2, r5
-       mtspr   SPRN_PMC3, r6
-       mtspr   SPRN_PMC4, r7
-       mtspr   SPRN_PMC5, r8
-       mtspr   SPRN_PMC6, r9
-       ld      r3, VCPU_MMCR(r4)
-       ld      r5, VCPU_MMCR + 8(r4)
-       ld      r6, VCPU_MMCR + 16(r4)
-       ld      r7, VCPU_SIAR(r4)
-       ld      r8, VCPU_SDAR(r4)
-       mtspr   SPRN_MMCR1, r5
-       mtspr   SPRN_MMCRA, r6
-       mtspr   SPRN_SIAR, r7
-       mtspr   SPRN_SDAR, r8
-BEGIN_FTR_SECTION
-       ld      r5, VCPU_MMCR + 24(r4)
-       ld      r6, VCPU_SIER(r4)
-       mtspr   SPRN_MMCR2, r5
-       mtspr   SPRN_SIER, r6
-BEGIN_FTR_SECTION_NESTED(96)
-       lwz     r7, VCPU_PMC + 24(r4)
-       lwz     r8, VCPU_PMC + 28(r4)
-       ld      r9, VCPU_MMCR + 32(r4)
-       mtspr   SPRN_SPMC1, r7
-       mtspr   SPRN_SPMC2, r8
-       mtspr   SPRN_MMCRS, r9
-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-       mtspr   SPRN_MMCR0, r3
-       isync
+       /* Load guest PMU registers; r4 = vcpu pointer here */
+       mr      r3, r4
+       bl      kvmhv_load_guest_pmu
 
        /* Load up FP, VMX and VSX registers */
+       ld      r4, HSTATE_KVM_VCPU(r13)
        bl      kvmppc_load_fp
 
        ld      r14, VCPU_GPR(R14)(r4)
@@ -1100,73 +1023,40 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 no_xive:
 #endif /* CONFIG_KVM_XICS */
 
-deliver_guest_interrupt:
-       ld      r6, VCPU_CTR(r4)
-       ld      r7, VCPU_XER(r4)
-
-       mtctr   r6
-       mtxer   r7
+       li      r0, 0
+       stw     r0, STACK_SLOT_SHORT_PATH(r1)
 
-kvmppc_cede_reentry:           /* r4 = vcpu, r13 = paca */
-       ld      r10, VCPU_PC(r4)
-       ld      r11, VCPU_MSR(r4)
+deliver_guest_interrupt:       /* r4 = vcpu, r13 = paca */
+       /* Check if we can deliver an external or decrementer interrupt now */
+       ld      r0, VCPU_PENDING_EXC(r4)
+BEGIN_FTR_SECTION
+       /* On POWER9, also check for emulated doorbell interrupt */
+       lbz     r3, VCPU_DBELL_REQ(r4)
+       or      r0, r0, r3
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+       cmpdi   r0, 0
+       beq     71f
+       mr      r3, r4
+       bl      kvmppc_guest_entry_inject_int
+       ld      r4, HSTATE_KVM_VCPU(r13)
+71:
        ld      r6, VCPU_SRR0(r4)
        ld      r7, VCPU_SRR1(r4)
        mtspr   SPRN_SRR0, r6
        mtspr   SPRN_SRR1, r7
 
+fast_guest_entry_c:
+       ld      r10, VCPU_PC(r4)
+       ld      r11, VCPU_MSR(r4)
        /* r11 = vcpu->arch.msr & ~MSR_HV */
        rldicl  r11, r11, 63 - MSR_HV_LG, 1
        rotldi  r11, r11, 1 + MSR_HV_LG
        ori     r11, r11, MSR_ME
 
-       /* Check if we can deliver an external or decrementer interrupt now */
-       ld      r0, VCPU_PENDING_EXC(r4)
-       rldicl  r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63
-       cmpdi   cr1, r0, 0
-       andi.   r8, r11, MSR_EE
-       mfspr   r8, SPRN_LPCR
-       /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
-       rldimi  r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
-       mtspr   SPRN_LPCR, r8
-       isync
-       beq     5f
-       li      r0, BOOK3S_INTERRUPT_EXTERNAL
-       bne     cr1, 12f
-       mfspr   r0, SPRN_DEC
-BEGIN_FTR_SECTION
-       /* On POWER9 check whether the guest has large decrementer enabled */
-       andis.  r8, r8, LPCR_LD@h
-       bne     15f
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-       extsw   r0, r0
-15:    cmpdi   r0, 0
-       li      r0, BOOK3S_INTERRUPT_DECREMENTER
-       bge     5f
-
-12:    mtspr   SPRN_SRR0, r10
-       mr      r10,r0
-       mtspr   SPRN_SRR1, r11
-       mr      r9, r4
-       bl      kvmppc_msr_interrupt
-5:
-BEGIN_FTR_SECTION
-       b       fast_guest_return
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
-       /* On POWER9, check for pending doorbell requests */
-       lbz     r0, VCPU_DBELL_REQ(r4)
-       cmpwi   r0, 0
-       beq     fast_guest_return
-       ld      r5, HSTATE_KVM_VCORE(r13)
-       /* Set DPDES register so the CPU will take a doorbell interrupt */
-       li      r0, 1
-       mtspr   SPRN_DPDES, r0
-       std     r0, VCORE_DPDES(r5)
-       /* Make sure other cpus see vcore->dpdes set before dbell req clear */
-       lwsync
-       /* Clear the pending doorbell request */
-       li      r0, 0
-       stb     r0, VCPU_DBELL_REQ(r4)
+       ld      r6, VCPU_CTR(r4)
+       ld      r7, VCPU_XER(r4)
+       mtctr   r6
+       mtxer   r7
 
 /*
  * Required state:
@@ -1202,7 +1092,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
        ld      r5, VCPU_LR(r4)
-       lwz     r6, VCPU_CR(r4)
+       ld      r6, VCPU_CR(r4)
        mtlr    r5
        mtcr    r6
 
@@ -1234,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
        HRFI_TO_GUEST
        b       .
 
+/*
+ * Enter the guest on a P9 or later system where we have exactly
+ * one vcpu per vcore and we don't need to go to real mode
+ * (which implies that host and guest are both using radix MMU mode).
+ * r3 = vcpu pointer
+ * Most SPRs and all the VSRs have been loaded already.
+ */
+_GLOBAL(__kvmhv_vcpu_entry_p9)
+EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       stdu    r1, -SFS(r1)
+
+       li      r0, 1
+       stw     r0, STACK_SLOT_SHORT_PATH(r1)
+
+       std     r3, HSTATE_KVM_VCPU(r13)
+       mfcr    r4
+       stw     r4, SFS+8(r1)
+
+       std     r1, HSTATE_HOST_R1(r13)
+
+       reg = 14
+       .rept   18
+       std     reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
+       reg = reg + 1
+       .endr
+
+       reg = 14
+       .rept   18
+       ld      reg, __VCPU_GPR(reg)(r3)
+       reg = reg + 1
+       .endr
+
+       mfmsr   r10
+       std     r10, HSTATE_HOST_MSR(r13)
+
+       mr      r4, r3
+       b       fast_guest_entry_c
+guest_exit_short_path:
+
+       li      r0, KVM_GUEST_MODE_NONE
+       stb     r0, HSTATE_IN_GUEST(r13)
+
+       reg = 14
+       .rept   18
+       std     reg, __VCPU_GPR(reg)(r9)
+       reg = reg + 1
+       .endr
+
+       reg = 14
+       .rept   18
+       ld      reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
+       reg = reg + 1
+       .endr
+
+       lwz     r4, SFS+8(r1)
+       mtcr    r4
+
+       mr      r3, r12         /* trap number */
+
+       addi    r1, r1, SFS
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+
+       /* If we are in real mode, do a rfid to get back to the caller */
+       mfmsr   r4
+       andi.   r5, r4, MSR_IR
+       bnelr
+       rldicl  r5, r4, 64 - MSR_TS_S_LG, 62    /* extract TS field */
+       mtspr   SPRN_SRR0, r0
+       ld      r10, HSTATE_HOST_MSR(r13)
+       rldimi  r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+       mtspr   SPRN_SRR1, r10
+       RFI_TO_KERNEL
+       b       .
+
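
As a rough guide to how the C side is expected to drive this entry point: the vcpu pointer goes in as the only argument, the trap number comes back as the return value, and CR plus the non-volatile GPRs are preserved via the stack slots above. A minimal, hypothetical caller sketch (the prototype is an assumption based on the register usage documented above, not a header from this series):

    struct kvm_vcpu;
    /* Assumed prototype: r3 = vcpu on entry, r3 = trap number on return. */
    long __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);

    static long p9_short_path_run(struct kvm_vcpu *vcpu)
    {
            /* The MMU stays on: host and guest are both radix and there is
             * exactly one vcpu per vcore, so no real-mode hand-off is needed. */
            return __kvmhv_vcpu_entry_p9(vcpu);
    }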
 secondary_too_late:
        li      r12, 0
        stw     r12, STACK_SLOT_TRAP(r1)
@@ -1313,7 +1280,7 @@ kvmppc_interrupt_hv:
        std     r3, VCPU_GPR(R12)(r9)
        /* CR is in the high half of r12 */
        srdi    r4, r12, 32
-       stw     r4, VCPU_CR(r9)
+       std     r4, VCPU_CR(r9)
 BEGIN_FTR_SECTION
        ld      r3, HSTATE_CFAR(r13)
        std     r3, VCPU_CFAR(r9)
@@ -1387,18 +1354,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        std     r3, VCPU_CTR(r9)
        std     r4, VCPU_XER(r9)
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       /* For softpatch interrupt, go off and do TM instruction emulation */
-       cmpwi   r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
-       beq     kvmppc_tm_emul
-#endif
+       /* Save more register state  */
+       mfdar   r3
+       mfdsisr r4
+       std     r3, VCPU_DAR(r9)
+       stw     r4, VCPU_DSISR(r9)
 
        /* If this is a page table miss then see if it's theirs or ours */
        cmpwi   r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
        beq     kvmppc_hdsi
+       std     r3, VCPU_FAULT_DAR(r9)
+       stw     r4, VCPU_FAULT_DSISR(r9)
        cmpwi   r12, BOOK3S_INTERRUPT_H_INST_STORAGE
        beq     kvmppc_hisi
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       /* For softpatch interrupt, go off and do TM instruction emulation */
+       cmpwi   r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
+       beq     kvmppc_tm_emul
+#endif
+
        /* See if this is a leftover HDEC interrupt */
        cmpwi   r12,BOOK3S_INTERRUPT_HV_DECREMENTER
        bne     2f
@@ -1418,10 +1393,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 BEGIN_FTR_SECTION
        PPC_MSGSYNC
        lwsync
+       /* always exit if we're running a nested guest */
+       ld      r0, VCPU_NESTED(r9)
+       cmpdi   r0, 0
+       bne     guest_exit_cont
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
        lbz     r0, HSTATE_HOST_IPI(r13)
        cmpwi   r0, 0
-       beq     4f
+       beq     maybe_reenter_guest
        b       guest_exit_cont
 3:
        /* If it's a hypervisor facility unavailable interrupt, save HFSCR */
@@ -1433,82 +1412,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 14:
        /* External interrupt ? */
        cmpwi   r12, BOOK3S_INTERRUPT_EXTERNAL
-       bne+    guest_exit_cont
-
-       /* External interrupt, first check for host_ipi. If this is
-        * set, we know the host wants us out so let's do it now
-        */
-       bl      kvmppc_read_intr
-
-       /*
-        * Restore the active volatile registers after returning from
-        * a C function.
-        */
-       ld      r9, HSTATE_KVM_VCPU(r13)
-       li      r12, BOOK3S_INTERRUPT_EXTERNAL
-
-       /*
-        * kvmppc_read_intr return codes:
-        *
-        * Exit to host (r3 > 0)
-        *   1 An interrupt is pending that needs to be handled by the host
-        *     Exit guest and return to host by branching to guest_exit_cont
-        *
-        *   2 Passthrough that needs completion in the host
-        *     Exit guest and return to host by branching to guest_exit_cont
-        *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
-        *     to indicate to the host to complete handling the interrupt
-        *
-        * Before returning to guest, we check if any CPU is heading out
-        * to the host and if so, we head out also. If no CPUs are heading
-        * check return values <= 0.
-        *
-        * Return to guest (r3 <= 0)
-        *  0 No external interrupt is pending
-        * -1 A guest wakeup IPI (which has now been cleared)
-        *    In either case, we return to guest to deliver any pending
-        *    guest interrupts.
-        *
-        * -2 A PCI passthrough external interrupt was handled
-        *    (interrupt was delivered directly to guest)
-        *    Return to guest to deliver any pending guest interrupts.
-        */
-
-       cmpdi   r3, 1
-       ble     1f
-
-       /* Return code = 2 */
-       li      r12, BOOK3S_INTERRUPT_HV_RM_HARD
-       stw     r12, VCPU_TRAP(r9)
-       b       guest_exit_cont
-
-1:     /* Return code <= 1 */
-       cmpdi   r3, 0
-       bgt     guest_exit_cont
-
-       /* Return code <= 0 */
-4:     ld      r5, HSTATE_KVM_VCORE(r13)
-       lwz     r0, VCORE_ENTRY_EXIT(r5)
-       cmpwi   r0, 0x100
-       mr      r4, r9
-       blt     deliver_guest_interrupt
-
-guest_exit_cont:               /* r9 = vcpu, r12 = trap, r13 = paca */
-       /* Save more register state  */
-       mfdar   r6
-       mfdsisr r7
-       std     r6, VCPU_DAR(r9)
-       stw     r7, VCPU_DSISR(r9)
-       /* don't overwrite fault_dar/fault_dsisr if HDSI */
-       cmpwi   r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
-       beq     mc_cont
-       std     r6, VCPU_FAULT_DAR(r9)
-       stw     r7, VCPU_FAULT_DSISR(r9)
-
+       beq     kvmppc_guest_external
        /* See if it is a machine check */
        cmpwi   r12, BOOK3S_INTERRUPT_MACHINE_CHECK
        beq     machine_check_realmode
-mc_cont:
+       /* Or a hypervisor maintenance interrupt */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       beq     hmi_realmode
+
+guest_exit_cont:               /* r9 = vcpu, r12 = trap, r13 = paca */
+
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
        addi    r3, r9, VCPU_TB_RMEXIT
        mr      r4, r9
@@ -1552,6 +1465,11 @@ mc_cont:
 1:
 #endif /* CONFIG_KVM_XICS */
 
+       /* If we came in through the P9 short path, go back out to C now */
+       lwz     r0, STACK_SLOT_SHORT_PATH(r1)
+       cmpwi   r0, 0
+       bne     guest_exit_short_path
+
        /* For hash guest, read the guest SLB and save it away */
        ld      r5, VCPU_KVM(r9)
        lbz     r0, KVM_RADIX(r5)
@@ -1780,11 +1698,13 @@ BEGIN_FTR_SECTION
        b       91f
 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
        /*
-        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
         */
        mr      r3, r9
        ld      r4, VCPU_MSR(r3)
+       li      r5, 0                   /* don't preserve non-vol regs */
        bl      kvmppc_save_tm_hv
+       nop
        ld      r9, HSTATE_KVM_VCPU(r13)
 91:
 #endif
@@ -1802,90 +1722,19 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 25:
        /* Save PMU registers if requested */
        /* r8 and cr0.eq are live here */
+       mr      r3, r9
+       li      r4, 1
+       beq     21f                     /* if no VPA, save PMU stuff anyway */
+       lbz     r4, LPPACA_PMCINUSE(r8)
+21:    bl      kvmhv_save_guest_pmu
+       ld      r9, HSTATE_KVM_VCPU(r13)
+
+       /* Restore host values of some registers */
 BEGIN_FTR_SECTION
-       /*
-        * POWER8 seems to have a hardware bug where setting
-        * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
-        * when some counters are already negative doesn't seem
-        * to cause a performance monitor alert (and hence interrupt).
-        * The effect of this is that when saving the PMU state,
-        * if there is no PMU alert pending when we read MMCR0
-        * before freezing the counters, but one becomes pending
-        * before we read the counters, we lose it.
-        * To work around this, we need a way to freeze the counters
-        * before reading MMCR0.  Normally, freezing the counters
-        * is done by writing MMCR0 (to set MMCR0[FC]) which
-        * unavoidably writes MMCR0[PMA0] as well.  On POWER8,
-        * we can also freeze the counters using MMCR2, by writing
-        * 1s to all the counter freeze condition bits (there are
-        * 9 bits each for 6 counters).
-        */
-       li      r3, -1                  /* set all freeze bits */
-       clrrdi  r3, r3, 10
-       mfspr   r10, SPRN_MMCR2
-       mtspr   SPRN_MMCR2, r3
-       isync
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-       li      r3, 1
-       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
-       mfspr   r4, SPRN_MMCR0          /* save MMCR0 */
-       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
-       mfspr   r6, SPRN_MMCRA
-       /* Clear MMCRA in order to disable SDAR updates */
-       li      r7, 0
-       mtspr   SPRN_MMCRA, r7
-       isync
-       beq     21f                     /* if no VPA, save PMU stuff anyway */
-       lbz     r7, LPPACA_PMCINUSE(r8)
-       cmpwi   r7, 0                   /* did they ask for PMU stuff to be saved? */
-       bne     21f
-       std     r3, VCPU_MMCR(r9)       /* if not, set saved MMCR0 to FC */
-       b       22f
-21:    mfspr   r5, SPRN_MMCR1
-       mfspr   r7, SPRN_SIAR
-       mfspr   r8, SPRN_SDAR
-       std     r4, VCPU_MMCR(r9)
-       std     r5, VCPU_MMCR + 8(r9)
-       std     r6, VCPU_MMCR + 16(r9)
-BEGIN_FTR_SECTION
-       std     r10, VCPU_MMCR + 24(r9)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-       std     r7, VCPU_SIAR(r9)
-       std     r8, VCPU_SDAR(r9)
-       mfspr   r3, SPRN_PMC1
-       mfspr   r4, SPRN_PMC2
-       mfspr   r5, SPRN_PMC3
-       mfspr   r6, SPRN_PMC4
-       mfspr   r7, SPRN_PMC5
-       mfspr   r8, SPRN_PMC6
-       stw     r3, VCPU_PMC(r9)
-       stw     r4, VCPU_PMC + 4(r9)
-       stw     r5, VCPU_PMC + 8(r9)
-       stw     r6, VCPU_PMC + 12(r9)
-       stw     r7, VCPU_PMC + 16(r9)
-       stw     r8, VCPU_PMC + 20(r9)
-BEGIN_FTR_SECTION
-       mfspr   r5, SPRN_SIER
-       std     r5, VCPU_SIER(r9)
-BEGIN_FTR_SECTION_NESTED(96)
-       mfspr   r6, SPRN_SPMC1
-       mfspr   r7, SPRN_SPMC2
-       mfspr   r8, SPRN_MMCRS
-       stw     r6, VCPU_PMC + 24(r9)
-       stw     r7, VCPU_PMC + 28(r9)
-       std     r8, VCPU_MMCR + 32(r9)
-       lis     r4, 0x8000
-       mtspr   SPRN_MMCRS, r4
-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-22:
-
-       /* Restore host values of some registers */
-BEGIN_FTR_SECTION
-       ld      r5, STACK_SLOT_CIABR(r1)
-       ld      r6, STACK_SLOT_DAWR(r1)
-       ld      r7, STACK_SLOT_DAWRX(r1)
-       mtspr   SPRN_CIABR, r5
+       ld      r5, STACK_SLOT_CIABR(r1)
+       ld      r6, STACK_SLOT_DAWR(r1)
+       ld      r7, STACK_SLOT_DAWRX(r1)
+       mtspr   SPRN_CIABR, r5
        /*
         * If the DAWR doesn't work, it's ok to write these here as
         * this value should always be zero
@@ -2010,24 +1859,6 @@ BEGIN_FTR_SECTION
        mtspr   SPRN_DPDES, r8
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
-       /* If HMI, call kvmppc_realmode_hmi_handler() */
-       lwz     r12, STACK_SLOT_TRAP(r1)
-       cmpwi   r12, BOOK3S_INTERRUPT_HMI
-       bne     27f
-       bl      kvmppc_realmode_hmi_handler
-       nop
-       cmpdi   r3, 0
-       /*
-        * At this point kvmppc_realmode_hmi_handler may have resync-ed
-        * the TB, and if it has, we must not subtract the guest timebase
-        * offset from the timebase. So, skip it.
-        *
-        * Also, do not call kvmppc_subcore_exit_guest() because it has
-        * been invoked as part of kvmppc_realmode_hmi_handler().
-        */
-       beq     30f
-
-27:
        /* Subtract timebase offset from timebase */
        ld      r8, VCORE_TB_OFFSET_APPL(r5)
        cmpdi   r8,0
@@ -2045,7 +1876,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        addis   r8,r8,0x100             /* if so, increment upper 40 bits */
        mtspr   SPRN_TBU40,r8
 
-17:    bl      kvmppc_subcore_exit_guest
+17:
+       /*
+        * If this is an HMI, we called kvmppc_realmode_hmi_handler
+        * above, which may or may not have already called
+        * kvmppc_subcore_exit_guest.  Fortunately, all that
+        * kvmppc_subcore_exit_guest does is clear a flag, so calling
+        * it again here is benign even if kvmppc_realmode_hmi_handler
+        * has already called it.
+        */
+       bl      kvmppc_subcore_exit_guest
        nop
 30:    ld      r5,HSTATE_KVM_VCORE(r13)
        ld      r4,VCORE_KVM(r5)        /* pointer to struct kvm */
@@ -2099,6 +1939,67 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
        mtlr    r0
        blr
 
+kvmppc_guest_external:
+       /* External interrupt, first check for host_ipi. If this is
+        * set, we know the host wants us out so let's do it now
+        */
+       bl      kvmppc_read_intr
+
+       /*
+        * Restore the active volatile registers after returning from
+        * a C function.
+        */
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       li      r12, BOOK3S_INTERRUPT_EXTERNAL
+
+       /*
+        * kvmppc_read_intr return codes:
+        *
+        * Exit to host (r3 > 0)
+        *   1 An interrupt is pending that needs to be handled by the host
+        *     Exit guest and return to host by branching to guest_exit_cont
+        *
+        *   2 Passthrough that needs completion in the host
+        *     Exit guest and return to host by branching to guest_exit_cont
+        *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
+        *     to indicate to the host to complete handling the interrupt
+        *
+        * Before returning to guest, we check if any CPU is heading out
+        * to the host and if so, we head out also. If no CPUs are heading
+        * out, handle the return values <= 0 below.
+        *
+        * Return to guest (r3 <= 0)
+        *  0 No external interrupt is pending
+        * -1 A guest wakeup IPI (which has now been cleared)
+        *    In either case, we return to guest to deliver any pending
+        *    guest interrupts.
+        *
+        * -2 A PCI passthrough external interrupt was handled
+        *    (interrupt was delivered directly to guest)
+        *    Return to guest to deliver any pending guest interrupts.
+        */
+
+       cmpdi   r3, 1
+       ble     1f
+
+       /* Return code = 2 */
+       li      r12, BOOK3S_INTERRUPT_HV_RM_HARD
+       stw     r12, VCPU_TRAP(r9)
+       b       guest_exit_cont
+
+1:     /* Return code <= 1 */
+       cmpdi   r3, 0
+       bgt     guest_exit_cont
+
+       /* Return code <= 0 */
+maybe_reenter_guest:
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       lwz     r0, VCORE_ENTRY_EXIT(r5)
+       cmpwi   r0, 0x100
+       mr      r4, r9
+       blt     deliver_guest_interrupt
+       b       guest_exit_cont
+
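
The kvmppc_read_intr() return-code protocol spelled out in the comment above maps onto a small decision table. The following C-style sketch is purely illustrative; the enum and function names are invented, and the real logic is the assembly above.

    enum intr_action { REENTER_GUEST, EXIT_TO_HOST };   /* illustrative only */

    static enum intr_action external_intr_action(int rc, bool vcore_exiting)
    {
            if (rc == 2)            /* passthrough needing host completion;
                                     * trap becomes BOOK3S_INTERRUPT_HV_RM_HARD */
                    return EXIT_TO_HOST;
            if (rc == 1)            /* host interrupt pending */
                    return EXIT_TO_HOST;
            /* rc == 0: nothing pending; rc == -1: guest wakeup IPI, already
             * cleared; rc == -2: PCI passthrough interrupt delivered straight
             * to the guest.  Re-enter unless the vcore is already exiting. */
            return vcore_exiting ? EXIT_TO_HOST : REENTER_GUEST;
    }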
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 /*
  * Softpatch interrupt for transactional memory emulation cases
@@ -2302,6 +2203,10 @@ hcall_try_real_mode:
        andi.   r0,r11,MSR_PR
        /* sc 1 from userspace - reflect to guest syscall */
        bne     sc_1_fast_return
+       /* sc 1 from nested guest - give it to L1 to handle */
+       ld      r0, VCPU_NESTED(r9)
+       cmpdi   r0, 0
+       bne     guest_exit_cont
        clrrdi  r3,r3,2
        cmpldi  r3,hcall_real_table_end - hcall_real_table
        bge     guest_exit_cont
@@ -2561,6 +2466,7 @@ hcall_real_table:
 hcall_real_table_end:
 
 _GLOBAL(kvmppc_h_set_xdabr)
+EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
        andi.   r0, r5, DABRX_USER | DABRX_KERNEL
        beq     6f
        li      r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI
@@ -2570,6 +2476,7 @@ _GLOBAL(kvmppc_h_set_xdabr)
        blr
 
 _GLOBAL(kvmppc_h_set_dabr)
+EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr)
        li      r5, DABRX_USER | DABRX_KERNEL
 3:
 BEGIN_FTR_SECTION
@@ -2682,11 +2589,13 @@ BEGIN_FTR_SECTION
        b       91f
 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
        /*
-        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
         */
        ld      r3, HSTATE_KVM_VCPU(r13)
        ld      r4, VCPU_MSR(r3)
+       li      r5, 0                   /* don't preserve non-vol regs */
        bl      kvmppc_save_tm_hv
+       nop
 91:
 #endif
 
@@ -2802,11 +2711,13 @@ BEGIN_FTR_SECTION
        b       91f
 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
        /*
-        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
         */
        mr      r3, r4
        ld      r4, VCPU_MSR(r3)
+       li      r5, 0                   /* don't preserve non-vol regs */
        bl      kvmppc_restore_tm_hv
+       nop
        ld      r4, HSTATE_KVM_VCPU(r13)
 91:
 #endif
@@ -2874,13 +2785,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
        mr      r9, r4
        cmpdi   r3, 0
        bgt     guest_exit_cont
-
-       /* see if any other thread is already exiting */
-       lwz     r0,VCORE_ENTRY_EXIT(r5)
-       cmpwi   r0,0x100
-       bge     guest_exit_cont
-
-       b       kvmppc_cede_reentry     /* if not go back to guest */
+       b       maybe_reenter_guest
 
        /* cede when already previously prodded case */
 kvm_cede_prodded:
@@ -2947,12 +2852,12 @@ machine_check_realmode:
         */
        ld      r11, VCPU_MSR(r9)
        rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
-       bne     mc_cont                 /* if so, exit to host */
+       bne     guest_exit_cont         /* if so, exit to host */
        /* Check if guest is capable of handling NMI exit */
        ld      r10, VCPU_KVM(r9)
        lbz     r10, KVM_FWNMI(r10)
        cmpdi   r10, 1                  /* FWNMI capable? */
-       beq     mc_cont                 /* if so, exit with KVM_EXIT_NMI. */
+       beq     guest_exit_cont         /* if so, exit with KVM_EXIT_NMI. */
 
        /* if not, fall through for backward compatibility. */
        andi.   r10, r11, MSR_RI        /* check for unrecoverable exception */
@@ -2966,6 +2871,21 @@ machine_check_realmode:
 2:     b       fast_interrupt_c_return
 
 /*
+ * Call C code to handle an HMI in real mode.
+ * Only the primary thread does the call, secondary threads are handled
+ * by calling hmi_exception_realmode() after kvmppc_hv_entry returns.
+ * r9 points to the vcpu on entry
+ */
+hmi_realmode:
+       lbz     r0, HSTATE_PTID(r13)
+       cmpwi   r0, 0
+       bne     guest_exit_cont
+       bl      kvmppc_realmode_hmi_handler
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       li      r12, BOOK3S_INTERRUPT_HMI
+       b       guest_exit_cont
+
+/*
  * Check the reason we woke from nap, and take appropriate action.
  * Returns (in r3):
  *     0 if nothing needs to be done
@@ -3130,10 +3050,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
  * Save transactional state and TM-related registers.
  * Called with r3 pointing to the vcpu struct and r4 containing
  * the guest MSR value.
- * This can modify all checkpointed registers, but
+ * r5 is non-zero iff non-volatile register state needs to be maintained.
+ * If r5 == 0, this can modify all checkpointed registers, but
  * restores r1 and r2 before exit.
  */
-kvmppc_save_tm_hv:
+_GLOBAL_TOC(kvmppc_save_tm_hv)
+EXPORT_SYMBOL_GPL(kvmppc_save_tm_hv)
        /* See if we need to handle fake suspend mode */
 BEGIN_FTR_SECTION
        b       __kvmppc_save_tm
@@ -3161,12 +3083,6 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
        nop
 
-       std     r1, HSTATE_HOST_R1(r13)
-
-       /* Clear the MSR RI since r1, r13 may be foobar. */
-       li      r5, 0
-       mtmsrd  r5, 1
-
        /* We have to treclaim here because that's the only way to do S->N */
        li      r3, TM_CAUSE_KVM_RESCHED
        TRECLAIM(R3)
@@ -3175,22 +3091,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
         * We were in fake suspend, so we are not going to save the
         * register state as the guest checkpointed state (since
         * we already have it), therefore we can now use any volatile GPR.
+        * In fact treclaim in fake suspend state doesn't modify
+        * any registers.
         */
-       /* Reload PACA pointer, stack pointer and TOC. */
-       GET_PACA(r13)
-       ld      r1, HSTATE_HOST_R1(r13)
-       ld      r2, PACATOC(r13)
 
-       /* Set MSR RI now we have r1 and r13 back. */
-       li      r5, MSR_RI
-       mtmsrd  r5, 1
-
-       HMT_MEDIUM
-       ld      r6, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r6
-BEGIN_FTR_SECTION_NESTED(96)
+BEGIN_FTR_SECTION
        bl      pnv_power9_force_smt4_release
-END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
+END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
        nop
 
 4:
@@ -3216,10 +3123,12 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
  * Restore transactional state and TM-related registers.
  * Called with r3 pointing to the vcpu struct
  * and r4 containing the guest MSR value.
+ * r5 is non-zero iff non-volatile register state needs to be maintained.
  * This potentially modifies all checkpointed registers.
  * It restores r1 and r2 from the PACA.
  */
-kvmppc_restore_tm_hv:
+_GLOBAL_TOC(kvmppc_restore_tm_hv)
+EXPORT_SYMBOL_GPL(kvmppc_restore_tm_hv)
        /*
         * If we are doing TM emulation for the guest on a POWER9 DD2,
         * then we don't actually do a trechkpt -- we either set up
@@ -3424,6 +3333,194 @@ kvmppc_msr_interrupt:
        blr
 
 /*
+ * Load up guest PMU state.  R3 points to the vcpu struct.
+ */
+_GLOBAL(kvmhv_load_guest_pmu)
+EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu)
+       mr      r4, r3
+       mflr    r0
+       li      r3, 1
+       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
+       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
+       isync
+BEGIN_FTR_SECTION
+       ld      r3, VCPU_MMCR(r4)
+       andi.   r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+       cmpwi   r5, MMCR0_PMAO
+       beql    kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
+       lwz     r3, VCPU_PMC(r4)        /* always load up guest PMU registers */
+       lwz     r5, VCPU_PMC + 4(r4)    /* to prevent information leak */
+       lwz     r6, VCPU_PMC + 8(r4)
+       lwz     r7, VCPU_PMC + 12(r4)
+       lwz     r8, VCPU_PMC + 16(r4)
+       lwz     r9, VCPU_PMC + 20(r4)
+       mtspr   SPRN_PMC1, r3
+       mtspr   SPRN_PMC2, r5
+       mtspr   SPRN_PMC3, r6
+       mtspr   SPRN_PMC4, r7
+       mtspr   SPRN_PMC5, r8
+       mtspr   SPRN_PMC6, r9
+       ld      r3, VCPU_MMCR(r4)
+       ld      r5, VCPU_MMCR + 8(r4)
+       ld      r6, VCPU_MMCR + 16(r4)
+       ld      r7, VCPU_SIAR(r4)
+       ld      r8, VCPU_SDAR(r4)
+       mtspr   SPRN_MMCR1, r5
+       mtspr   SPRN_MMCRA, r6
+       mtspr   SPRN_SIAR, r7
+       mtspr   SPRN_SDAR, r8
+BEGIN_FTR_SECTION
+       ld      r5, VCPU_MMCR + 24(r4)
+       ld      r6, VCPU_SIER(r4)
+       mtspr   SPRN_MMCR2, r5
+       mtspr   SPRN_SIER, r6
+BEGIN_FTR_SECTION_NESTED(96)
+       lwz     r7, VCPU_PMC + 24(r4)
+       lwz     r8, VCPU_PMC + 28(r4)
+       ld      r9, VCPU_MMCR + 32(r4)
+       mtspr   SPRN_SPMC1, r7
+       mtspr   SPRN_SPMC2, r8
+       mtspr   SPRN_MMCRS, r9
+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       mtspr   SPRN_MMCR0, r3
+       isync
+       mtlr    r0
+       blr
+
+/*
+ * Reload host PMU state saved in the PACA by kvmhv_save_host_pmu.
+ */
+_GLOBAL(kvmhv_load_host_pmu)
+EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu)
+       mflr    r0
+       lbz     r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
+       cmpwi   r4, 0
+       beq     23f                     /* skip if not */
+BEGIN_FTR_SECTION
+       ld      r3, HSTATE_MMCR0(r13)
+       andi.   r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+       cmpwi   r4, MMCR0_PMAO
+       beql    kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
+       lwz     r3, HSTATE_PMC1(r13)
+       lwz     r4, HSTATE_PMC2(r13)
+       lwz     r5, HSTATE_PMC3(r13)
+       lwz     r6, HSTATE_PMC4(r13)
+       lwz     r8, HSTATE_PMC5(r13)
+       lwz     r9, HSTATE_PMC6(r13)
+       mtspr   SPRN_PMC1, r3
+       mtspr   SPRN_PMC2, r4
+       mtspr   SPRN_PMC3, r5
+       mtspr   SPRN_PMC4, r6
+       mtspr   SPRN_PMC5, r8
+       mtspr   SPRN_PMC6, r9
+       ld      r3, HSTATE_MMCR0(r13)
+       ld      r4, HSTATE_MMCR1(r13)
+       ld      r5, HSTATE_MMCRA(r13)
+       ld      r6, HSTATE_SIAR(r13)
+       ld      r7, HSTATE_SDAR(r13)
+       mtspr   SPRN_MMCR1, r4
+       mtspr   SPRN_MMCRA, r5
+       mtspr   SPRN_SIAR, r6
+       mtspr   SPRN_SDAR, r7
+BEGIN_FTR_SECTION
+       ld      r8, HSTATE_MMCR2(r13)
+       ld      r9, HSTATE_SIER(r13)
+       mtspr   SPRN_MMCR2, r8
+       mtspr   SPRN_SIER, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       mtspr   SPRN_MMCR0, r3
+       isync
+       mtlr    r0
+23:    blr
+
+/*
+ * Save guest PMU state into the vcpu struct.
+ * r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA)
+ */
+_GLOBAL(kvmhv_save_guest_pmu)
+EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu)
+       mr      r9, r3
+       mr      r8, r4
+BEGIN_FTR_SECTION
+       /*
+        * POWER8 seems to have a hardware bug where setting
+        * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
+        * when some counters are already negative doesn't seem
+        * to cause a performance monitor alert (and hence interrupt).
+        * The effect of this is that when saving the PMU state,
+        * if there is no PMU alert pending when we read MMCR0
+        * before freezing the counters, but one becomes pending
+        * before we read the counters, we lose it.
+        * To work around this, we need a way to freeze the counters
+        * before reading MMCR0.  Normally, freezing the counters
+        * is done by writing MMCR0 (to set MMCR0[FC]) which
+        * unavoidably writes MMCR0[PMA0] as well.  On POWER8,
+        * we can also freeze the counters using MMCR2, by writing
+        * 1s to all the counter freeze condition bits (there are
+        * 9 bits each for 6 counters).
+        */
+       li      r3, -1                  /* set all freeze bits */
+       clrrdi  r3, r3, 10
+       mfspr   r10, SPRN_MMCR2
+       mtspr   SPRN_MMCR2, r3
+       isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       li      r3, 1
+       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
+       mfspr   r4, SPRN_MMCR0          /* save MMCR0 */
+       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
+       mfspr   r6, SPRN_MMCRA
+       /* Clear MMCRA in order to disable SDAR updates */
+       li      r7, 0
+       mtspr   SPRN_MMCRA, r7
+       isync
+       cmpwi   r8, 0                   /* did they ask for PMU stuff to be saved? */
+       bne     21f
+       std     r3, VCPU_MMCR(r9)       /* if not, set saved MMCR0 to FC */
+       b       22f
+21:    mfspr   r5, SPRN_MMCR1
+       mfspr   r7, SPRN_SIAR
+       mfspr   r8, SPRN_SDAR
+       std     r4, VCPU_MMCR(r9)
+       std     r5, VCPU_MMCR + 8(r9)
+       std     r6, VCPU_MMCR + 16(r9)
+BEGIN_FTR_SECTION
+       std     r10, VCPU_MMCR + 24(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       std     r7, VCPU_SIAR(r9)
+       std     r8, VCPU_SDAR(r9)
+       mfspr   r3, SPRN_PMC1
+       mfspr   r4, SPRN_PMC2
+       mfspr   r5, SPRN_PMC3
+       mfspr   r6, SPRN_PMC4
+       mfspr   r7, SPRN_PMC5
+       mfspr   r8, SPRN_PMC6
+       stw     r3, VCPU_PMC(r9)
+       stw     r4, VCPU_PMC + 4(r9)
+       stw     r5, VCPU_PMC + 8(r9)
+       stw     r6, VCPU_PMC + 12(r9)
+       stw     r7, VCPU_PMC + 16(r9)
+       stw     r8, VCPU_PMC + 20(r9)
+BEGIN_FTR_SECTION
+       mfspr   r5, SPRN_SIER
+       std     r5, VCPU_SIER(r9)
+BEGIN_FTR_SECTION_NESTED(96)
+       mfspr   r6, SPRN_SPMC1
+       mfspr   r7, SPRN_SPMC2
+       mfspr   r8, SPRN_MMCRS
+       stw     r6, VCPU_PMC + 24(r9)
+       stw     r7, VCPU_PMC + 28(r9)
+       std     r8, VCPU_MMCR + 32(r9)
+       lis     r4, 0x8000
+       mtspr   SPRN_MMCRS, r4
+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+22:    blr
+
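
Taken together, the three helpers above (plus kvmhv_save_host_pmu, which stores the host PMU state into the PACA) give the C entry path a simple bracket around a guest run. The prototypes and ordering below are a hedged sketch inferred from the register-usage comments, not code quoted from this series.

    struct kvm_vcpu;
    void kvmhv_save_host_pmu(void);
    void kvmhv_load_host_pmu(void);
    void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
    void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
    long __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);

    static long run_guest_with_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use)
    {
            long trap;

            kvmhv_save_host_pmu();                  /* host counters -> PACA */
            kvmhv_load_guest_pmu(vcpu);             /* vcpu PMU state -> hardware */
            trap = __kvmhv_vcpu_entry_p9(vcpu);     /* run the guest */
            kvmhv_save_guest_pmu(vcpu, pmu_in_use); /* hardware -> vcpu PMU state */
            kvmhv_load_host_pmu();                  /* PACA -> host counters */
            return trap;
    }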
+/*
  * This works around a hardware bug on POWER8E processors, where
  * writing a 1 to the MMCR0[PMAO] bit doesn't generate a
  * performance monitor interrupt.  Instead, when we need to have
index 0082850..888e260 100644 (file)
@@ -130,7 +130,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
                        return RESUME_GUEST;
                }
                /* Set CR0 to indicate previous transactional state */
-               vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
+               vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
                        (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
                /* L=1 => tresume, L=0 => tsuspend */
                if (instr & (1 << 21)) {
@@ -174,7 +174,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
                copy_from_checkpoint(vcpu);
 
                /* Set CR0 to indicate previous transactional state */
-               vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
+               vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
                        (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
                vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
                return RESUME_GUEST;
@@ -204,7 +204,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
                copy_to_checkpoint(vcpu);
 
                /* Set CR0 to indicate previous transactional state */
-               vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
+               vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
                        (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
                vcpu->arch.shregs.msr = msr | MSR_TS_S;
                return RESUME_GUEST;
index b2c7c6f..3cf5863 100644 (file)
@@ -89,7 +89,8 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu)
                if (instr & (1 << 21))
                        vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
                /* Set CR0 to 0b0010 */
-               vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0x20000000;
+               vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
+                       0x20000000;
                return 1;
        }
 
@@ -105,5 +106,5 @@ void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu)
        vcpu->arch.shregs.msr &= ~MSR_TS_MASK;  /* go to N state */
        vcpu->arch.regs.nip = vcpu->arch.tfhar;
        copy_from_checkpoint(vcpu);
-       vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0xa0000000;
+       vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | 0xa0000000;
 }
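
The CR0 updates in the TM emulation above all use the same bit arithmetic: keep CR1-CR7 and overwrite the top nibble with a value derived from MSR[TS]. A worked example, assuming the usual MSR_TS_S_LG = 33 / MSR_TS_T_LG = 34 layout from asm/reg.h:

    /* Illustrative only: shows the arithmetic, not kernel code. */
    static unsigned long ccr_with_ts(unsigned long ccr, unsigned long msr)
    {
            return (ccr & 0x0fffffff) |             /* keep CR1-CR7 */
                   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
    }
    /* MSR[TS] = transactional (MSR_TS_T, bit 34) -> ORs in 0x20000000, CR0 = 0b0010
     * MSR[TS] = suspended     (MSR_TS_S, bit 33) -> ORs in 0x10000000, CR0 = 0b0001
     * The rollback path above sets CR0 = 0b1010 (0xa0000000) directly. */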
index 614ebb4..4efd65d 100644 (file)
@@ -167,7 +167,7 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu)
        svcpu->gpr[11] = vcpu->arch.regs.gpr[11];
        svcpu->gpr[12] = vcpu->arch.regs.gpr[12];
        svcpu->gpr[13] = vcpu->arch.regs.gpr[13];
-       svcpu->cr  = vcpu->arch.cr;
+       svcpu->cr  = vcpu->arch.regs.ccr;
        svcpu->xer = vcpu->arch.regs.xer;
        svcpu->ctr = vcpu->arch.regs.ctr;
        svcpu->lr  = vcpu->arch.regs.link;
@@ -249,7 +249,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
        vcpu->arch.regs.gpr[11] = svcpu->gpr[11];
        vcpu->arch.regs.gpr[12] = svcpu->gpr[12];
        vcpu->arch.regs.gpr[13] = svcpu->gpr[13];
-       vcpu->arch.cr  = svcpu->cr;
+       vcpu->arch.regs.ccr  = svcpu->cr;
        vcpu->arch.regs.xer = svcpu->xer;
        vcpu->arch.regs.ctr = svcpu->ctr;
        vcpu->arch.regs.link  = svcpu->lr;
@@ -1246,7 +1246,6 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
                r = RESUME_GUEST;
                break;
        case BOOK3S_INTERRUPT_EXTERNAL:
-       case BOOK3S_INTERRUPT_EXTERNAL_LEVEL:
        case BOOK3S_INTERRUPT_EXTERNAL_HV:
        case BOOK3S_INTERRUPT_H_VIRT:
                vcpu->stat.ext_intr_exits++;
index b8356cd..b0b2bfc 100644 (file)
@@ -310,7 +310,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp,
         */
        if (new.out_ee) {
                kvmppc_book3s_queue_irqprio(icp->vcpu,
-                                           BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+                                           BOOK3S_INTERRUPT_EXTERNAL);
                if (!change_self)
                        kvmppc_fast_vcpu_kick(icp->vcpu);
        }
@@ -593,8 +593,7 @@ static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
        u32 xirr;
 
        /* First, remove EE from the processor */
-       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
-                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 
        /*
         * ICP State: Accept_Interrupt
@@ -754,8 +753,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
         * We can remove EE from the current processor, the update
         * transaction will set it again if needed
         */
-       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
-                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 
        do {
                old_state = new_state = READ_ONCE(icp->state);
@@ -1167,8 +1165,7 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
         * Deassert the CPU interrupt request.
         * icp_try_update will reassert it if necessary.
         */
-       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
-                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 
        /*
         * Note that if we displace an interrupt from old_state.xisr,
@@ -1393,7 +1390,8 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
        }
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-       if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+       if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+           cpu_has_feature(CPU_FTR_HVMODE)) {
                /* Enable real mode support */
                xics->real_mode = ENABLE_REALMODE;
                xics->real_mode_dbg = DEBUG_REALMODE;
index 30c2eb7..ad4a370 100644 (file)
 #define XIVE_Q_GAP     2
 
 /*
+ * Push a vcpu's context to the XIVE on guest entry.
+ * This assumes we are in virtual mode (MMU on)
+ */
+void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
+{
+       void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
+       u64 pq;
+
+       if (!tima)
+               return;
+       eieio();
+       __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
+       __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
+       vcpu->arch.xive_pushed = 1;
+       eieio();
+
+       /*
+        * We clear the irq_pending flag. There is a small chance of a
+        * race vs. the escalation interrupt happening on another
+        * processor setting it again, but the only consequence is to
+        * cause a spurious wakeup on the next H_CEDE, which is not an
+        * issue.
+        */
+       vcpu->arch.irq_pending = 0;
+
+       /*
+        * In single escalation mode, if the escalation interrupt is
+        * on, we mask it.
+        */
+       if (vcpu->arch.xive_esc_on) {
+               pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
+                                                 XIVE_ESB_SET_PQ_01));
+               mb();
+
+               /*
+                * We have a possible subtle race here: The escalation
+                * interrupt might have fired and be on its way to the
+                * host queue while we mask it, and if we unmask it
+                * early enough (re-cede right away), there is a
+                * theoretical possibility that it fires again, thus
+                * landing in the target queue more than once which is
+                * a big no-no.
+                *
+                * Fortunately, solving this is rather easy. If the
+                * above load setting PQ to 01 returns a previous
+                * value where P is set, then we know the escalation
+                * interrupt is somewhere on its way to the host. In
+                * that case we simply don't clear the xive_esc_on
+                * flag below. It will be eventually cleared by the
+                * handler for the escalation interrupt.
+                *
+                * Then, when doing a cede, we check that flag again
+                * before re-enabling the escalation interrupt, and if
+                * set, we abort the cede.
+                */
+               if (!(pq & XIVE_ESB_VAL_P))
+                       /* Now P is 0, we can clear the flag */
+                       vcpu->arch.xive_esc_on = 0;
+       }
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
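
The escalation handling at the end of the function can be restated compactly: a load from the ESB's SET_PQ_01 offset both masks the interrupt and returns the previous PQ value, and the P bit tells us whether an escalation is already in flight. A hedged sketch follows; the MMIO helper name is hypothetical.

    /* Illustrative sketch of the masking logic above; xive_esb_set_pq_01()
     * stands in for the __raw_readq() from xive_esc_vaddr + XIVE_ESB_SET_PQ_01. */
    static void mask_escalation(struct kvm_vcpu *vcpu)
    {
            u64 pq;

            if (!vcpu->arch.xive_esc_on)
                    return;

            pq = xive_esb_set_pq_01(vcpu);  /* set PQ = 01, return old PQ */

            /* If P was already set, the escalation interrupt is on its way to
             * the host queue; leave xive_esc_on set so the cede path will not
             * unmask it (and will abort the cede instead). */
            if (!(pq & XIVE_ESB_VAL_P))
                    vcpu->arch.xive_esc_on = 0;
    }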
+
+/*
  * This is a simple trigger for a generic XIVE IRQ. This must
  * only be called for interrupts that support a trigger page
  */
index 4171ede..033363d 100644 (file)
@@ -280,14 +280,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu)
        /* First collect pending bits from HW */
        GLUE(X_PFX,ack_pending)(xc);
 
-       /*
-        * Cleanup the old-style bits if needed (they may have been
-        * set by pull or an escalation interrupts).
-        */
-       if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions))
-               clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
-                         &vcpu->arch.pending_exceptions);
-
        pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
                 xc->pending, xc->hw_cppr, xc->cppr);
 
index 81bd8a0..051af7d 100644 (file)
         */
        PPC_LL  r4, PACACURRENT(r13)
        PPC_LL  r4, (THREAD + THREAD_KVM_VCPU)(r4)
-       stw     r10, VCPU_CR(r4)
+       PPC_STL r10, VCPU_CR(r4)
        PPC_STL r11, VCPU_GPR(R4)(r4)
        PPC_STL r5, VCPU_GPR(R5)(r4)
        PPC_STL r6, VCPU_GPR(R6)(r4)
@@ -292,7 +292,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
        PPC_STL r4, VCPU_GPR(R4)(r11)
        PPC_LL  r4, THREAD_NORMSAVE(0)(r10)
        PPC_STL r5, VCPU_GPR(R5)(r11)
-       stw     r13, VCPU_CR(r11)
+       PPC_STL r13, VCPU_CR(r11)
        mfspr   r5, \srr0
        PPC_STL r3, VCPU_GPR(R10)(r11)
        PPC_LL  r3, THREAD_NORMSAVE(2)(r10)
@@ -319,7 +319,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
        PPC_STL r4, VCPU_GPR(R4)(r11)
        PPC_LL  r4, GPR9(r8)
        PPC_STL r5, VCPU_GPR(R5)(r11)
-       stw     r9, VCPU_CR(r11)
+       PPC_STL r9, VCPU_CR(r11)
        mfspr   r5, \srr0
        PPC_STL r3, VCPU_GPR(R8)(r11)
        PPC_LL  r3, GPR10(r8)
@@ -643,7 +643,7 @@ lightweight_exit:
        PPC_LL  r3, VCPU_LR(r4)
        PPC_LL  r5, VCPU_XER(r4)
        PPC_LL  r6, VCPU_CTR(r4)
-       lwz     r7, VCPU_CR(r4)
+       PPC_LL  r7, VCPU_CR(r4)
        PPC_LL  r8, VCPU_PC(r4)
        PPC_LD(r9, VCPU_SHARED_MSR, r11)
        PPC_LL  r0, VCPU_GPR(R0)(r4)
index 75dce1e..f91b130 100644 (file)
@@ -117,7 +117,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
        emulated = EMULATE_FAIL;
        vcpu->arch.regs.msr = vcpu->arch.shared->msr;
-       vcpu->arch.regs.ccr = vcpu->arch.cr;
        if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) {
                int type = op.type & INSTR_TYPE_MASK;
                int size = GETSIZE(op.type);
index eba5756..2869a29 100644 (file)
@@ -594,7 +594,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = !!(hv_enabled && radix_enabled());
                break;
        case KVM_CAP_PPC_MMU_HASH_V3:
-               r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300));
+               r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
+                      cpu_has_feature(CPU_FTR_HVMODE));
+               break;
+       case KVM_CAP_PPC_NESTED_HV:
+               r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&
+                      !kvmppc_hv_ops->enable_nested(NULL));
                break;
 #endif
        case KVM_CAP_SYNC_MMU:
@@ -2114,6 +2119,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                        r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags);
                break;
        }
+
+       case KVM_CAP_PPC_NESTED_HV:
+               r = -EINVAL;
+               if (!is_kvmppc_hv_enabled(kvm) ||
+                   !kvm->arch.kvm_ops->enable_nested)
+                       break;
+               r = kvm->arch.kvm_ops->enable_nested(kvm);
+               break;
 #endif
        default:
                r = -EINVAL;
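
From userspace, the new capability follows the usual KVM pattern: probe it with KVM_CHECK_EXTENSION and turn it on per-VM with KVM_ENABLE_CAP. A minimal sketch, with error handling elided; it assumes a linux/kvm.h recent enough to define KVM_CAP_PPC_NESTED_HV.

    #include <linux/kvm.h>
    #include <string.h>
    #include <sys/ioctl.h>

    /* vm_fd is a VM file descriptor obtained via KVM_CREATE_VM. */
    static int enable_nested_hv(int vm_fd)
    {
            struct kvm_enable_cap cap;

            if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_NESTED_HV) <= 0)
                    return -1;              /* not supported on this host */

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_PPC_NESTED_HV;
            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }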
index 90e330f..0531a14 100644 (file)
  * Save transactional state and TM-related registers.
  * Called with:
  * - r3 pointing to the vcpu struct
- * - r4 points to the MSR with current TS bits:
+ * - r4 containing the MSR with current TS bits:
  *     (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR).
- * This can modify all checkpointed registers, but
- * restores r1, r2 before exit.
+ * - r5 containing a flag indicating that non-volatile registers
+ *     must be preserved.
+ * If r5 == 0, this can modify all checkpointed registers, but
+ * restores r1, r2 before exit.  If r5 != 0, this restores the
+ * MSR TM/FP/VEC/VSX bits to their state on entry.
  */
 _GLOBAL(__kvmppc_save_tm)
        mflr    r0
        std     r0, PPC_LR_STKOFF(r1)
+       stdu    r1, -SWITCH_FRAME_SIZE(r1)
+
+       mr      r9, r3
+       cmpdi   cr7, r5, 0
 
        /* Turn on TM. */
        mfmsr   r8
+       mr      r10, r8
        li      r0, 1
        rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
        ori     r8, r8, MSR_FP
@@ -51,6 +59,27 @@ _GLOBAL(__kvmppc_save_tm)
        std     r1, HSTATE_SCRATCH2(r13)
        std     r3, HSTATE_SCRATCH1(r13)
 
+       /* Save CR on the stack - even if r5 == 0 we need to get cr7 back. */
+       mfcr    r6
+       SAVE_GPR(6, r1)
+
+       /* Save DSCR so we can restore it to avoid running with user value */
+       mfspr   r7, SPRN_DSCR
+       SAVE_GPR(7, r1)
+
+       /*
+        * We are going to do treclaim., which will modify all checkpointed
+        * registers.  Save the non-volatile registers on the stack if
+        * preservation of non-volatile state has been requested.
+        */
+       beq     cr7, 3f
+       SAVE_NVGPRS(r1)
+
+       /* MSR[TS] will be 0 (non-transactional) once we do treclaim. */
+       li      r0, 0
+       rldimi  r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+       SAVE_GPR(10, r1)        /* final MSR value */
+3:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 BEGIN_FTR_SECTION
        /* Emulation of the treclaim instruction needs TEXASR before treclaim */
@@ -74,22 +103,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
        std     r9, PACATMSCRATCH(r13)
        ld      r9, HSTATE_SCRATCH1(r13)
 
-       /* Get a few more GPRs free. */
-       std     r29, VCPU_GPRS_TM(29)(r9)
-       std     r30, VCPU_GPRS_TM(30)(r9)
-       std     r31, VCPU_GPRS_TM(31)(r9)
-
-       /* Save away PPR and DSCR soon so don't run with user values. */
-       mfspr   r31, SPRN_PPR
+       /* Save away PPR soon so we don't run with user value. */
+       std     r0, VCPU_GPRS_TM(0)(r9)
+       mfspr   r0, SPRN_PPR
        HMT_MEDIUM
-       mfspr   r30, SPRN_DSCR
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-       ld      r29, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r29
-#endif
 
-       /* Save all but r9, r13 & r29-r31 */
-       reg = 0
+       /* Reload stack pointer. */
+       std     r1, VCPU_GPRS_TM(1)(r9)
+       ld      r1, HSTATE_SCRATCH2(r13)
+
+       /* Set MSR RI now we have r1 and r13 back. */
+       std     r2, VCPU_GPRS_TM(2)(r9)
+       li      r2, MSR_RI
+       mtmsrd  r2, 1
+
+       /* Reload TOC pointer. */
+       ld      r2, PACATOC(r13)
+
+       /* Save all but r0-r2, r9 & r13 */
+       reg = 3
        .rept   29
        .if (reg != 9) && (reg != 13)
        std     reg, VCPU_GPRS_TM(reg)(r9)
@@ -103,33 +135,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
        ld      r4, PACATMSCRATCH(r13)
        std     r4, VCPU_GPRS_TM(9)(r9)
 
-       /* Reload stack pointer and TOC. */
-       ld      r1, HSTATE_SCRATCH2(r13)
-       ld      r2, PACATOC(r13)
-
-       /* Set MSR RI now we have r1 and r13 back. */
-       li      r5, MSR_RI
-       mtmsrd  r5, 1
+       /* Restore host DSCR and CR values, after saving guest values */
+       mfcr    r6
+       mfspr   r7, SPRN_DSCR
+       stw     r6, VCPU_CR_TM(r9)
+       std     r7, VCPU_DSCR_TM(r9)
+       REST_GPR(6, r1)
+       REST_GPR(7, r1)
+       mtcr    r6
+       mtspr   SPRN_DSCR, r7
 
-       /* Save away checkpinted SPRs. */
-       std     r31, VCPU_PPR_TM(r9)
-       std     r30, VCPU_DSCR_TM(r9)
+       /* Save away checkpointed SPRs. */
+       std     r0, VCPU_PPR_TM(r9)
        mflr    r5
-       mfcr    r6
        mfctr   r7
        mfspr   r8, SPRN_AMR
        mfspr   r10, SPRN_TAR
        mfxer   r11
        std     r5, VCPU_LR_TM(r9)
-       stw     r6, VCPU_CR_TM(r9)
        std     r7, VCPU_CTR_TM(r9)
        std     r8, VCPU_AMR_TM(r9)
        std     r10, VCPU_TAR_TM(r9)
        std     r11, VCPU_XER_TM(r9)
 
-       /* Restore r12 as trap number. */
-       lwz     r12, VCPU_TRAP(r9)
-
        /* Save FP/VSX. */
        addi    r3, r9, VCPU_FPRS_TM
        bl      store_fp_state
@@ -137,6 +165,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
        bl      store_vr_state
        mfspr   r6, SPRN_VRSAVE
        stw     r6, VCPU_VRSAVE_TM(r9)
+
+       /* Restore non-volatile registers if requested to */
+       beq     cr7, 1f
+       REST_NVGPRS(r1)
+       REST_GPR(10, r1)
 1:
        /*
         * We need to save these SPRs after the treclaim so that the software
@@ -146,12 +179,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
         */
        mfspr   r7, SPRN_TEXASR
        std     r7, VCPU_TEXASR(r9)
-11:
        mfspr   r5, SPRN_TFHAR
        mfspr   r6, SPRN_TFIAR
        std     r5, VCPU_TFHAR(r9)
        std     r6, VCPU_TFIAR(r9)
 
+       /* Restore MSR state if requested */
+       beq     cr7, 2f
+       mtmsrd  r10, 0
+2:
+       addi    r1, r1, SWITCH_FRAME_SIZE
        ld      r0, PPC_LR_STKOFF(r1)
        mtlr    r0
        blr
@@ -161,49 +198,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
  * be invoked from C function by PR KVM only.
  */
 _GLOBAL(_kvmppc_save_tm_pr)
-       mflr    r5
-       std     r5, PPC_LR_STKOFF(r1)
-       stdu    r1, -SWITCH_FRAME_SIZE(r1)
-       SAVE_NVGPRS(r1)
-
-       /* save MSR since TM/math bits might be impacted
-        * by __kvmppc_save_tm().
-        */
-       mfmsr   r5
-       SAVE_GPR(5, r1)
-
-       /* also save DSCR/CR/TAR so that it can be recovered later */
-       mfspr   r6, SPRN_DSCR
-       SAVE_GPR(6, r1)
-
-       mfcr    r7
-       stw     r7, _CCR(r1)
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       stdu    r1, -PPC_MIN_STKFRM(r1)
 
        mfspr   r8, SPRN_TAR
-       SAVE_GPR(8, r1)
+       std     r8, PPC_MIN_STKFRM-8(r1)
 
+       li      r5, 1           /* preserve non-volatile registers */
        bl      __kvmppc_save_tm
 
-       REST_GPR(8, r1)
+       ld      r8, PPC_MIN_STKFRM-8(r1)
        mtspr   SPRN_TAR, r8
 
-       ld      r7, _CCR(r1)
-       mtcr    r7
-
-       REST_GPR(6, r1)
-       mtspr   SPRN_DSCR, r6
-
-       /* need preserve current MSR's MSR_TS bits */
-       REST_GPR(5, r1)
-       mfmsr   r6
-       rldicl  r6, r6, 64 - MSR_TS_S_LG, 62
-       rldimi  r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
-       mtmsrd  r5
-
-       REST_NVGPRS(r1)
-       addi    r1, r1, SWITCH_FRAME_SIZE
-       ld      r5, PPC_LR_STKOFF(r1)
-       mtlr    r5
+       addi    r1, r1, PPC_MIN_STKFRM
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
        blr
 
 EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
@@ -215,15 +225,21 @@ EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
  *  - r4 is the guest MSR with desired TS bits:
  *     For HV KVM, it is VCPU_MSR
  *     For PR KVM, it is provided by caller
- * This potentially modifies all checkpointed registers.
- * It restores r1, r2 from the PACA.
+ * - r5 containing a flag indicating that non-volatile registers
+ *     must be preserved.
+ * If r5 == 0, this potentially modifies all checkpointed registers, but
+ * restores r1, r2 from the PACA before exit.
+ * If r5 != 0, this restores the MSR TM/FP/VEC/VSX bits to their state on entry.
  */
 _GLOBAL(__kvmppc_restore_tm)
        mflr    r0
        std     r0, PPC_LR_STKOFF(r1)
 
+       cmpdi   cr7, r5, 0
+
        /* Turn on TM/FP/VSX/VMX so we can restore them. */
        mfmsr   r5
+       mr      r10, r5
        li      r6, MSR_TM >> 32
        sldi    r6, r6, 32
        or      r5, r5, r6
@@ -244,8 +260,7 @@ _GLOBAL(__kvmppc_restore_tm)
 
        mr      r5, r4
        rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-       beqlr           /* TM not active in guest */
-       std     r1, HSTATE_SCRATCH2(r13)
+       beq     9f              /* TM not active in guest */
 
        /* Make sure the failure summary is set, otherwise we'll program check
         * when we trechkpt.  It's possible that this might have been not set
@@ -256,6 +271,26 @@ _GLOBAL(__kvmppc_restore_tm)
        mtspr   SPRN_TEXASR, r7
 
        /*
+        * Make a stack frame and save non-volatile registers if requested.
+        */
+       stdu    r1, -SWITCH_FRAME_SIZE(r1)
+       std     r1, HSTATE_SCRATCH2(r13)
+
+       mfcr    r6
+       mfspr   r7, SPRN_DSCR
+       SAVE_GPR(2, r1)
+       SAVE_GPR(6, r1)
+       SAVE_GPR(7, r1)
+
+       beq     cr7, 4f
+       SAVE_NVGPRS(r1)
+
+       /* MSR[TS] will be 1 (suspended) once we do trechkpt */
+       li      r0, 1
+       rldimi  r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+       SAVE_GPR(10, r1)        /* final MSR value */
+4:
+       /*
         * We need to load up the checkpointed state for the guest.
         * We need to do this early as it will blow away any GPRs, VSRs and
         * some SPRs.
@@ -291,8 +326,6 @@ _GLOBAL(__kvmppc_restore_tm)
        ld      r29, VCPU_DSCR_TM(r3)
        ld      r30, VCPU_PPR_TM(r3)
 
-       std     r2, PACATMSCRATCH(r13) /* Save TOC */
-
        /* Clear the MSR RI since r1, r13 are all going to be foobar. */
        li      r5, 0
        mtmsrd  r5, 1
@@ -318,18 +351,31 @@ _GLOBAL(__kvmppc_restore_tm)
        /* Now let's get back the state we need. */
        HMT_MEDIUM
        GET_PACA(r13)
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-       ld      r29, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r29
-#endif
        ld      r1, HSTATE_SCRATCH2(r13)
-       ld      r2, PACATMSCRATCH(r13)
+       REST_GPR(7, r1)
+       mtspr   SPRN_DSCR, r7
 
        /* Set the MSR RI since we have our registers back. */
        li      r5, MSR_RI
        mtmsrd  r5, 1
+
+       /* Restore TOC pointer and CR */
+       REST_GPR(2, r1)
+       REST_GPR(6, r1)
+       mtcr    r6
+
+       /* Restore non-volatile registers if requested to. */
+       beq     cr7, 5f
+       REST_GPR(10, r1)
+       REST_NVGPRS(r1)
+
+5:     addi    r1, r1, SWITCH_FRAME_SIZE
        ld      r0, PPC_LR_STKOFF(r1)
        mtlr    r0
+
+9:     /* Restore MSR bits if requested */
+       beqlr   cr7
+       mtmsrd  r10, 0
        blr
 
 /*
@@ -337,47 +383,23 @@ _GLOBAL(__kvmppc_restore_tm)
  * can be invoked from C function by PR KVM only.
  */
 _GLOBAL(_kvmppc_restore_tm_pr)
-       mflr    r5
-       std     r5, PPC_LR_STKOFF(r1)
-       stdu    r1, -SWITCH_FRAME_SIZE(r1)
-       SAVE_NVGPRS(r1)
-
-       /* save MSR to avoid TM/math bits change */
-       mfmsr   r5
-       SAVE_GPR(5, r1)
-
-       /* also save DSCR/CR/TAR so that it can be recovered later */
-       mfspr   r6, SPRN_DSCR
-       SAVE_GPR(6, r1)
-
-       mfcr    r7
-       stw     r7, _CCR(r1)
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       stdu    r1, -PPC_MIN_STKFRM(r1)
 
+       /* save TAR so that it can be recovered later */
        mfspr   r8, SPRN_TAR
-       SAVE_GPR(8, r1)
+       std     r8, PPC_MIN_STKFRM-8(r1)
 
+       li      r5, 1
        bl      __kvmppc_restore_tm
 
-       REST_GPR(8, r1)
+       ld      r8, PPC_MIN_STKFRM-8(r1)
        mtspr   SPRN_TAR, r8
 
-       ld      r7, _CCR(r1)
-       mtcr    r7
-
-       REST_GPR(6, r1)
-       mtspr   SPRN_DSCR, r6
-
-       /* need preserve current MSR's MSR_TS bits */
-       REST_GPR(5, r1)
-       mfmsr   r6
-       rldicl  r6, r6, 64 - MSR_TS_S_LG, 62
-       rldimi  r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
-       mtmsrd  r5
-
-       REST_NVGPRS(r1)
-       addi    r1, r1, SWITCH_FRAME_SIZE
-       ld      r5, PPC_LR_STKOFF(r1)
-       mtlr    r5
+       addi    r1, r1, PPC_MIN_STKFRM
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
        blr
 
 EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr);
index f3b2375..372a82f 100644 (file)
@@ -14,7 +14,6 @@
        {0x400, "INST_STORAGE"}, \
        {0x480, "INST_SEGMENT"}, \
        {0x500, "EXTERNAL"}, \
-       {0x501, "EXTERNAL_LEVEL"}, \
        {0x502, "EXTERNAL_HV"}, \
        {0x600, "ALIGNMENT"}, \
        {0x700, "PROGRAM"}, \
index fef3e1e..4c4dfc4 100644 (file)
@@ -833,6 +833,15 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
 /*
  * Flush partition scoped translations from LPID (=LPIDR)
  */
+void radix__flush_tlb_lpid(unsigned int lpid)
+{
+       _tlbie_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
 void radix__local_flush_tlb_lpid(unsigned int lpid)
 {
        _tlbiel_lpid(lpid, RIC_FLUSH_ALL);
index 039a341..8b25e1f 100644 (file)
@@ -783,6 +783,17 @@ config VFIO_CCW
          To compile this driver as a module, choose M here: the
          module will be called vfio_ccw.
 
+config VFIO_AP
+       def_tristate n
+       prompt "VFIO support for AP devices"
+       depends on S390_AP_IOMMU && VFIO_MDEV_DEVICE && KVM
+       help
+               This driver grants access to Adjunct Processor (AP) devices
+               via the VFIO mediated device interface.
+
+               To compile this driver as a module, choose M here: the module
+               will be called vfio_ap.
+
 endmenu
 
 menu "Dump support"
index 29c940b..d5d2488 100644 (file)
@@ -44,6 +44,7 @@
 #define KVM_REQ_ICPT_OPEREXC   KVM_ARCH_REQ(2)
 #define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3)
 #define KVM_REQ_STOP_MIGRATION  KVM_ARCH_REQ(4)
+#define KVM_REQ_VSIE_RESTART   KVM_ARCH_REQ(5)
 
 #define SIGP_CTRL_C            0x80
 #define SIGP_CTRL_SCN_MASK     0x3f
@@ -186,6 +187,7 @@ struct kvm_s390_sie_block {
 #define ECA_AIV                0x00200000
 #define ECA_VX         0x00020000
 #define ECA_PROTEXCI   0x00002000
+#define ECA_APIE       0x00000008
 #define ECA_SII                0x00000001
        __u32   eca;                    /* 0x004c */
 #define ICPT_INST      0x04
@@ -237,7 +239,11 @@ struct kvm_s390_sie_block {
        psw_t   gpsw;                   /* 0x0090 */
        __u64   gg14;                   /* 0x00a0 */
        __u64   gg15;                   /* 0x00a8 */
-       __u8    reservedb0[20];         /* 0x00b0 */
+       __u8    reservedb0[8];          /* 0x00b0 */
+#define HPID_KVM       0x4
+#define HPID_VSIE      0x5
+       __u8    hpid;                   /* 0x00b8 */
+       __u8    reservedb9[11];         /* 0x00b9 */
        __u16   extcpuaddr;             /* 0x00c4 */
        __u16   eic;                    /* 0x00c6 */
        __u32   reservedc8;             /* 0x00c8 */
@@ -255,6 +261,8 @@ struct kvm_s390_sie_block {
        __u8    reservede4[4];          /* 0x00e4 */
        __u64   tecmc;                  /* 0x00e8 */
        __u8    reservedf0[12];         /* 0x00f0 */
+#define CRYCB_FORMAT_MASK 0x00000003
+#define CRYCB_FORMAT0 0x00000000
 #define CRYCB_FORMAT1 0x00000001
 #define CRYCB_FORMAT2 0x00000003
        __u32   crycbd;                 /* 0x00fc */
@@ -715,6 +723,7 @@ struct kvm_s390_crypto {
        __u32 crycbd;
        __u8 aes_kw;
        __u8 dea_kw;
+       __u8 apie;
 };
 
 #define APCB0_MASK_SIZE 1
@@ -855,6 +864,10 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
                                 struct kvm_async_pf *work);
 
+void kvm_arch_crypto_clear_masks(struct kvm *kvm);
+void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
+                              unsigned long *aqm, unsigned long *adm);
+
 extern int sie64a(struct kvm_s390_sie_block *, u64 *);
 extern char sie_exit;
 
index 9a50f02..16511d9 100644 (file)
@@ -160,6 +160,8 @@ struct kvm_s390_vm_cpu_subfunc {
 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW       1
 #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW      2
 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW      3
+#define KVM_S390_VM_CRYPTO_ENABLE_APIE         4
+#define KVM_S390_VM_CRYPTO_DISABLE_APIE                5
 
 /* kvm attributes for migration mode */
 #define KVM_S390_VM_MIGRATION_STOP     0
index ac5da6b..fe24150 100644 (file)
@@ -40,6 +40,7 @@
 #include <asm/sclp.h>
 #include <asm/cpacf.h>
 #include <asm/timex.h>
+#include <asm/ap.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 
@@ -844,20 +845,24 @@ void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm)
 
        kvm_s390_vcpu_block_all(kvm);
 
-       kvm_for_each_vcpu(i, vcpu, kvm)
+       kvm_for_each_vcpu(i, vcpu, kvm) {
                kvm_s390_vcpu_crypto_setup(vcpu);
+               /* recreate the shadow crycb by leaving the VSIE handler */
+               kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu);
+       }
 
        kvm_s390_vcpu_unblock_all(kvm);
 }
 
 static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
 {
-       if (!test_kvm_facility(kvm, 76))
-               return -EINVAL;
-
        mutex_lock(&kvm->lock);
        switch (attr->attr) {
        case KVM_S390_VM_CRYPTO_ENABLE_AES_KW:
+               if (!test_kvm_facility(kvm, 76)) {
+                       mutex_unlock(&kvm->lock);
+                       return -EINVAL;
+               }
                get_random_bytes(
                        kvm->arch.crypto.crycb->aes_wrapping_key_mask,
                        sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
@@ -865,6 +870,10 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
                VM_EVENT(kvm, 3, "%s", "ENABLE: AES keywrapping support");
                break;
        case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW:
+               if (!test_kvm_facility(kvm, 76)) {
+                       mutex_unlock(&kvm->lock);
+                       return -EINVAL;
+               }
                get_random_bytes(
                        kvm->arch.crypto.crycb->dea_wrapping_key_mask,
                        sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
@@ -872,17 +881,39 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
                VM_EVENT(kvm, 3, "%s", "ENABLE: DEA keywrapping support");
                break;
        case KVM_S390_VM_CRYPTO_DISABLE_AES_KW:
+               if (!test_kvm_facility(kvm, 76)) {
+                       mutex_unlock(&kvm->lock);
+                       return -EINVAL;
+               }
                kvm->arch.crypto.aes_kw = 0;
                memset(kvm->arch.crypto.crycb->aes_wrapping_key_mask, 0,
                        sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
                VM_EVENT(kvm, 3, "%s", "DISABLE: AES keywrapping support");
                break;
        case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW:
+               if (!test_kvm_facility(kvm, 76)) {
+                       mutex_unlock(&kvm->lock);
+                       return -EINVAL;
+               }
                kvm->arch.crypto.dea_kw = 0;
                memset(kvm->arch.crypto.crycb->dea_wrapping_key_mask, 0,
                        sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
                VM_EVENT(kvm, 3, "%s", "DISABLE: DEA keywrapping support");
                break;
+       case KVM_S390_VM_CRYPTO_ENABLE_APIE:
+               if (!ap_instructions_available()) {
+                       mutex_unlock(&kvm->lock);
+                       return -EOPNOTSUPP;
+               }
+               kvm->arch.crypto.apie = 1;
+               break;
+       case KVM_S390_VM_CRYPTO_DISABLE_APIE:
+               if (!ap_instructions_available()) {
+                       mutex_unlock(&kvm->lock);
+                       return -EOPNOTSUPP;
+               }
+               kvm->arch.crypto.apie = 0;
+               break;
        default:
                mutex_unlock(&kvm->lock);
                return -ENXIO;
@@ -1491,6 +1522,10 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
                case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW:
                        ret = 0;
                        break;
+               case KVM_S390_VM_CRYPTO_ENABLE_APIE:
+               case KVM_S390_VM_CRYPTO_DISABLE_APIE:
+                       ret = ap_instructions_available() ? 0 : -ENXIO;
+                       break;
                default:
                        ret = -ENXIO;
                        break;
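
A minimal userspace sketch of how the new APIE attribute would be toggled: s390 exposes these as VM attributes in the KVM_S390_VM_CRYPTO group, so a VMM goes through KVM_HAS_DEVICE_ATTR / KVM_SET_DEVICE_ATTR on the VM file descriptor (constant names as added above; error handling trimmed):

#include <sys/ioctl.h>
#include <linux/kvm.h>

int enable_apie(int vm_fd)
{
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_CRYPTO,
		.attr  = KVM_S390_VM_CRYPTO_ENABLE_APIE,
	};

	/* Fails with -ENXIO when the host has no AP instructions. */
	if (ioctl(vm_fd, KVM_HAS_DEVICE_ATTR, &attr))
		return -1;

	return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
}
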
@@ -1992,55 +2027,101 @@ long kvm_arch_vm_ioctl(struct file *filp,
        return r;
 }
 
-static int kvm_s390_query_ap_config(u8 *config)
-{
-       u32 fcn_code = 0x04000000UL;
-       u32 cc = 0;
-
-       memset(config, 0, 128);
-       asm volatile(
-               "lgr 0,%1\n"
-               "lgr 2,%2\n"
-               ".long 0xb2af0000\n"            /* PQAP(QCI) */
-               "0: ipm %0\n"
-               "srl %0,28\n"
-               "1:\n"
-               EX_TABLE(0b, 1b)
-               : "+r" (cc)
-               : "r" (fcn_code), "r" (config)
-               : "cc", "0", "2", "memory"
-       );
-
-       return cc;
-}
-
 static int kvm_s390_apxa_installed(void)
 {
-       u8 config[128];
-       int cc;
+       struct ap_config_info info;
 
-       if (test_facility(12)) {
-               cc = kvm_s390_query_ap_config(config);
-
-               if (cc)
-                       pr_err("PQAP(QCI) failed with cc=%d", cc);
-               else
-                       return config[0] & 0x40;
+       if (ap_instructions_available()) {
+               if (ap_qci(&info) == 0)
+                       return info.apxa;
        }
 
        return 0;
 }
 
+/*
+ * The format of the crypto control block (CRYCB) is specified in the 3 low
+ * order bits of the CRYCB designation (CRYCBD) field as follows:
+ * Format 0: Neither the message security assist extension 3 (MSAX3) nor the
+ *          AP extended addressing (APXA) facility is installed.
+ * Format 1: The APXA facility is not installed but the MSAX3 facility is.
+ * Format 2: Both the APXA and MSAX3 facilities are installed.
+ */
 static void kvm_s390_set_crycb_format(struct kvm *kvm)
 {
        kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb;
 
+       /* Clear the CRYCB format bits - i.e., set format 0 by default */
+       kvm->arch.crypto.crycbd &= ~(CRYCB_FORMAT_MASK);
+
+       /* Check whether MSAX3 is installed */
+       if (!test_kvm_facility(kvm, 76))
+               return;
+
        if (kvm_s390_apxa_installed())
                kvm->arch.crypto.crycbd |= CRYCB_FORMAT2;
        else
                kvm->arch.crypto.crycbd |= CRYCB_FORMAT1;
 }
 
+void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
+                              unsigned long *aqm, unsigned long *adm)
+{
+       struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb;
+
+       mutex_lock(&kvm->lock);
+       kvm_s390_vcpu_block_all(kvm);
+
+       switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) {
+       case CRYCB_FORMAT2: /* APCB1 uses 256 bits */
+               memcpy(crycb->apcb1.apm, apm, 32);
+               VM_EVENT(kvm, 3, "SET CRYCB: apm %016lx %016lx %016lx %016lx",
+                        apm[0], apm[1], apm[2], apm[3]);
+               memcpy(crycb->apcb1.aqm, aqm, 32);
+               VM_EVENT(kvm, 3, "SET CRYCB: aqm %016lx %016lx %016lx %016lx",
+                        aqm[0], aqm[1], aqm[2], aqm[3]);
+               memcpy(crycb->apcb1.adm, adm, 32);
+               VM_EVENT(kvm, 3, "SET CRYCB: adm %016lx %016lx %016lx %016lx",
+                        adm[0], adm[1], adm[2], adm[3]);
+               break;
+       case CRYCB_FORMAT1:
+       case CRYCB_FORMAT0: /* Fall through - both use APCB0 */
+               memcpy(crycb->apcb0.apm, apm, 8);
+               memcpy(crycb->apcb0.aqm, aqm, 2);
+               memcpy(crycb->apcb0.adm, adm, 2);
+               VM_EVENT(kvm, 3, "SET CRYCB: apm %016lx aqm %04x adm %04x",
+                        apm[0], *((unsigned short *)aqm),
+                        *((unsigned short *)adm));
+               break;
+       default:        /* Cannot happen */
+               break;
+       }
+
+       /* recreate the shadow crycb for each vcpu */
+       kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART);
+       kvm_s390_vcpu_unblock_all(kvm);
+       mutex_unlock(&kvm->lock);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks);
+
+void kvm_arch_crypto_clear_masks(struct kvm *kvm)
+{
+       mutex_lock(&kvm->lock);
+       kvm_s390_vcpu_block_all(kvm);
+
+       memset(&kvm->arch.crypto.crycb->apcb0, 0,
+              sizeof(kvm->arch.crypto.crycb->apcb0));
+       memset(&kvm->arch.crypto.crycb->apcb1, 0,
+              sizeof(kvm->arch.crypto.crycb->apcb1));
+
+       VM_EVENT(kvm, 3, "%s", "CLR CRYCB:");
+       /* recreate the shadow crycb for each vcpu */
+       kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART);
+       kvm_s390_vcpu_unblock_all(kvm);
+       mutex_unlock(&kvm->lock);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks);
+
 static u64 kvm_s390_get_initial_cpuid(void)
 {
        struct cpuid cpuid;
@@ -2052,12 +2133,12 @@ static u64 kvm_s390_get_initial_cpuid(void)
 
 static void kvm_s390_crypto_init(struct kvm *kvm)
 {
-       if (!test_kvm_facility(kvm, 76))
-               return;
-
        kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb;
        kvm_s390_set_crycb_format(kvm);
 
+       if (!test_kvm_facility(kvm, 76))
+               return;
+
        /* Enable AES/DEA protected key functions by default */
        kvm->arch.crypto.aes_kw = 1;
        kvm->arch.crypto.dea_kw = 1;
@@ -2583,17 +2664,25 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
 {
-       if (!test_kvm_facility(vcpu->kvm, 76))
+       /*
+        * If the AP instructions are not being interpreted and the MSAX3
+        * facility is not configured for the guest, there is nothing to set up.
+        */
+       if (!vcpu->kvm->arch.crypto.apie && !test_kvm_facility(vcpu->kvm, 76))
                return;
 
+       vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd;
        vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA);
+       vcpu->arch.sie_block->eca &= ~ECA_APIE;
+
+       if (vcpu->kvm->arch.crypto.apie)
+               vcpu->arch.sie_block->eca |= ECA_APIE;
 
+       /* Set up protected key support */
        if (vcpu->kvm->arch.crypto.aes_kw)
                vcpu->arch.sie_block->ecb3 |= ECB3_AES;
        if (vcpu->kvm->arch.crypto.dea_kw)
                vcpu->arch.sie_block->ecb3 |= ECB3_DEA;
-
-       vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd;
 }
 
 void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu)
@@ -2685,6 +2774,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
 
+       vcpu->arch.sie_block->hpid = HPID_KVM;
+
        kvm_s390_vcpu_crypto_setup(vcpu);
 
        return rc;
@@ -2768,18 +2859,25 @@ static void kvm_s390_vcpu_request(struct kvm_vcpu *vcpu)
        exit_sie(vcpu);
 }
 
+bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu)
+{
+       return atomic_read(&vcpu->arch.sie_block->prog20) &
+              (PROG_BLOCK_SIE | PROG_REQUEST);
+}
+
 static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu)
 {
        atomic_andnot(PROG_REQUEST, &vcpu->arch.sie_block->prog20);
 }
 
 /*
- * Kick a guest cpu out of SIE and wait until SIE is not running.
+ * Kick a guest cpu out of (v)SIE and wait until (v)SIE is not running.
  * If the CPU is not running (e.g. waiting as idle) the function will
  * return immediately. */
 void exit_sie(struct kvm_vcpu *vcpu)
 {
        kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT);
+       kvm_s390_vsie_kick(vcpu);
        while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE)
                cpu_relax();
 }
@@ -3196,6 +3294,8 @@ retry:
 
        /* nothing to do, just clear the request */
        kvm_clear_request(KVM_REQ_UNHALT, vcpu);
+       /* we left the vsie handler, nothing to do, just clear the request */
+       kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu);
 
        return 0;
 }
index 981e3ba..1f6e36c 100644 (file)
@@ -290,6 +290,7 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu);
+bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu);
 void exit_sie(struct kvm_vcpu *vcpu);
 void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu);
 int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
index a2b28cd..a153257 100644 (file)
@@ -135,14 +135,148 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        atomic_set(&scb_s->cpuflags, newflags);
        return 0;
 }
+/* Copy to APCB FORMAT1 from APCB FORMAT0 */
+static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s,
+                       unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h)
+{
+       struct kvm_s390_apcb0 tmp;
 
-/*
+       if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0)))
+               return -EFAULT;
+
+       apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0];
+       apcb_s->aqm[0] = apcb_h->aqm[0] & tmp.aqm[0] & 0xffff000000000000UL;
+       apcb_s->adm[0] = apcb_h->adm[0] & tmp.adm[0] & 0xffff000000000000UL;
+
+       return 0;
+
+}
+
+/**
+ * setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0
+ * @vcpu: pointer to the virtual CPU
+ * @apcb_s: pointer to start of apcb in the shadow crycb
+ * @apcb_o: pointer to start of original apcb in the guest2
+ * @apcb_h: pointer to start of apcb in the guest1
+ *
+ * Returns 0 on success, or -EFAULT on error reading the guest apcb
+ */
+static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
+                       unsigned long apcb_o, unsigned long *apcb_h)
+{
+       if (read_guest_real(vcpu, apcb_o, apcb_s,
+                           sizeof(struct kvm_s390_apcb0)))
+               return -EFAULT;
+
+       bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0));
+
+       return 0;
+}
+
+/**
+ * setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB
+ * @vcpu: pointer to the virtual CPU
+ * @apcb_s: pointer to start of apcb in the shadow crycb
+ * @apcb_o: pointer to start of original guest apcb
+ * @apcb_h: pointer to start of apcb in the host
+ *
+ * Returns 0 on success, or -EFAULT on error reading the guest apcb
+ */
+static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
+                       unsigned long apcb_o,
+                       unsigned long *apcb_h)
+{
+       if (read_guest_real(vcpu, apcb_o, apcb_s,
+                           sizeof(struct kvm_s390_apcb1)))
+               return -EFAULT;
+
+       bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1));
+
+       return 0;
+}
+
+/**
+ * setup_apcb - Create a shadow copy of the apcb.
+ * @vcpu: pointer to the virtual CPU
+ * @crycb_s: pointer to shadow crycb
+ * @crycb_o: pointer to original guest crycb
+ * @crycb_h: pointer to the host crycb
+ * @fmt_o: format of the original guest crycb.
+ * @fmt_h: format of the host crycb.
+ *
+ * Checks the compatibility between the guest and host crycb and calls the
+ * appropriate copy function.
+ *
+ * Returns 0 on success, or an error number if the guest and host crycbs are incompatible.
+ */
+static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s,
+              const u32 crycb_o,
+              struct kvm_s390_crypto_cb *crycb_h,
+              int fmt_o, int fmt_h)
+{
+       struct kvm_s390_crypto_cb *crycb;
+
+       crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o;
+
+       switch (fmt_o) {
+       case CRYCB_FORMAT2:
+               if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK))
+                       return -EACCES;
+               if (fmt_h != CRYCB_FORMAT2)
+                       return -EINVAL;
+               return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1,
+                                   (unsigned long) &crycb->apcb1,
+                                   (unsigned long *)&crycb_h->apcb1);
+       case CRYCB_FORMAT1:
+               switch (fmt_h) {
+               case CRYCB_FORMAT2:
+                       return setup_apcb10(vcpu, &crycb_s->apcb1,
+                                           (unsigned long) &crycb->apcb0,
+                                           &crycb_h->apcb1);
+               case CRYCB_FORMAT1:
+                       return setup_apcb00(vcpu,
+                                           (unsigned long *) &crycb_s->apcb0,
+                                           (unsigned long) &crycb->apcb0,
+                                           (unsigned long *) &crycb_h->apcb0);
+               }
+               break;
+       case CRYCB_FORMAT0:
+               if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK))
+                       return -EACCES;
+
+               switch (fmt_h) {
+               case CRYCB_FORMAT2:
+                       return setup_apcb10(vcpu, &crycb_s->apcb1,
+                                           (unsigned long) &crycb->apcb0,
+                                           &crycb_h->apcb1);
+               case CRYCB_FORMAT1:
+               case CRYCB_FORMAT0:
+                       return setup_apcb00(vcpu,
+                                           (unsigned long *) &crycb_s->apcb0,
+                                           (unsigned long) &crycb->apcb0,
+                                           (unsigned long *) &crycb_h->apcb0);
+               }
+       }
+       return -EINVAL;
+}
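+
+The common rule behind all three setup_apcb* helpers is that the shadow APCB
+handed to the nested guest is the bitwise AND of what guest 2 grants and what
+the host granted guest 2.  A standalone sketch of that masking (plain C, not
+kernel code; the 256-bit width matches the format-1 APM):
+
+#include <stdint.h>
+#include <stdio.h>
+
+#define APCB_WORDS 4	/* 4 x 64 bits = 256-bit mask */
+
+static void shadow_apcb(uint64_t *shadow, const uint64_t *g2, const uint64_t *host)
+{
+	for (int i = 0; i < APCB_WORDS; i++)
+		shadow[i] = g2[i] & host[i];	/* nested guest never exceeds the host grant */
+}
+
+int main(void)
+{
+	/* AP bits are numbered from the left: 0xff00... means adapters 0-7. */
+	uint64_t g2[APCB_WORDS]     = { 0xff00000000000000ULL };	/* guest 2 grants APs 0-7 */
+	uint64_t host[APCB_WORDS]   = { 0x0f00000000000000ULL };	/* host granted APs 4-7 */
+	uint64_t shadow[APCB_WORDS] = { 0 };
+
+	shadow_apcb(shadow, g2, host);
+	printf("shadow apm[0] = %#018llx\n", (unsigned long long)shadow[0]);	/* APs 4-7 only */
+	return 0;
+}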
+
+/**
+ * shadow_crycb - Create a shadow copy of the crycb block
+ * @vcpu: a pointer to the virtual CPU
+ * @vsie_page: a pointer to internal data used for the vSIE
+ *
  * Create a shadow copy of the crycb block and setup key wrapping, if
  * requested for guest 3 and enabled for guest 2.
  *
- * We only accept format-1 (no AP in g2), but convert it into format-2
+ * We accept format-1 or format-2, but we convert format-1 into format-2
+ * in the shadow CRYCB.
+ * Using format-2 enables the firmware to choose the right format when
+ * scheduling the SIE.
  * There is nothing to do for format-0.
  *
+ * This function centralizes the issuing of set_validity_icpt() for all
+ * the subfunctions working on the crycb.
+ *
  * Returns: - 0 if shadowed or nothing to do
  *          - > 0 if control has to be given to guest 2
  */
@@ -154,23 +288,40 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        const u32 crycb_addr = crycbd_o & 0x7ffffff8U;
        unsigned long *b1, *b2;
        u8 ecb3_flags;
+       int apie_h;
+       int key_msk = test_kvm_facility(vcpu->kvm, 76);
+       int fmt_o = crycbd_o & CRYCB_FORMAT_MASK;
+       int fmt_h = vcpu->arch.sie_block->crycbd & CRYCB_FORMAT_MASK;
+       int ret = 0;
 
        scb_s->crycbd = 0;
-       if (!(crycbd_o & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
-               return 0;
-       /* format-1 is supported with message-security-assist extension 3 */
-       if (!test_kvm_facility(vcpu->kvm, 76))
+
+       apie_h = vcpu->arch.sie_block->eca & ECA_APIE;
+       if (!apie_h && !key_msk)
                return 0;
+
+       if (!crycb_addr)
+               return set_validity_icpt(scb_s, 0x0039U);
+
+       if (fmt_o == CRYCB_FORMAT1)
+               if ((crycb_addr & PAGE_MASK) !=
+                   ((crycb_addr + 128) & PAGE_MASK))
+                       return set_validity_icpt(scb_s, 0x003CU);
+
+       if (apie_h && (scb_o->eca & ECA_APIE)) {
+               ret = setup_apcb(vcpu, &vsie_page->crycb, crycb_addr,
+                                vcpu->kvm->arch.crypto.crycb,
+                                fmt_o, fmt_h);
+               if (ret)
+                       goto end;
+               scb_s->eca |= scb_o->eca & ECA_APIE;
+       }
+
        /* we may only allow it if enabled for guest 2 */
        ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
                     (ECB3_AES | ECB3_DEA);
        if (!ecb3_flags)
-               return 0;
-
-       if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK))
-               return set_validity_icpt(scb_s, 0x003CU);
-       else if (!crycb_addr)
-               return set_validity_icpt(scb_s, 0x0039U);
+               goto end;
 
        /* copy only the wrapping keys */
        if (read_guest_real(vcpu, crycb_addr + 72,
@@ -178,8 +329,6 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                return set_validity_icpt(scb_s, 0x0035U);
 
        scb_s->ecb3 |= ecb3_flags;
-       scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 |
-                       CRYCB_FORMAT2;
 
        /* xor both blocks in one run */
        b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
@@ -187,6 +336,16 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                            vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
        /* as 56%8 == 0, bitmap_xor won't overwrite any data */
        bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
+end:
+       switch (ret) {
+       case -EINVAL:
+               return set_validity_icpt(scb_s, 0x0020U);
+       case -EFAULT:
+               return set_validity_icpt(scb_s, 0x0035U);
+       case -EACCES:
+               return set_validity_icpt(scb_s, 0x003CU);
+       }
+       scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2;
        return 0;
 }
 
@@ -383,6 +542,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        if (test_kvm_facility(vcpu->kvm, 156))
                scb_s->ecd |= scb_o->ecd & ECD_ETOKENF;
 
+       scb_s->hpid = HPID_VSIE;
+
        prepare_ibc(vcpu, vsie_page);
        rc = shadow_crycb(vcpu, vsie_page);
 out:
@@ -830,7 +991,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
        struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
        int guest_bp_isolation;
-       int rc;
+       int rc = 0;
 
        handle_last_fault(vcpu, vsie_page);
 
@@ -858,7 +1019,18 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        guest_enter_irqoff();
        local_irq_enable();
 
-       rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+       /*
+        * Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking
+        * and VCPU requests also hinder the vSIE from running and lead
+        * to an immediate exit. kvm_s390_vsie_kick() has to be used to
+        * also kick the vSIE.
+        */
+       vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
+       barrier();
+       if (!kvm_s390_vcpu_sie_inhibited(vcpu))
+               rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+       barrier();
+       vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE;
 
        local_irq_disable();
        guest_exit_irqoff();
@@ -1005,7 +1177,8 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                if (rc == -EAGAIN)
                        rc = 0;
                if (rc || scb_s->icptcode || signal_pending(current) ||
-                   kvm_s390_vcpu_has_irq(vcpu, 0))
+                   kvm_s390_vcpu_has_irq(vcpu, 0) ||
+                   kvm_s390_vcpu_sie_inhibited(vcpu))
                        break;
        }
 
@@ -1122,7 +1295,8 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
        if (unlikely(scb_addr & 0x1ffUL))
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0))
+       if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) ||
+           kvm_s390_vcpu_sie_inhibited(vcpu))
                return 0;
 
        vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
index 911c7de..1e668b9 100644 (file)
@@ -907,10 +907,16 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
        pmd_t *pmdp;
 
        BUG_ON(gmap_is_shadow(gmap));
-       spin_lock(&gmap->guest_table_lock);
        pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
+       if (!pmdp)
+               return NULL;
 
-       if (!pmdp || pmd_none(*pmdp)) {
+       /* without huge pages, there is no need to take the table lock */
+       if (!gmap->mm->context.allow_gmap_hpage_1m)
+               return pmd_none(*pmdp) ? NULL : pmdp;
+
+       spin_lock(&gmap->guest_table_lock);
+       if (pmd_none(*pmdp)) {
                spin_unlock(&gmap->guest_table_lock);
                return NULL;
        }
index 0c85aed..fd788e0 100644 (file)
@@ -106,6 +106,8 @@ static struct facility_def facility_defs[] = {
 
                .name = "FACILITIES_KVM_CPUMODEL",
                .bits = (int[]){
+                       12, /* AP Query Configuration Information */
+                       15, /* AP Facilities Test */
                        156, /* etoken facility */
                        -1  /* END */
                }
index 09b2e3e..55e51ff 100644 (file)
 #define UNMAPPED_GVA (~(gpa_t)0)
 
 /* KVM Hugepage definitions for x86 */
-#define KVM_NR_PAGE_SIZES      3
+enum {
+       PT_PAGE_TABLE_LEVEL   = 1,
+       PT_DIRECTORY_LEVEL    = 2,
+       PT_PDPE_LEVEL         = 3,
+       /* set max level to the biggest one */
+       PT_MAX_HUGEPAGE_LEVEL = PT_PDPE_LEVEL,
+};
+#define KVM_NR_PAGE_SIZES      (PT_MAX_HUGEPAGE_LEVEL - \
+                                PT_PAGE_TABLE_LEVEL + 1)
 #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
 #define KVM_HPAGE_SHIFT(x)     (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
 #define KVM_HPAGE_SIZE(x)      (1UL << KVM_HPAGE_SHIFT(x))
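
With 9 bits of GFN per level, the macros above give 4 KiB, 2 MiB and 1 GiB pages for levels 1 through 3.  A quick standalone check of that arithmetic (PAGE_SHIFT assumed to be 12, as on x86):

#include <stdio.h>

#define PAGE_SHIFT		12
#define KVM_HPAGE_GFN_SHIFT(x)	(((x) - 1) * 9)
#define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
#define KVM_HPAGE_SIZE(x)	(1UL << KVM_HPAGE_SHIFT(x))

int main(void)
{
	for (int level = 1; level <= 3; level++)	/* PT_PAGE_TABLE_LEVEL .. PT_PDPE_LEVEL */
		printf("level %d: %lu KiB\n", level, KVM_HPAGE_SIZE(level) >> 10);
	return 0;	/* prints 4, 2048, 1048576 */
}
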
@@ -177,6 +185,7 @@ enum {
 
 #define DR6_BD         (1 << 13)
 #define DR6_BS         (1 << 14)
+#define DR6_BT         (1 << 15)
 #define DR6_RTM                (1 << 16)
 #define DR6_FIXED_1    0xfffe0ff0
 #define DR6_INIT       0xffff0ff0
@@ -247,7 +256,7 @@ struct kvm_mmu_memory_cache {
  * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
  */
 union kvm_mmu_page_role {
-       unsigned word;
+       u32 word;
        struct {
                unsigned level:4;
                unsigned cr4_pae:1;
@@ -273,6 +282,34 @@ union kvm_mmu_page_role {
        };
 };
 
+union kvm_mmu_extended_role {
+/*
+ * This structure complements kvm_mmu_page_role, caching everything needed for
+ * MMU configuration. If nothing in either of these structures has changed, MMU
+ * re-configuration can be skipped. The @valid bit is set on first use so we
+ * don't treat an all-zero structure as valid data.
+ */
+       u32 word;
+       struct {
+               unsigned int valid:1;
+               unsigned int execonly:1;
+               unsigned int cr0_pg:1;
+               unsigned int cr4_pse:1;
+               unsigned int cr4_pke:1;
+               unsigned int cr4_smap:1;
+               unsigned int cr4_smep:1;
+               unsigned int cr4_la57:1;
+       };
+};
+
+union kvm_mmu_role {
+       u64 as_u64;
+       struct {
+               union kvm_mmu_page_role base;
+               union kvm_mmu_extended_role ext;
+       };
+};
+
 struct kvm_rmap_head {
        unsigned long val;
 };
@@ -280,18 +317,18 @@ struct kvm_rmap_head {
 struct kvm_mmu_page {
        struct list_head link;
        struct hlist_node hash_link;
+       bool unsync;
 
        /*
         * The following two entries are used to key the shadow page in the
         * hash table.
         */
-       gfn_t gfn;
        union kvm_mmu_page_role role;
+       gfn_t gfn;
 
        u64 *spt;
        /* hold the gfn of each spte inside spt */
        gfn_t *gfns;
-       bool unsync;
        int root_count;          /* Currently serving as active root */
        unsigned int unsync_children;
        struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
@@ -360,7 +397,7 @@ struct kvm_mmu {
        void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                           u64 *spte, const void *pte);
        hpa_t root_hpa;
-       union kvm_mmu_page_role base_role;
+       union kvm_mmu_role mmu_role;
        u8 root_level;
        u8 shadow_root_level;
        u8 ept_ad;
@@ -490,7 +527,7 @@ struct kvm_vcpu_hv {
        struct kvm_hyperv_exit exit;
        struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
        DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
-       cpumask_t tlb_lush;
+       cpumask_t tlb_flush;
 };
 
 struct kvm_vcpu_arch {
@@ -534,7 +571,13 @@ struct kvm_vcpu_arch {
         * the paging mode of the l1 guest. This context is always used to
         * handle faults.
         */
-       struct kvm_mmu mmu;
+       struct kvm_mmu *mmu;
+
+       /* Non-nested MMU for L1 */
+       struct kvm_mmu root_mmu;
+
+       /* L1 MMU when running nested */
+       struct kvm_mmu guest_mmu;
 
        /*
         * Paging state of an L2 guest (used for nested npt)
@@ -585,6 +628,8 @@ struct kvm_vcpu_arch {
                bool has_error_code;
                u8 nr;
                u32 error_code;
+               unsigned long payload;
+               bool has_payload;
                u8 nested_apf;
        } exception;
 
@@ -781,6 +826,9 @@ struct kvm_hv {
        u64 hv_reenlightenment_control;
        u64 hv_tsc_emulation_control;
        u64 hv_tsc_emulation_status;
+
+       /* How many vCPUs have VP index != vCPU index */
+       atomic_t num_mismatched_vp_indexes;
 };
 
 enum kvm_irqchip_mode {
@@ -871,6 +919,7 @@ struct kvm_arch {
        bool x2apic_broadcast_quirk_disabled;
 
        bool guest_can_read_msr_platform_info;
+       bool exception_payload_enabled;
 };
 
 struct kvm_vm_stat {
@@ -1133,6 +1182,9 @@ struct kvm_x86_ops {
        int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
 
        int (*get_msr_feature)(struct kvm_msr_entry *entry);
+
+       int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu,
+                                  uint16_t *vmcs_version);
 };
 
 struct kvm_arch_async_pf {
@@ -1170,7 +1222,6 @@ void kvm_mmu_module_exit(void);
 
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
-void kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
@@ -1324,7 +1375,8 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free);
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+                       ulong roots_to_free);
 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
                           struct x86_exception *exception);
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
index e05e0d3..1fc7a0d 100644 (file)
@@ -40,7 +40,7 @@ static inline int cpu_has_vmx(void)
  */
 static inline void cpu_vmxoff(void)
 {
-       asm volatile (ASM_VMX_VMXOFF : : : "cc");
+       asm volatile ("vmxoff");
        cr4_clear_bits(X86_CR4_VMXE);
 }
 
index 9527ba5..ade0f15 100644 (file)
@@ -503,19 +503,6 @@ enum vmcs_field {
 
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR                0xfffbc000ul
 
-
-#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
-#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
-#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
-#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
-#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
-#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
-#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_INVEPT           ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
-#define ASM_VMX_INVVPID                  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
-
 struct vmx_msr_entry {
        u32 index;
        u32 reserved;
index fd23d57..dabfcf7 100644 (file)
@@ -288,6 +288,7 @@ struct kvm_reinject_control {
 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR        0x00000002
 #define KVM_VCPUEVENT_VALID_SHADOW     0x00000004
 #define KVM_VCPUEVENT_VALID_SMM                0x00000008
+#define KVM_VCPUEVENT_VALID_PAYLOAD    0x00000010
 
 /* Interrupt shadow states */
 #define KVM_X86_SHADOW_INT_MOV_SS      0x01
@@ -299,7 +300,7 @@ struct kvm_vcpu_events {
                __u8 injected;
                __u8 nr;
                __u8 has_error_code;
-               __u8 pad;
+               __u8 pending;
                __u32 error_code;
        } exception;
        struct {
@@ -322,7 +323,9 @@ struct kvm_vcpu_events {
                __u8 smm_inside_nmi;
                __u8 latched_init;
        } smi;
-       __u32 reserved[9];
+       __u8 reserved[27];
+       __u8 exception_has_payload;
+       __u64 exception_payload;
 };
 
 /* for KVM_GET/SET_DEBUGREGS */
@@ -381,6 +384,7 @@ struct kvm_sync_regs {
 
 #define KVM_STATE_NESTED_GUEST_MODE    0x00000001
 #define KVM_STATE_NESTED_RUN_PENDING   0x00000002
+#define KVM_STATE_NESTED_EVMCS         0x00000004
 
 #define KVM_STATE_NESTED_SMM_GUEST_MODE        0x00000001
 #define KVM_STATE_NESTED_SMM_VMXON     0x00000002
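
A rough userspace sketch of how the new exception-payload fields are consumed: the VMM opts in with KVM_CAP_EXCEPTION_PAYLOAD on the VM and then reads the payload (the would-be CR2 value for #PF, or the DR6 bits for #DB) via KVM_GET_VCPU_EVENTS (field names as defined above; error handling trimmed):

#include <sys/ioctl.h>
#include <linux/kvm.h>

int read_exception_payload(int vm_fd, int vcpu_fd, unsigned long long *payload)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_EXCEPTION_PAYLOAD };
	struct kvm_vcpu_events events;

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		return -1;
	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events))
		return -1;

	if (!events.exception_has_payload)
		return 0;
	*payload = events.exception_payload;
	return 1;
}
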
index 01d209a..4e80080 100644 (file)
@@ -36,6 +36,8 @@
 
 #include "trace.h"
 
+#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
+
 static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
 {
        return atomic64_read(&synic->sint[sint]);
@@ -132,8 +134,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
        struct kvm_vcpu *vcpu = NULL;
        int i;
 
-       if (vpidx < KVM_MAX_VCPUS)
-               vcpu = kvm_get_vcpu(kvm, vpidx);
+       if (vpidx >= KVM_MAX_VCPUS)
+               return NULL;
+
+       vcpu = kvm_get_vcpu(kvm, vpidx);
        if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
                return vcpu;
        kvm_for_each_vcpu(i, vcpu, kvm)
@@ -689,6 +693,24 @@ void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
                stimer_cleanup(&hv_vcpu->stimer[i]);
 }
 
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+       if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
+               return false;
+       return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled);
+
+bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
+                           struct hv_vp_assist_page *assist_page)
+{
+       if (!kvm_hv_assist_page_enabled(vcpu))
+               return false;
+       return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+                                     assist_page, sizeof(*assist_page));
+}
+EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page);
+
 static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
 {
        struct hv_message *msg = &stimer->msg;
@@ -1040,21 +1062,41 @@ static u64 current_task_runtime_100ns(void)
 
 static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 {
-       struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
+       struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
 
        switch (msr) {
-       case HV_X64_MSR_VP_INDEX:
-               if (!host)
+       case HV_X64_MSR_VP_INDEX: {
+               struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+               int vcpu_idx = kvm_vcpu_get_idx(vcpu);
+               u32 new_vp_index = (u32)data;
+
+               if (!host || new_vp_index >= KVM_MAX_VCPUS)
                        return 1;
-               hv->vp_index = (u32)data;
+
+               if (new_vp_index == hv_vcpu->vp_index)
+                       return 0;
+
+               /*
+                * kvm_hv_vcpu_postcreate so they initially match.  Now that
+                * the VP index is changing, adjust num_mismatched_vp_indexes
+                * if it now matches or no longer matches vcpu_idx.
+                * it now matches or no longer matches vcpu_idx.
+                */
+               if (hv_vcpu->vp_index == vcpu_idx)
+                       atomic_inc(&hv->num_mismatched_vp_indexes);
+               else if (new_vp_index == vcpu_idx)
+                       atomic_dec(&hv->num_mismatched_vp_indexes);
+
+               hv_vcpu->vp_index = new_vp_index;
                break;
+       }
        case HV_X64_MSR_VP_ASSIST_PAGE: {
                u64 gfn;
                unsigned long addr;
 
                if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
-                       hv->hv_vapic = data;
-                       if (kvm_lapic_enable_pv_eoi(vcpu, 0))
+                       hv_vcpu->hv_vapic = data;
+                       if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0))
                                return 1;
                        break;
                }
@@ -1062,12 +1104,19 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
                addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
                if (kvm_is_error_hva(addr))
                        return 1;
-               if (__clear_user((void __user *)addr, PAGE_SIZE))
+
+               /*
+                * Clear apic_assist portion of struct hv_vp_assist_page
+                * only, there can be valuable data in the rest which needs
+                * to be preserved e.g. on migration.
+                */
+               if (__clear_user((void __user *)addr, sizeof(u32)))
                        return 1;
-               hv->hv_vapic = data;
+               hv_vcpu->hv_vapic = data;
                kvm_vcpu_mark_page_dirty(vcpu, gfn);
                if (kvm_lapic_enable_pv_eoi(vcpu,
-                                           gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
+                                           gfn_to_gpa(gfn) | KVM_MSR_ENABLED,
+                                           sizeof(struct hv_vp_assist_page)))
                        return 1;
                break;
        }
@@ -1080,7 +1129,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
        case HV_X64_MSR_VP_RUNTIME:
                if (!host)
                        return 1;
-               hv->runtime_offset = data - current_task_runtime_100ns();
+               hv_vcpu->runtime_offset = data - current_task_runtime_100ns();
                break;
        case HV_X64_MSR_SCONTROL:
        case HV_X64_MSR_SVERSION:
@@ -1172,11 +1221,11 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
                          bool host)
 {
        u64 data = 0;
-       struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
+       struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
 
        switch (msr) {
        case HV_X64_MSR_VP_INDEX:
-               data = hv->vp_index;
+               data = hv_vcpu->vp_index;
                break;
        case HV_X64_MSR_EOI:
                return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
@@ -1185,10 +1234,10 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
        case HV_X64_MSR_TPR:
                return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
        case HV_X64_MSR_VP_ASSIST_PAGE:
-               data = hv->hv_vapic;
+               data = hv_vcpu->hv_vapic;
                break;
        case HV_X64_MSR_VP_RUNTIME:
-               data = current_task_runtime_100ns() + hv->runtime_offset;
+               data = current_task_runtime_100ns() + hv_vcpu->runtime_offset;
                break;
        case HV_X64_MSR_SCONTROL:
        case HV_X64_MSR_SVERSION:
@@ -1255,32 +1304,47 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
                return kvm_hv_get_msr(vcpu, msr, pdata, host);
 }
 
-static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
+static __always_inline unsigned long *sparse_set_to_vcpu_mask(
+       struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask,
+       u64 *vp_bitmap, unsigned long *vcpu_bitmap)
 {
-       int i = 0, j;
+       struct kvm_hv *hv = &kvm->arch.hyperv;
+       struct kvm_vcpu *vcpu;
+       int i, bank, sbank = 0;
 
-       if (!(valid_bank_mask & BIT_ULL(bank_no)))
-               return -1;
+       memset(vp_bitmap, 0,
+              KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap));
+       for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
+                        KVM_HV_MAX_SPARSE_VCPU_SET_BITS)
+               vp_bitmap[bank] = sparse_banks[sbank++];
 
-       for (j = 0; j < bank_no; j++)
-               if (valid_bank_mask & BIT_ULL(j))
-                       i++;
+       if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) {
+               /* for all vcpus vp_index == vcpu_idx */
+               return (unsigned long *)vp_bitmap;
+       }
 
-       return i;
+       bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index,
+                            (unsigned long *)vp_bitmap))
+                       __set_bit(i, vcpu_bitmap);
+       }
+       return vcpu_bitmap;
 }
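
The bank unpacking above is easier to see in isolation: each set bit in valid_bank_mask selects one 64-VP bank, and the banks present are packed back to back in sparse_banks[].  A standalone sketch of that expansion (plain C, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define MAX_BANKS 64

static void expand_sparse_set(uint64_t valid_bank_mask,
			      const uint64_t *sparse_banks,
			      uint64_t *vp_bitmap)
{
	int sbank = 0;

	for (int bank = 0; bank < MAX_BANKS; bank++) {
		vp_bitmap[bank] = 0;
		if (valid_bank_mask & (1ULL << bank))
			vp_bitmap[bank] = sparse_banks[sbank++];	/* consume next packed bank */
	}
}

int main(void)
{
	uint64_t banks[] = { 0x1, 0x8000000000000000ULL };	/* two packed banks */
	uint64_t vp_bitmap[MAX_BANKS];

	expand_sparse_set(0x5, banks, vp_bitmap);	/* banks 0 and 2 are present */
	/* VP 0 (bank 0, bit 0) and VP 191 (bank 2, bit 63) are targeted. */
	printf("bank0=%#llx bank2=%#llx\n",
	       (unsigned long long)vp_bitmap[0], (unsigned long long)vp_bitmap[2]);
	return 0;
}
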
 
 static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
                            u16 rep_cnt, bool ex)
 {
        struct kvm *kvm = current_vcpu->kvm;
-       struct kvm_vcpu_hv *hv_current = &current_vcpu->arch.hyperv;
+       struct kvm_vcpu_hv *hv_vcpu = &current_vcpu->arch.hyperv;
        struct hv_tlb_flush_ex flush_ex;
        struct hv_tlb_flush flush;
-       struct kvm_vcpu *vcpu;
-       unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0};
-       unsigned long valid_bank_mask = 0;
+       u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+       DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+       unsigned long *vcpu_mask;
+       u64 valid_bank_mask;
        u64 sparse_banks[64];
-       int sparse_banks_len, i;
+       int sparse_banks_len;
        bool all_cpus;
 
        if (!ex) {
@@ -1290,6 +1354,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
                trace_kvm_hv_flush_tlb(flush.processor_mask,
                                       flush.address_space, flush.flags);
 
+               valid_bank_mask = BIT_ULL(0);
                sparse_banks[0] = flush.processor_mask;
                all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
        } else {
@@ -1306,7 +1371,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
                all_cpus = flush_ex.hv_vp_set.format !=
                        HV_GENERIC_SET_SPARSE_4K;
 
-               sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
+               sparse_banks_len =
+                       bitmap_weight((unsigned long *)&valid_bank_mask, 64) *
                        sizeof(sparse_banks[0]);
 
                if (!sparse_banks_len && !all_cpus)
@@ -1321,48 +1387,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
                        return HV_STATUS_INVALID_HYPERCALL_INPUT;
        }
 
-       cpumask_clear(&hv_current->tlb_lush);
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
-               int bank = hv->vp_index / 64, sbank = 0;
-
-               if (!all_cpus) {
-                       /* Banks >64 can't be represented */
-                       if (bank >= 64)
-                               continue;
-
-                       /* Non-ex hypercalls can only address first 64 vCPUs */
-                       if (!ex && bank)
-                               continue;
-
-                       if (ex) {
-                               /*
-                                * Check is the bank of this vCPU is in sparse
-                                * set and get the sparse bank number.
-                                */
-                               sbank = get_sparse_bank_no(valid_bank_mask,
-                                                          bank);
-
-                               if (sbank < 0)
-                                       continue;
-                       }
-
-                       if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64)))
-                               continue;
-               }
+       cpumask_clear(&hv_vcpu->tlb_flush);
 
-               /*
-                * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we
-                * can't analyze it here, flush TLB regardless of the specified
-                * address space.
-                */
-               __set_bit(i, vcpu_bitmap);
-       }
+       vcpu_mask = all_cpus ? NULL :
+               sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
+                                       vp_bitmap, vcpu_bitmap);
 
+       /*
+        * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
+        * analyze it here, flush TLB regardless of the specified address space.
+        */
        kvm_make_vcpus_request_mask(kvm,
                                    KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
-                                   vcpu_bitmap, &hv_current->tlb_lush);
+                                   vcpu_mask, &hv_vcpu->tlb_flush);
 
 ret_success:
        /* We always do full TLB flush, set rep_done = rep_cnt. */
@@ -1370,6 +1407,99 @@ ret_success:
                ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
 }
 
+static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
+                                unsigned long *vcpu_bitmap)
+{
+       struct kvm_lapic_irq irq = {
+               .delivery_mode = APIC_DM_FIXED,
+               .vector = vector
+       };
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
+                       continue;
+
+               /* We fail only when APIC is disabled */
+               kvm_apic_set_irq(vcpu, &irq, NULL);
+       }
+}
+
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
+                          bool ex, bool fast)
+{
+       struct kvm *kvm = current_vcpu->kvm;
+       struct hv_send_ipi_ex send_ipi_ex;
+       struct hv_send_ipi send_ipi;
+       u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+       DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+       unsigned long *vcpu_mask;
+       unsigned long valid_bank_mask;
+       u64 sparse_banks[64];
+       int sparse_banks_len;
+       u32 vector;
+       bool all_cpus;
+
+       if (!ex) {
+               if (!fast) {
+                       if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi,
+                                                   sizeof(send_ipi))))
+                               return HV_STATUS_INVALID_HYPERCALL_INPUT;
+                       sparse_banks[0] = send_ipi.cpu_mask;
+                       vector = send_ipi.vector;
+               } else {
+                       /* 'reserved' part of hv_send_ipi should be 0 */
+                       if (unlikely(ingpa >> 32 != 0))
+                               return HV_STATUS_INVALID_HYPERCALL_INPUT;
+                       sparse_banks[0] = outgpa;
+                       vector = (u32)ingpa;
+               }
+               all_cpus = false;
+               valid_bank_mask = BIT_ULL(0);
+
+               trace_kvm_hv_send_ipi(vector, sparse_banks[0]);
+       } else {
+               if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex,
+                                           sizeof(send_ipi_ex))))
+                       return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+               trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
+                                        send_ipi_ex.vp_set.format,
+                                        send_ipi_ex.vp_set.valid_bank_mask);
+
+               vector = send_ipi_ex.vector;
+               valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
+               sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
+                       sizeof(sparse_banks[0]);
+
+               all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+
+               if (!sparse_banks_len)
+                       goto ret_success;
+
+               if (!all_cpus &&
+                   kvm_read_guest(kvm,
+                                  ingpa + offsetof(struct hv_send_ipi_ex,
+                                                   vp_set.bank_contents),
+                                  sparse_banks,
+                                  sparse_banks_len))
+                       return HV_STATUS_INVALID_HYPERCALL_INPUT;
+       }
+
+       if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
+               return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+       vcpu_mask = all_cpus ? NULL :
+               sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
+                                       vp_bitmap, vcpu_bitmap);
+
+       kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
+
+ret_success:
+       return HV_STATUS_SUCCESS;
+}
+
 bool kvm_hv_hypercall_enabled(struct kvm *kvm)
 {
        return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE;
@@ -1539,6 +1669,20 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
                }
                ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
                break;
+       case HVCALL_SEND_IPI:
+               if (unlikely(rep)) {
+                       ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+                       break;
+               }
+               ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, false, fast);
+               break;
+       case HVCALL_SEND_IPI_EX:
+               if (unlikely(fast || rep)) {
+                       ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+                       break;
+               }
+               ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false);
+               break;
        default:
                ret = HV_STATUS_INVALID_HYPERCALL_CODE;
                break;
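
The hyperv.c hunks above replace the old per-vCPU bank walk with a single expansion of the guest-supplied sparse VP set into a flat bitmap, which both the TLB-flush path and the new HVCALL_SEND_IPI{,_EX} handlers then turn into a vcpu mask. As a rough illustration only, the standalone userspace sketch below performs just that expansion step: bank k of the flat mask is filled from the next sparse_banks[] entry whenever bit k of valid_bank_mask is set. The kernel helper additionally remaps VP indexes to vcpu indexes when the two differ; that part is omitted here, and all names in the sketch are made up rather than taken from the kernel.

#include <stdint.h>
#include <stdio.h>

#define MAX_BANKS 64

/* Expand a sparse VP set (valid_bank_mask + packed banks) into a flat mask. */
static void sparse_set_to_flat(uint64_t valid_bank_mask,
                               const uint64_t *sparse_banks,
                               uint64_t *flat /* MAX_BANKS entries */)
{
        int bank, sbank = 0;

        for (bank = 0; bank < MAX_BANKS; bank++) {
                if (valid_bank_mask & (1ULL << bank))
                        flat[bank] = sparse_banks[sbank++];
                else
                        flat[bank] = 0;
        }
}

int main(void)
{
        /* Banks 0 and 2 are valid: VPs 1 and 3 sit in bank 0, VP 130 in bank 2. */
        uint64_t sparse[] = { 0xaULL, 1ULL << (130 % 64) };
        uint64_t flat[MAX_BANKS];
        int bank;

        sparse_set_to_flat(0x5ULL, sparse, flat);
        for (bank = 0; bank < MAX_BANKS; bank++)
                if (flat[bank])
                        printf("bank %d: %#llx\n", bank,
                               (unsigned long long)flat[bank]);
        return 0;
}
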
index d6aa969..0e66c12 100644 (file)
@@ -62,6 +62,10 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu);
 void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
 
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
+bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
+                           struct hv_vp_assist_page *assist_page);
+
 static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
                                                        int timer_index)
 {
index fbb0e6d..3cd227f 100644 (file)
 #define APIC_BROADCAST                 0xFF
 #define X2APIC_BROADCAST               0xFFFFFFFFul
 
+static bool lapic_timer_advance_adjust_done = false;
+#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
+/* step-by-step approximation to mitigate fluctuation */
+#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
+
 static inline int apic_test_vector(int vec, void *bitmap)
 {
        return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -955,14 +960,14 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
        map = rcu_dereference(kvm->arch.apic_map);
 
        ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
-       if (ret)
+       if (ret) {
+               *r = 0;
                for_each_set_bit(i, &bitmap, 16) {
                        if (!dst[i])
                                continue;
-                       if (*r < 0)
-                               *r = 0;
                        *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
                }
+       }
 
        rcu_read_unlock();
        return ret;
@@ -1472,7 +1477,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
 void wait_lapic_expire(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
-       u64 guest_tsc, tsc_deadline;
+       u64 guest_tsc, tsc_deadline, ns;
 
        if (!lapic_in_kernel(vcpu))
                return;
@@ -1492,6 +1497,24 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
        if (guest_tsc < tsc_deadline)
                __delay(min(tsc_deadline - guest_tsc,
                        nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
+
+       if (!lapic_timer_advance_adjust_done) {
+               /* too early */
+               if (guest_tsc < tsc_deadline) {
+                       ns = (tsc_deadline - guest_tsc) * 1000000ULL;
+                       do_div(ns, vcpu->arch.virtual_tsc_khz);
+                       lapic_timer_advance_ns -= min((unsigned int)ns,
+                               lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+               } else {
+               /* too late */
+                       ns = (guest_tsc - tsc_deadline) * 1000000ULL;
+                       do_div(ns, vcpu->arch.virtual_tsc_khz);
+                       lapic_timer_advance_ns += min((unsigned int)ns,
+                               lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+               }
+               if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
+                       lapic_timer_advance_adjust_done = true;
+       }
 }
 
 static void start_sw_tscdeadline(struct kvm_lapic *apic)
@@ -2621,17 +2644,25 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
        return 0;
 }
 
-int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
 {
        u64 addr = data & ~KVM_MSR_ENABLED;
+       struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
+       unsigned long new_len;
+
        if (!IS_ALIGNED(addr, 4))
                return 1;
 
        vcpu->arch.pv_eoi.msr_val = data;
        if (!pv_eoi_enabled(vcpu))
                return 0;
-       return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
-                                        addr, sizeof(u8));
+
+       if (addr == ghc->gpa && len <= ghc->len)
+               new_len = ghc->len;
+       else
+               new_len = len;
+
+       return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
 }
 
 void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
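
The first lapic.c hunk above makes lapic_timer_advance_ns self-tuning: after each timer delivery the error against the guest's deadline is measured and the advance is nudged toward it, capped at one eighth of its current value per step, until the residual error is small (the kernel does the stop check in TSC cycles). The sketch below is a userspace model of that stepping policy only; the constants mirror LAPIC_TIMER_ADVANCE_ADJUST_STEP/_DONE, but the function, its signature and the sample errors are invented for the example.

#include <stdint.h>
#include <stdio.h>

#define ADJUST_STEP 8          /* cap each correction at advance / 8 */
#define DONE_THRESHOLD 100     /* stop once |error| drops below this */

/* A negative error means the timer fired too early (advance too large). */
static unsigned int tune_advance(unsigned int advance, int64_t error,
                                 int *done)
{
        uint64_t delta = (uint64_t)(error < 0 ? -error : error);
        unsigned int step = advance / ADJUST_STEP;
        unsigned int corr = delta < step ? (unsigned int)delta : step;

        if (error < 0)
                advance -= corr;        /* too early: shrink the advance */
        else
                advance += corr;        /* too late: grow the advance */

        if (delta < DONE_THRESHOLD)
                *done = 1;
        return advance;
}

int main(void)
{
        unsigned int advance = 1000;    /* arbitrary starting value */
        int64_t samples[] = { -4000, -2500, 1500, -300, 80 };
        int done = 0;
        size_t i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]) && !done; i++) {
                advance = tune_advance(advance, samples[i], &done);
                printf("sample %zu: advance=%u done=%d\n", i, advance, done);
        }
        return 0;
}

Capping each correction keeps one noisy sample from swinging the advance wildly, which is what the new "step-by-step approximation to mitigate fluctuation" comment is getting at.
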
index ed0ed39..ff6ef9c 100644 (file)
@@ -120,7 +120,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
        return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
 }
 
-int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
 void kvm_lapic_init(void);
 void kvm_lapic_exit(void);
 
index e843ec4..cf5f572 100644 (file)
@@ -932,7 +932,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
                obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
                if (!obj)
-                       return -ENOMEM;
+                       return cache->nobjs >= min ? 0 : -ENOMEM;
                cache->objects[cache->nobjs++] = obj;
        }
        return 0;
@@ -960,7 +960,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
                page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
                if (!page)
-                       return -ENOMEM;
+                       return cache->nobjs >= min ? 0 : -ENOMEM;
                cache->objects[cache->nobjs++] = page;
        }
        return 0;
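
The two memory-cache top-up hunks above change the refill policy from all-or-nothing to best-effort: an allocation failure is only reported if fewer than min objects ended up in the cache. A minimal standalone model of that policy, with an artificial allocation budget standing in for GFP failures and invented names, is sketched below.

#include <stdio.h>

#define CACHE_CAP 8

/* Fill the cache if possible, but succeed whenever at least 'min' are held. */
static int topup(int *nobjs, int min, int budget)
{
        while (*nobjs < CACHE_CAP) {
                if (budget-- <= 0)                      /* allocation failed */
                        return *nobjs >= min ? 0 : -1;  /* -1 ~ -ENOMEM */
                (*nobjs)++;
        }
        return 0;
}

int main(void)
{
        int nobjs = 3;
        int ret;

        /* Only two more allocations succeed; 5 >= min of 4, so this is ok. */
        ret = topup(&nobjs, 4, 2);
        printf("topup -> %d, nobjs = %d\n", ret, nobjs);
        return 0;
}
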
@@ -1265,24 +1265,24 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
        mmu_free_pte_list_desc(desc);
 }
 
-static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
+static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
 {
        struct pte_list_desc *desc;
        struct pte_list_desc *prev_desc;
        int i;
 
        if (!rmap_head->val) {
-               printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
+               pr_err("%s: %p 0->BUG\n", __func__, spte);
                BUG();
        } else if (!(rmap_head->val & 1)) {
-               rmap_printk("pte_list_remove:  %p 1->0\n", spte);
+               rmap_printk("%s:  %p 1->0\n", __func__, spte);
                if ((u64 *)rmap_head->val != spte) {
-                       printk(KERN_ERR "pte_list_remove:  %p 1->BUG\n", spte);
+                       pr_err("%s:  %p 1->BUG\n", __func__, spte);
                        BUG();
                }
                rmap_head->val = 0;
        } else {
-               rmap_printk("pte_list_remove:  %p many->many\n", spte);
+               rmap_printk("%s:  %p many->many\n", __func__, spte);
                desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
                prev_desc = NULL;
                while (desc) {
@@ -1296,11 +1296,17 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
                        prev_desc = desc;
                        desc = desc->more;
                }
-               pr_err("pte_list_remove: %p many->many\n", spte);
+               pr_err("%s: %p many->many\n", __func__, spte);
                BUG();
        }
 }
 
+static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
+{
+       mmu_spte_clear_track_bits(sptep);
+       __pte_list_remove(sptep, rmap_head);
+}
+
 static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
                                           struct kvm_memory_slot *slot)
 {
@@ -1349,7 +1355,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
        sp = page_header(__pa(spte));
        gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
        rmap_head = gfn_to_rmap(kvm, gfn, sp);
-       pte_list_remove(spte, rmap_head);
+       __pte_list_remove(spte, rmap_head);
 }
 
 /*
@@ -1685,7 +1691,7 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
        while ((sptep = rmap_get_first(rmap_head, &iter))) {
                rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
 
-               drop_spte(kvm, sptep);
+               pte_list_remove(rmap_head, sptep);
                flush = true;
        }
 
@@ -1721,7 +1727,7 @@ restart:
                need_flush = 1;
 
                if (pte_write(*ptep)) {
-                       drop_spte(kvm, sptep);
+                       pte_list_remove(rmap_head, sptep);
                        goto restart;
                } else {
                        new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
@@ -1988,7 +1994,7 @@ static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
                                       u64 *parent_pte)
 {
-       pte_list_remove(parent_pte, &sp->parent_ptes);
+       __pte_list_remove(parent_pte, &sp->parent_ptes);
 }
 
 static void drop_parent_pte(struct kvm_mmu_page *sp,
@@ -2181,7 +2187,7 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                            struct list_head *invalid_list)
 {
        if (sp->role.cr4_pae != !!is_pae(vcpu)
-           || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
+           || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
                return false;
        }
@@ -2375,14 +2381,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
        int collisions = 0;
        LIST_HEAD(invalid_list);
 
-       role = vcpu->arch.mmu.base_role;
+       role = vcpu->arch.mmu->mmu_role.base;
        role.level = level;
        role.direct = direct;
        if (role.direct)
                role.cr4_pae = 0;
        role.access = access;
-       if (!vcpu->arch.mmu.direct_map
-           && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+       if (!vcpu->arch.mmu->direct_map
+           && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
                quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
                role.quadrant = quadrant;
@@ -2457,11 +2463,11 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
 {
        iterator->addr = addr;
        iterator->shadow_addr = root;
-       iterator->level = vcpu->arch.mmu.shadow_root_level;
+       iterator->level = vcpu->arch.mmu->shadow_root_level;
 
        if (iterator->level == PT64_ROOT_4LEVEL &&
-           vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
-           !vcpu->arch.mmu.direct_map)
+           vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
+           !vcpu->arch.mmu->direct_map)
                --iterator->level;
 
        if (iterator->level == PT32E_ROOT_LEVEL) {
@@ -2469,10 +2475,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
                 * prev_root is currently only used for 64-bit hosts. So only
                 * the active root_hpa is valid here.
                 */
-               BUG_ON(root != vcpu->arch.mmu.root_hpa);
+               BUG_ON(root != vcpu->arch.mmu->root_hpa);
 
                iterator->shadow_addr
-                       = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+                       = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
                iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
                --iterator->level;
                if (!iterator->shadow_addr)
@@ -2483,7 +2489,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
                             struct kvm_vcpu *vcpu, u64 addr)
 {
-       shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa,
+       shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
                                    addr);
 }
 
@@ -3095,7 +3101,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
        int emulate = 0;
        gfn_t pseudo_gfn;
 
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                return 0;
 
        for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -3301,7 +3307,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
        u64 spte = 0ull;
        uint retry_count = 0;
 
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                return false;
 
        if (!page_fault_can_be_fast(error_code))
@@ -3471,11 +3477,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 }
 
 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free)
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+                       ulong roots_to_free)
 {
        int i;
        LIST_HEAD(invalid_list);
-       struct kvm_mmu *mmu = &vcpu->arch.mmu;
        bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
 
        BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
@@ -3535,20 +3541,20 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
        struct kvm_mmu_page *sp;
        unsigned i;
 
-       if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
+       if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
                spin_lock(&vcpu->kvm->mmu_lock);
                if(make_mmu_pages_available(vcpu) < 0) {
                        spin_unlock(&vcpu->kvm->mmu_lock);
                        return -ENOSPC;
                }
                sp = kvm_mmu_get_page(vcpu, 0, 0,
-                               vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
+                               vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
                ++sp->root_count;
                spin_unlock(&vcpu->kvm->mmu_lock);
-               vcpu->arch.mmu.root_hpa = __pa(sp->spt);
-       } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
+               vcpu->arch.mmu->root_hpa = __pa(sp->spt);
+       } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
                for (i = 0; i < 4; ++i) {
-                       hpa_t root = vcpu->arch.mmu.pae_root[i];
+                       hpa_t root = vcpu->arch.mmu->pae_root[i];
 
                        MMU_WARN_ON(VALID_PAGE(root));
                        spin_lock(&vcpu->kvm->mmu_lock);
@@ -3561,9 +3567,9 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
                        root = __pa(sp->spt);
                        ++sp->root_count;
                        spin_unlock(&vcpu->kvm->mmu_lock);
-                       vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+                       vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
                }
-               vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+               vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
        } else
                BUG();
 
@@ -3577,7 +3583,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
        gfn_t root_gfn;
        int i;
 
-       root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
+       root_gfn = vcpu->arch.mmu->get_cr3(vcpu) >> PAGE_SHIFT;
 
        if (mmu_check_root(vcpu, root_gfn))
                return 1;
@@ -3586,8 +3592,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
         * Do we shadow a long mode page table? If so we need to
         * write-protect the guests page table root.
         */
-       if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
-               hpa_t root = vcpu->arch.mmu.root_hpa;
+       if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+               hpa_t root = vcpu->arch.mmu->root_hpa;
 
                MMU_WARN_ON(VALID_PAGE(root));
 
@@ -3597,11 +3603,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                        return -ENOSPC;
                }
                sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-                               vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
+                               vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
                root = __pa(sp->spt);
                ++sp->root_count;
                spin_unlock(&vcpu->kvm->mmu_lock);
-               vcpu->arch.mmu.root_hpa = root;
+               vcpu->arch.mmu->root_hpa = root;
                return 0;
        }
 
@@ -3611,17 +3617,17 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
         * the shadow page table may be a PAE or a long mode page table.
         */
        pm_mask = PT_PRESENT_MASK;
-       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
+       if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
                pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
        for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->arch.mmu.pae_root[i];
+               hpa_t root = vcpu->arch.mmu->pae_root[i];
 
                MMU_WARN_ON(VALID_PAGE(root));
-               if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
-                       pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
+               if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
+                       pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
                        if (!(pdptr & PT_PRESENT_MASK)) {
-                               vcpu->arch.mmu.pae_root[i] = 0;
+                               vcpu->arch.mmu->pae_root[i] = 0;
                                continue;
                        }
                        root_gfn = pdptr >> PAGE_SHIFT;
@@ -3639,16 +3645,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                ++sp->root_count;
                spin_unlock(&vcpu->kvm->mmu_lock);
 
-               vcpu->arch.mmu.pae_root[i] = root | pm_mask;
+               vcpu->arch.mmu->pae_root[i] = root | pm_mask;
        }
-       vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+       vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
 
        /*
         * If we shadow a 32 bit page table with a long mode page
         * table we enter this path.
         */
-       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
-               if (vcpu->arch.mmu.lm_root == NULL) {
+       if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
+               if (vcpu->arch.mmu->lm_root == NULL) {
                        /*
                         * The additional page necessary for this is only
                         * allocated on demand.
@@ -3660,12 +3666,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                        if (lm_root == NULL)
                                return 1;
 
-                       lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+                       lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
 
-                       vcpu->arch.mmu.lm_root = lm_root;
+                       vcpu->arch.mmu->lm_root = lm_root;
                }
 
-               vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+               vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
        }
 
        return 0;
@@ -3673,7 +3679,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 
 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 {
-       if (vcpu->arch.mmu.direct_map)
+       if (vcpu->arch.mmu->direct_map)
                return mmu_alloc_direct_roots(vcpu);
        else
                return mmu_alloc_shadow_roots(vcpu);
@@ -3684,17 +3690,16 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
        int i;
        struct kvm_mmu_page *sp;
 
-       if (vcpu->arch.mmu.direct_map)
+       if (vcpu->arch.mmu->direct_map)
                return;
 
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                return;
 
        vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
 
-       if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
-               hpa_t root = vcpu->arch.mmu.root_hpa;
-
+       if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+               hpa_t root = vcpu->arch.mmu->root_hpa;
                sp = page_header(root);
 
                /*
@@ -3725,7 +3730,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
        kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 
        for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->arch.mmu.pae_root[i];
+               hpa_t root = vcpu->arch.mmu->pae_root[i];
 
                if (root && VALID_PAGE(root)) {
                        root &= PT64_BASE_ADDR_MASK;
@@ -3799,7 +3804,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
        int root, leaf;
        bool reserved = false;
 
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                goto exit;
 
        walk_shadow_page_lockless_begin(vcpu);
@@ -3816,7 +3821,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
                if (!is_shadow_present_pte(spte))
                        break;
 
-               reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte,
+               reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
                                                    iterator.level);
        }
 
@@ -3895,7 +3900,7 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
        struct kvm_shadow_walk_iterator iterator;
        u64 spte;
 
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                return;
 
        walk_shadow_page_lockless_begin(vcpu);
@@ -3922,7 +3927,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
        if (r)
                return r;
 
-       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
 
        return nonpaging_map(vcpu, gva & PAGE_MASK,
@@ -3935,8 +3940,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 
        arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
        arch.gfn = gfn;
-       arch.direct_map = vcpu->arch.mmu.direct_map;
-       arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
+       arch.direct_map = vcpu->arch.mmu->direct_map;
+       arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
 
        return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
@@ -4042,7 +4047,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
        int write = error_code & PFERR_WRITE_MASK;
        bool map_writable;
 
-       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
        if (page_fault_handle_page_track(vcpu, error_code, gfn))
                return RET_PF_EMULATE;
@@ -4118,7 +4123,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 {
        uint i;
        struct kvm_mmu_root_info root;
-       struct kvm_mmu *mmu = &vcpu->arch.mmu;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
 
        root.cr3 = mmu->get_cr3(vcpu);
        root.hpa = mmu->root_hpa;
@@ -4141,7 +4146,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
                            union kvm_mmu_page_role new_role,
                            bool skip_tlb_flush)
 {
-       struct kvm_mmu *mmu = &vcpu->arch.mmu;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
 
        /*
         * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
@@ -4192,7 +4197,8 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
                              bool skip_tlb_flush)
 {
        if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
-               kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
+               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
+                                  KVM_MMU_ROOT_CURRENT);
 }
 
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
@@ -4210,7 +4216,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu)
 static void inject_page_fault(struct kvm_vcpu *vcpu,
                              struct x86_exception *fault)
 {
-       vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+       vcpu->arch.mmu->inject_page_fault(vcpu, fault);
 }
 
 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
@@ -4414,7 +4420,8 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
-       bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+       bool uses_nx = context->nx ||
+               context->mmu_role.base.smep_andnot_wp;
        struct rsvd_bits_validate *shadow_zero_check;
        int i;
 
@@ -4553,7 +4560,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
                         * SMAP:kernel-mode data accesses from user-mode
                         * mappings should fault. A fault is considered
                         * as a SMAP violation if all of the following
-                        * conditions are ture:
+                        * conditions are true:
                         *   - X86_CR4_SMAP is set in CR4
                         *   - A user page is accessed
                         *   - The access is not a fetch
@@ -4714,27 +4721,65 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
        paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
 }
 
-static union kvm_mmu_page_role
-kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu)
+static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
+{
+       union kvm_mmu_extended_role ext = {0};
+
+       ext.cr0_pg = !!is_paging(vcpu);
+       ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+       ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+       ext.cr4_pse = !!is_pse(vcpu);
+       ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
+       ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
+
+       ext.valid = 1;
+
+       return ext;
+}
+
+static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
+                                                  bool base_only)
+{
+       union kvm_mmu_role role = {0};
+
+       role.base.access = ACC_ALL;
+       role.base.nxe = !!is_nx(vcpu);
+       role.base.cr4_pae = !!is_pae(vcpu);
+       role.base.cr0_wp = is_write_protection(vcpu);
+       role.base.smm = is_smm(vcpu);
+       role.base.guest_mode = is_guest_mode(vcpu);
+
+       if (base_only)
+               return role;
+
+       role.ext = kvm_calc_mmu_role_ext(vcpu);
+
+       return role;
+}
+
+static union kvm_mmu_role
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
 {
-       union kvm_mmu_page_role role = {0};
+       union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
 
-       role.guest_mode = is_guest_mode(vcpu);
-       role.smm = is_smm(vcpu);
-       role.ad_disabled = (shadow_accessed_mask == 0);
-       role.level = kvm_x86_ops->get_tdp_level(vcpu);
-       role.direct = true;
-       role.access = ACC_ALL;
+       role.base.ad_disabled = (shadow_accessed_mask == 0);
+       role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
+       role.base.direct = true;
 
        return role;
 }
 
 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
-       struct kvm_mmu *context = &vcpu->arch.mmu;
+       struct kvm_mmu *context = vcpu->arch.mmu;
+       union kvm_mmu_role new_role =
+               kvm_calc_tdp_mmu_root_page_role(vcpu, false);
 
-       context->base_role.word = mmu_base_role_mask.word &
-                                 kvm_calc_tdp_mmu_root_page_role(vcpu).word;
+       new_role.base.word &= mmu_base_role_mask.word;
+       if (new_role.as_u64 == context->mmu_role.as_u64)
+               return;
+
+       context->mmu_role.as_u64 = new_role.as_u64;
        context->page_fault = tdp_page_fault;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = nonpaging_invlpg;
@@ -4774,36 +4819,36 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        reset_tdp_shadow_zero_bits_mask(vcpu, context);
 }
 
-static union kvm_mmu_page_role
-kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu)
-{
-       union kvm_mmu_page_role role = {0};
-       bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
-       bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
-
-       role.nxe = is_nx(vcpu);
-       role.cr4_pae = !!is_pae(vcpu);
-       role.cr0_wp  = is_write_protection(vcpu);
-       role.smep_andnot_wp = smep && !is_write_protection(vcpu);
-       role.smap_andnot_wp = smap && !is_write_protection(vcpu);
-       role.guest_mode = is_guest_mode(vcpu);
-       role.smm = is_smm(vcpu);
-       role.direct = !is_paging(vcpu);
-       role.access = ACC_ALL;
+static union kvm_mmu_role
+kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+{
+       union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
+
+       role.base.smep_andnot_wp = role.ext.cr4_smep &&
+               !is_write_protection(vcpu);
+       role.base.smap_andnot_wp = role.ext.cr4_smap &&
+               !is_write_protection(vcpu);
+       role.base.direct = !is_paging(vcpu);
 
        if (!is_long_mode(vcpu))
-               role.level = PT32E_ROOT_LEVEL;
+               role.base.level = PT32E_ROOT_LEVEL;
        else if (is_la57_mode(vcpu))
-               role.level = PT64_ROOT_5LEVEL;
+               role.base.level = PT64_ROOT_5LEVEL;
        else
-               role.level = PT64_ROOT_4LEVEL;
+               role.base.level = PT64_ROOT_4LEVEL;
 
        return role;
 }
 
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 {
-       struct kvm_mmu *context = &vcpu->arch.mmu;
+       struct kvm_mmu *context = vcpu->arch.mmu;
+       union kvm_mmu_role new_role =
+               kvm_calc_shadow_mmu_root_page_role(vcpu, false);
+
+       new_role.base.word &= mmu_base_role_mask.word;
+       if (new_role.as_u64 == context->mmu_role.as_u64)
+               return;
 
        if (!is_paging(vcpu))
                nonpaging_init_context(vcpu, context);
@@ -4814,22 +4859,28 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
        else
                paging32_init_context(vcpu, context);
 
-       context->base_role.word = mmu_base_role_mask.word &
-                                 kvm_calc_shadow_mmu_root_page_role(vcpu).word;
+       context->mmu_role.as_u64 = new_role.as_u64;
        reset_shadow_zero_bits_mask(vcpu, context);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
-static union kvm_mmu_page_role
-kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
+static union kvm_mmu_role
+kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
+                                  bool execonly)
 {
-       union kvm_mmu_page_role role = vcpu->arch.mmu.base_role;
+       union kvm_mmu_role role;
+
+       /* Base role is inherited from root_mmu */
+       role.base.word = vcpu->arch.root_mmu.mmu_role.base.word;
+       role.ext = kvm_calc_mmu_role_ext(vcpu);
+
+       role.base.level = PT64_ROOT_4LEVEL;
+       role.base.direct = false;
+       role.base.ad_disabled = !accessed_dirty;
+       role.base.guest_mode = true;
+       role.base.access = ACC_ALL;
 
-       role.level = PT64_ROOT_4LEVEL;
-       role.direct = false;
-       role.ad_disabled = !accessed_dirty;
-       role.guest_mode = true;
-       role.access = ACC_ALL;
+       role.ext.execonly = execonly;
 
        return role;
 }
@@ -4837,11 +4888,17 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
                             bool accessed_dirty, gpa_t new_eptp)
 {
-       struct kvm_mmu *context = &vcpu->arch.mmu;
-       union kvm_mmu_page_role root_page_role =
-               kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty);
+       struct kvm_mmu *context = vcpu->arch.mmu;
+       union kvm_mmu_role new_role =
+               kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
+                                                  execonly);
+
+       __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
+
+       new_role.base.word &= mmu_base_role_mask.word;
+       if (new_role.as_u64 == context->mmu_role.as_u64)
+               return;
 
-       __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false);
        context->shadow_root_level = PT64_ROOT_4LEVEL;
 
        context->nx = true;
@@ -4853,7 +4910,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
        context->update_pte = ept_update_pte;
        context->root_level = PT64_ROOT_4LEVEL;
        context->direct_map = false;
-       context->base_role.word = root_page_role.word & mmu_base_role_mask.word;
+       context->mmu_role.as_u64 = new_role.as_u64;
+
        update_permission_bitmask(vcpu, context, true);
        update_pkru_bitmask(vcpu, context, true);
        update_last_nonleaf_level(vcpu, context);
@@ -4864,7 +4922,7 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
 
 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
-       struct kvm_mmu *context = &vcpu->arch.mmu;
+       struct kvm_mmu *context = vcpu->arch.mmu;
 
        kvm_init_shadow_mmu(vcpu);
        context->set_cr3           = kvm_x86_ops->set_cr3;
@@ -4875,14 +4933,20 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
 
 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 {
+       union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
        struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 
+       new_role.base.word &= mmu_base_role_mask.word;
+       if (new_role.as_u64 == g_context->mmu_role.as_u64)
+               return;
+
+       g_context->mmu_role.as_u64 = new_role.as_u64;
        g_context->get_cr3           = get_cr3;
        g_context->get_pdptr         = kvm_pdptr_read;
        g_context->inject_page_fault = kvm_inject_page_fault;
 
        /*
-        * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
+        * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
         * L1's nested page tables (e.g. EPT12). The nested translation
         * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
         * L2's page tables as the first level of translation and L1's
@@ -4921,10 +4985,10 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
        if (reset_roots) {
                uint i;
 
-               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+               vcpu->arch.mmu->root_hpa = INVALID_PAGE;
 
                for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-                       vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+                       vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
        }
 
        if (mmu_is_nested(vcpu))
@@ -4939,10 +5003,14 @@ EXPORT_SYMBOL_GPL(kvm_init_mmu);
 static union kvm_mmu_page_role
 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
 {
+       union kvm_mmu_role role;
+
        if (tdp_enabled)
-               return kvm_calc_tdp_mmu_root_page_role(vcpu);
+               role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
        else
-               return kvm_calc_shadow_mmu_root_page_role(vcpu);
+               role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
+
+       return role.base;
 }
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -4972,8 +5040,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
-       kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
-       WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
+       WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
+       kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+       WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
@@ -4987,7 +5057,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
         }
 
        ++vcpu->kvm->stat.mmu_pte_updated;
-       vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
+       vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
 }
 
 static bool need_remote_flush(u64 old, u64 new)
@@ -5164,10 +5234,12 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
                local_flush = true;
                while (npte--) {
+                       u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
+
                        entry = *spte;
                        mmu_page_zap_pte(vcpu->kvm, sp, spte);
                        if (gentry &&
-                             !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+                             !((sp->role.word ^ base_role)
                              & mmu_base_role_mask.word) && rmap_can_add(vcpu))
                                mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
                        if (need_remote_flush(entry, *spte))
@@ -5185,7 +5257,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
        gpa_t gpa;
        int r;
 
-       if (vcpu->arch.mmu.direct_map)
+       if (vcpu->arch.mmu->direct_map)
                return 0;
 
        gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -5221,10 +5293,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
 {
        int r, emulation_type = 0;
        enum emulation_result er;
-       bool direct = vcpu->arch.mmu.direct_map;
+       bool direct = vcpu->arch.mmu->direct_map;
 
        /* With shadow page tables, fault_address contains a GVA or nGPA.  */
-       if (vcpu->arch.mmu.direct_map) {
+       if (vcpu->arch.mmu->direct_map) {
                vcpu->arch.gpa_available = true;
                vcpu->arch.gpa_val = cr2;
        }
@@ -5237,8 +5309,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
        }
 
        if (r == RET_PF_INVALID) {
-               r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
-                                             false);
+               r = vcpu->arch.mmu->page_fault(vcpu, cr2,
+                                              lower_32_bits(error_code),
+                                              false);
                WARN_ON(r == RET_PF_INVALID);
        }
 
@@ -5254,7 +5327,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
         * paging in both guests. If true, we simply unprotect the page
         * and resume the guest.
         */
-       if (vcpu->arch.mmu.direct_map &&
+       if (vcpu->arch.mmu->direct_map &&
            (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
                kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
                return 1;
@@ -5302,7 +5375,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
-       struct kvm_mmu *mmu = &vcpu->arch.mmu;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
        int i;
 
        /* INVLPG on a * non-canonical address is a NOP according to the SDM.  */
@@ -5333,7 +5406,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
 
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 {
-       struct kvm_mmu *mmu = &vcpu->arch.mmu;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
        bool tlb_flush = false;
        uint i;
 
@@ -5377,8 +5450,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
 
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
-       free_page((unsigned long)vcpu->arch.mmu.pae_root);
-       free_page((unsigned long)vcpu->arch.mmu.lm_root);
+       free_page((unsigned long)vcpu->arch.mmu->pae_root);
+       free_page((unsigned long)vcpu->arch.mmu->lm_root);
 }
 
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -5398,9 +5471,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
        if (!page)
                return -ENOMEM;
 
-       vcpu->arch.mmu.pae_root = page_address(page);
+       vcpu->arch.mmu->pae_root = page_address(page);
        for (i = 0; i < 4; ++i)
-               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+               vcpu->arch.mmu->pae_root[i] = INVALID_PAGE;
 
        return 0;
 }
@@ -5409,27 +5482,21 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
        uint i;
 
-       vcpu->arch.walk_mmu = &vcpu->arch.mmu;
-       vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-       vcpu->arch.mmu.translate_gpa = translate_gpa;
-       vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+       vcpu->arch.mmu = &vcpu->arch.root_mmu;
+       vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 
+       vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
+       vcpu->arch.root_mmu.translate_gpa = translate_gpa;
        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-               vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-
-       return alloc_mmu_pages(vcpu);
-}
+               vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
 
-void kvm_mmu_setup(struct kvm_vcpu *vcpu)
-{
-       MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
+       vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
+       for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+               vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
 
-       /*
-        * kvm_mmu_setup() is called only on vCPU initialization.  
-        * Therefore, no need to reset mmu roots as they are not yet
-        * initialized.
-        */
-       kvm_init_mmu(vcpu, false);
+       vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+       return alloc_mmu_pages(vcpu);
 }
 
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
@@ -5612,7 +5679,7 @@ restart:
                if (sp->role.direct &&
                        !kvm_is_reserved_pfn(pfn) &&
                        PageTransCompoundMap(pfn_to_page(pfn))) {
-                       drop_spte(kvm, sptep);
+                       pte_list_remove(rmap_head, sptep);
                        need_tlb_flush = 1;
                        goto restart;
                }
@@ -5869,6 +5936,16 @@ int kvm_mmu_module_init(void)
 {
        int ret = -ENOMEM;
 
+       /*
+        * MMU roles use union aliasing which is, generally speaking, an
+        * undefined behavior. However, we supposedly know how compilers behave
+        * and the current status quo is unlikely to change. Guardians below are
+        * supposed to let us know if the assumption becomes false.
+        */
+       BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
+       BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
+       BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
+
        kvm_mmu_reset_all_pte_masks();
 
        pte_list_desc_cache = kmem_cache_create("pte_list_desc",
@@ -5898,7 +5975,7 @@ out:
 }
 
 /*
- * Caculate mmu pages needed for kvm.
+ * Calculate mmu pages needed for kvm.
  */
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 {
index 1fab69c..c7b3331 100644 (file)
 #define PT32_ROOT_LEVEL 2
 #define PT32E_ROOT_LEVEL 3
 
-#define PT_PDPE_LEVEL 3
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
-
 static inline u64 rsvd_bits(int s, int e)
 {
        if (e < s)
@@ -80,7 +75,7 @@ static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
-       if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+       if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
                return 0;
 
        return kvm_mmu_load(vcpu);
@@ -102,9 +97,9 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
 
 static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu)
 {
-       if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
-               vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa |
-                                            kvm_get_active_pcid(vcpu));
+       if (VALID_PAGE(vcpu->arch.mmu->root_hpa))
+               vcpu->arch.mmu->set_cr3(vcpu, vcpu->arch.mmu->root_hpa |
+                                             kvm_get_active_pcid(vcpu));
 }
 
 /*
index 1272861..abac7e2 100644 (file)
@@ -59,19 +59,19 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
        int i;
        struct kvm_mmu_page *sp;
 
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                return;
 
-       if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
-               hpa_t root = vcpu->arch.mmu.root_hpa;
+       if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+               hpa_t root = vcpu->arch.mmu->root_hpa;
 
                sp = page_header(root);
-               __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
+               __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level);
                return;
        }
 
        for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->arch.mmu.pae_root[i];
+               hpa_t root = vcpu->arch.mmu->pae_root[i];
 
                if (root && VALID_PAGE(root)) {
                        root &= PT64_BASE_ADDR_MASK;
@@ -122,7 +122,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
        hpa =  pfn << PAGE_SHIFT;
        if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
                audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
-                            "ent %llxn", vcpu->arch.mmu.root_level, pfn,
+                            "ent %llxn", vcpu->arch.mmu->root_level, pfn,
                             hpa, *sptep);
 }
 
index 14ffd97..7cf2185 100644 (file)
@@ -158,14 +158,15 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu_page *sp, u64 *spte,
                                  u64 gpte)
 {
-       if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+       if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
                goto no_present;
 
        if (!FNAME(is_present_gpte)(gpte))
                goto no_present;
 
        /* if accessed bit is not supported prefetch non accessed gpte */
-       if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK))
+       if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
+           !(gpte & PT_GUEST_ACCESSED_MASK))
                goto no_present;
 
        return false;
@@ -480,7 +481,7 @@ error:
 static int FNAME(walk_addr)(struct guest_walker *walker,
                            struct kvm_vcpu *vcpu, gva_t addr, u32 access)
 {
-       return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
+       return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
                                        access);
 }
 
@@ -509,7 +510,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 
        gfn = gpte_to_gfn(gpte);
        pte_access = sp->role.access & FNAME(gpte_access)(gpte);
-       FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
+       FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
        pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
                        no_dirty_log && (pte_access & ACC_WRITE_MASK));
        if (is_error_pfn(pfn))
@@ -604,7 +605,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
        direct_access = gw->pte_access;
 
-       top_level = vcpu->arch.mmu.root_level;
+       top_level = vcpu->arch.mmu->root_level;
        if (top_level == PT32E_ROOT_LEVEL)
                top_level = PT32_ROOT_LEVEL;
        /*
@@ -616,7 +617,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
        if (FNAME(gpte_changed)(vcpu, gw, top_level))
                goto out_gpte_changed;
 
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                goto out_gpte_changed;
 
        for (shadow_walk_init(&it, vcpu, addr);
@@ -1004,7 +1005,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                gfn = gpte_to_gfn(gpte);
                pte_access = sp->role.access;
                pte_access &= FNAME(gpte_access)(gpte);
-               FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
+               FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
 
                if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
                      &nr_present))
index 61ccfb1..0e21ccc 100644 (file)
@@ -809,6 +809,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
            nested_svm_check_exception(svm, nr, has_error_code, error_code))
                return;
 
+       kvm_deliver_exception_payload(&svm->vcpu);
+
        if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
                unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 
@@ -2922,18 +2924,18 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
        WARN_ON(mmu_is_nested(vcpu));
        kvm_init_shadow_mmu(vcpu);
-       vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
-       vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
-       vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
-       vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
-       vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu);
-       reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
+       vcpu->arch.mmu->set_cr3           = nested_svm_set_tdp_cr3;
+       vcpu->arch.mmu->get_cr3           = nested_svm_get_tdp_cr3;
+       vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
+       vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
+       vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu);
+       reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 }
 
 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+       vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 }
 
 static int nested_svm_check_permissions(struct vcpu_svm *svm)
@@ -2969,16 +2971,13 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
        svm->vmcb->control.exit_info_1 = error_code;
 
        /*
-        * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
-        * The fix is to add the ancillary datum (CR2 or DR6) to structs
-        * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
-        * written only when inject_pending_event runs (DR6 would written here
-        * too).  This should be conditional on a new capability---if the
-        * capability is disabled, kvm_multiple_exception would write the
-        * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+        * EXITINFO2 is undefined for all exception intercepts other
+        * than #PF.
         */
        if (svm->vcpu.arch.exception.nested_apf)
                svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+       else if (svm->vcpu.arch.exception.has_payload)
+               svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
        else
                svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
 
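Note on the two svm.c hunks above: they are the SVM half of the new deferred exception payload scheme. svm_queue_exception() now calls kvm_deliver_exception_payload() before injecting, and an intercepted L2 #PF reports the queued payload (the faulting address) in EXITINFO2 rather than whatever happens to be sitting in vcpu->arch.cr2 at queue time. A rough sketch of the common helper, for orientation only (the real function lives in the x86.c part of the series, which is not in this excerpt, and its DR6 merge has extra RTM-polarity fixups that are elided here):

static void deliver_exception_payload_sketch(struct kvm_vcpu *vcpu)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception;

	if (!ex->has_payload)
		return;

	if (ex->nr == PF_VECTOR)
		vcpu->arch.cr2 = ex->payload;	/* payload is the faulting address */
	else if (ex->nr == DB_VECTOR)
		vcpu->arch.dr6 |= ex->payload;	/* simplified DR6 merge */

	ex->has_payload = false;
	ex->payload = 0;
}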
@@ -5642,26 +5641,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
                "mov %%r13, %c[r13](%[svm]) \n\t"
                "mov %%r14, %c[r14](%[svm]) \n\t"
                "mov %%r15, %c[r15](%[svm]) \n\t"
-#endif
                /*
                * Clear host registers marked as clobbered to prevent
                * speculative use.
                */
-               "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
-               "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
-               "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
-               "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
-               "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
-#ifdef CONFIG_X86_64
-               "xor %%r8, %%r8 \n\t"
-               "xor %%r9, %%r9 \n\t"
-               "xor %%r10, %%r10 \n\t"
-               "xor %%r11, %%r11 \n\t"
-               "xor %%r12, %%r12 \n\t"
-               "xor %%r13, %%r13 \n\t"
-               "xor %%r14, %%r14 \n\t"
-               "xor %%r15, %%r15 \n\t"
+               "xor %%r8d, %%r8d \n\t"
+               "xor %%r9d, %%r9d \n\t"
+               "xor %%r10d, %%r10d \n\t"
+               "xor %%r11d, %%r11d \n\t"
+               "xor %%r12d, %%r12d \n\t"
+               "xor %%r13d, %%r13d \n\t"
+               "xor %%r14d, %%r14d \n\t"
+               "xor %%r15d, %%r15d \n\t"
 #endif
+               "xor %%ebx, %%ebx \n\t"
+               "xor %%ecx, %%ecx \n\t"
+               "xor %%edx, %%edx \n\t"
+               "xor %%esi, %%esi \n\t"
+               "xor %%edi, %%edi \n\t"
                "pop %%" _ASM_BP
                :
                : [svm]"a"(svm),
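Note: the register-clearing block is reshuffled so that the r8-r15 clears share the existing CONFIG_X86_64 section with the register saves, the general-purpose clears move below the #endif, and every clear now uses the 32-bit register name. In 64-bit mode a 32-bit xor zero-extends into the full 64-bit register, so the same state is cleared with a shorter encoding for the legacy registers. A tiny standalone demo of that property (hypothetical, x86-64 only, not part of the patch):

#include <stdio.h>

int main(void)
{
	unsigned long val = 0xdeadbeefcafef00dUL;

	/* "%k0" names the 32-bit half of whichever register holds val;
	 * the 32-bit xor zero-extends, so the whole 64-bit value becomes 0. */
	asm volatile("xor %k0, %k0" : "+r"(val));
	printf("val = %#lx\n", val);	/* prints val = 0 */
	return 0;
}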
@@ -7040,6 +7037,13 @@ failed:
        return ret;
 }
 
+static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+                                  uint16_t *vmcs_version)
+{
+       /* Intel-only feature */
+       return -ENODEV;
+}
+
 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
@@ -7169,6 +7173,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .mem_enc_op = svm_mem_enc_op,
        .mem_enc_reg_region = svm_register_enc_region,
        .mem_enc_unreg_region = svm_unregister_enc_region,
+
+       .nested_enable_evmcs = nested_enable_evmcs,
 };
 
 static int __init svm_init(void)
index 0f99768..0659465 100644 (file)
@@ -1418,6 +1418,48 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex,
                  __entry->valid_bank_mask, __entry->format,
                  __entry->address_space, __entry->flags)
 );
+
+/*
+ * Tracepoints for kvm_hv_send_ipi.
+ */
+TRACE_EVENT(kvm_hv_send_ipi,
+       TP_PROTO(u32 vector, u64 processor_mask),
+       TP_ARGS(vector, processor_mask),
+
+       TP_STRUCT__entry(
+               __field(u32, vector)
+               __field(u64, processor_mask)
+       ),
+
+       TP_fast_assign(
+               __entry->vector = vector;
+               __entry->processor_mask = processor_mask;
+       ),
+
+       TP_printk("vector %x processor_mask 0x%llx",
+                 __entry->vector, __entry->processor_mask)
+);
+
+TRACE_EVENT(kvm_hv_send_ipi_ex,
+       TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask),
+       TP_ARGS(vector, format, valid_bank_mask),
+
+       TP_STRUCT__entry(
+               __field(u32, vector)
+               __field(u64, format)
+               __field(u64, valid_bank_mask)
+       ),
+
+       TP_fast_assign(
+               __entry->vector = vector;
+               __entry->format = format;
+               __entry->valid_bank_mask = valid_bank_mask;
+       ),
+
+       TP_printk("vector %x format %llx valid_bank_mask 0x%llx",
+                 __entry->vector, __entry->format,
+                 __entry->valid_bank_mask)
+);
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
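Note: the two new TRACE_EVENTs mirror the existing kvm_hv_flush_tlb pair. TRACE_EVENT(name, ...) generates a trace_name() helper with the TP_PROTO signature, so the Hyper-V hypercall handler fires them as ordinary calls, along the lines of (illustrative only; the real call sites are in the hyperv.c part of the series, not shown in this excerpt):

	trace_kvm_hv_send_ipi(vector, processor_mask);
	trace_kvm_hv_send_ipi_ex(vector, format, valid_bank_mask);

Once kvm is loaded, the events appear with the other KVM tracepoints under events/kvm/ in tracefs.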
index e665aa7..4555077 100644 (file)
@@ -20,6 +20,7 @@
 #include "mmu.h"
 #include "cpuid.h"
 #include "lapic.h"
+#include "hyperv.h"
 
 #include <linux/kvm_host.h>
 #include <linux/module.h>
@@ -61,7 +62,7 @@
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 #define __ex_clear(x, reg) \
-       ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
+       ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg)
 
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
@@ -107,9 +108,12 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
  * use VMX instructions.
  */
-static bool __read_mostly nested = 0;
+static bool __read_mostly nested = 1;
 module_param(nested, bool, S_IRUGO);
 
+static bool __read_mostly nested_early_check = 0;
+module_param(nested_early_check, bool, S_IRUGO);
+
 static u64 __read_mostly host_xss;
 
 static bool __read_mostly enable_pml = 1;
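Note: with this flip, kvm_intel exposes VMX to guests by default. The knob itself is unchanged, so loading the module with nested=0 restores the previous behaviour, and the current setting stays readable at /sys/module/kvm_intel/parameters/nested (S_IRUGO, i.e. read-only at runtime). The nested_early_check parameter introduced alongside it defaults to 0.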
@@ -131,7 +135,7 @@ static bool __read_mostly enable_preemption_timer = 1;
 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 #endif
 
-#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
 #define KVM_VM_CR0_ALWAYS_ON                           \
        (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |      \
@@ -187,6 +191,7 @@ static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 module_param(ple_window_max, uint, 0444);
 
 extern const ulong vmx_return;
+extern const ulong vmx_early_consistency_check_return;
 
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
@@ -827,14 +832,28 @@ struct nested_vmx {
         */
        struct vmcs12 *cached_shadow_vmcs12;
        /*
-        * Indicates if the shadow vmcs must be updated with the
-        * data hold by vmcs12
+        * Indicates if the shadow vmcs or enlightened vmcs must be updated
+        * with the data held by struct vmcs12.
         */
-       bool sync_shadow_vmcs;
+       bool need_vmcs12_sync;
        bool dirty_vmcs12;
 
+       /*
+        * vmcs02 has been initialized, i.e. state that is constant for
+        * vmcs02 has been written to the backing VMCS.  Initialization
+        * is delayed until L1 actually attempts to run a nested VM.
+        */
+       bool vmcs02_initialized;
+
        bool change_vmcs01_virtual_apic_mode;
 
+       /*
+        * Enlightened VMCS has been enabled. It does not mean that L1 has to
+        * use it. However, VMX features available to L1 will be limited based
+        * on what the enlightened VMCS supports.
+        */
+       bool enlightened_vmcs_enabled;
+
        /* L2 must run next, and mustn't decide to exit to L1. */
        bool nested_run_pending;
 
@@ -870,6 +889,10 @@ struct nested_vmx {
                /* in guest mode on SMM entry? */
                bool guest_mode;
        } smm;
+
+       gpa_t hv_evmcs_vmptr;
+       struct page *hv_evmcs_page;
+       struct hv_enlightened_vmcs *hv_evmcs;
 };
 
 #define POSTED_INTR_ON  0
@@ -1381,6 +1404,49 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs);
 
 #define KVM_EVMCS_VERSION 1
 
+/*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ *     POSTED_INTR_NV                  = 0x00000002,
+ *     GUEST_INTR_STATUS               = 0x00000810,
+ *     APIC_ACCESS_ADDR                = 0x00002014,
+ *     POSTED_INTR_DESC_ADDR           = 0x00002016,
+ *     EOI_EXIT_BITMAP0                = 0x0000201c,
+ *     EOI_EXIT_BITMAP1                = 0x0000201e,
+ *     EOI_EXIT_BITMAP2                = 0x00002020,
+ *     EOI_EXIT_BITMAP3                = 0x00002022,
+ *     GUEST_PML_INDEX                 = 0x00000812,
+ *     PML_ADDRESS                     = 0x0000200e,
+ *     VM_FUNCTION_CONTROL             = 0x00002018,
+ *     EPTP_LIST_ADDRESS               = 0x00002024,
+ *     VMREAD_BITMAP                   = 0x00002026,
+ *     VMWRITE_BITMAP                  = 0x00002028,
+ *
+ *     TSC_MULTIPLIER                  = 0x00002032,
+ *     PLE_GAP                         = 0x00004020,
+ *     PLE_WINDOW                      = 0x00004022,
+ *     VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
+ *      GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
+ *      HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
+ *
+ * Currently unsupported in KVM:
+ *     GUEST_IA32_RTIT_CTL             = 0x00002814,
+ */
+#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
+                                   PIN_BASED_VMX_PREEMPTION_TIMER)
+#define EVMCS1_UNSUPPORTED_2NDEXEC                                     \
+       (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |                         \
+        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |                      \
+        SECONDARY_EXEC_APIC_REGISTER_VIRT |                            \
+        SECONDARY_EXEC_ENABLE_PML |                                    \
+        SECONDARY_EXEC_ENABLE_VMFUNC |                                 \
+        SECONDARY_EXEC_SHADOW_VMCS |                                   \
+        SECONDARY_EXEC_TSC_SCALING |                                   \
+        SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
+
 #if IS_ENABLED(CONFIG_HYPERV)
 static bool __read_mostly enlightened_vmcs = true;
 module_param(enlightened_vmcs, bool, 0444);
@@ -1473,69 +1539,12 @@ static void evmcs_load(u64 phys_addr)
 
 static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
 {
-       /*
-        * Enlightened VMCSv1 doesn't support these:
-        *
-        *      POSTED_INTR_NV                  = 0x00000002,
-        *      GUEST_INTR_STATUS               = 0x00000810,
-        *      APIC_ACCESS_ADDR                = 0x00002014,
-        *      POSTED_INTR_DESC_ADDR           = 0x00002016,
-        *      EOI_EXIT_BITMAP0                = 0x0000201c,
-        *      EOI_EXIT_BITMAP1                = 0x0000201e,
-        *      EOI_EXIT_BITMAP2                = 0x00002020,
-        *      EOI_EXIT_BITMAP3                = 0x00002022,
-        */
-       vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
-       vmcs_conf->cpu_based_2nd_exec_ctrl &=
-               ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
-       vmcs_conf->cpu_based_2nd_exec_ctrl &=
-               ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-       vmcs_conf->cpu_based_2nd_exec_ctrl &=
-               ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
-
-       /*
-        *      GUEST_PML_INDEX                 = 0x00000812,
-        *      PML_ADDRESS                     = 0x0000200e,
-        */
-       vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
-
-       /*      VM_FUNCTION_CONTROL             = 0x00002018, */
-       vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
-
-       /*
-        *      EPTP_LIST_ADDRESS               = 0x00002024,
-        *      VMREAD_BITMAP                   = 0x00002026,
-        *      VMWRITE_BITMAP                  = 0x00002028,
-        */
-       vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
-
-       /*
-        *      TSC_MULTIPLIER                  = 0x00002032,
-        */
-       vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
-
-       /*
-        *      PLE_GAP                         = 0x00004020,
-        *      PLE_WINDOW                      = 0x00004022,
-        */
-       vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-
-       /*
-        *      VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
-        */
-       vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
+       vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
 
-       /*
-        *      GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
-        *      HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
-        */
-       vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
-       vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+       vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+       vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
 
-       /*
-        * Currently unsupported in KVM:
-        *      GUEST_IA32_RTIT_CTL             = 0x00002814,
-        */
 }
 
 /* check_ept_pointer() should be under protection of ept_pointer_lock. */
@@ -1560,26 +1569,27 @@ static void check_ept_pointer_match(struct kvm *kvm)
 
 static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
 {
-       int ret;
+       struct kvm_vcpu *vcpu;
+       int ret = -ENOTSUPP, i;
 
        spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
 
        if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
                check_ept_pointer_match(kvm);
 
-       if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
-               ret = -ENOTSUPP;
-               goto out;
-       }
-
        /*
         * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the
         * base of EPT PML4 table, strip off EPT configuration information.
         */
-       ret = hyperv_flush_guest_mapping(
-                       to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
+       if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+               kvm_for_each_vcpu(i, vcpu, kvm)
+                       ret |= hyperv_flush_guest_mapping(
+                               to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer & PAGE_MASK);
+       } else {
+               ret = hyperv_flush_guest_mapping(
+                               to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
+       }
 
-out:
        spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
        return ret;
 }
@@ -1595,6 +1605,35 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
 static inline void evmcs_touch_msr_bitmap(void) {}
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
+static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+                              uint16_t *vmcs_version)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       /* We don't support disabling the feature for simplicity. */
+       if (vmx->nested.enlightened_vmcs_enabled)
+               return 0;
+
+       vmx->nested.enlightened_vmcs_enabled = true;
+
+       /*
+        * vmcs_version represents the range of supported Enlightened VMCS
+        * versions: lower 8 bits is the minimal version, higher 8 bits is the
+        * maximum supported version. KVM supports versions from 1 to
+        * KVM_EVMCS_VERSION.
+        */
+       if (vmcs_version)
+               *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1;
+
+       vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
+       vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
+       vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+       vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
+       vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC;
+
+       return 0;
+}
+
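Note: per the comment, the value reported back through *vmcs_version packs the supported range into one 16-bit word; with KVM_EVMCS_VERSION == 1 that is (1 << 8) | 1 == 0x0101, i.e. versions 1 through 1. A minimal decode helper, purely for illustration (not part of the patch):

static inline void evmcs_version_range(uint16_t vmcs_version,
				       uint8_t *min, uint8_t *max)
{
	*min = vmcs_version & 0xff;	/* lowest supported eVMCS version */
	*max = vmcs_version >> 8;	/* highest supported eVMCS version */
}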
 static inline bool is_exception_n(u32 intr_info, u8 vector)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -1617,11 +1656,6 @@ static inline bool is_page_fault(u32 intr_info)
        return is_exception_n(intr_info, PF_VECTOR);
 }
 
-static inline bool is_no_device(u32 intr_info)
-{
-       return is_exception_n(intr_info, NM_VECTOR);
-}
-
 static inline bool is_invalid_opcode(u32 intr_info)
 {
        return is_exception_n(intr_info, UD_VECTOR);
@@ -1632,12 +1666,6 @@ static inline bool is_gp_fault(u32 intr_info)
        return is_exception_n(intr_info, GP_VECTOR);
 }
 
-static inline bool is_external_interrupt(u32 intr_info)
-{
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
-               == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
 static inline bool is_machine_check(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -2063,9 +2091,6 @@ static inline bool is_nmi(u32 intr_info)
 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                              u32 exit_intr_info,
                              unsigned long exit_qualification);
-static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
-                       struct vmcs12 *vmcs12,
-                       u32 reason, unsigned long qualification);
 
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
@@ -2077,7 +2102,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
        return -1;
 }
 
-static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
 {
     struct {
        u64 vpid : 16;
@@ -2086,22 +2111,20 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
     } operand = { vpid, 0, gva };
     bool error;
 
-    asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na)
-                 : CC_OUT(na) (error) : "a"(&operand), "c"(ext)
-                 : "memory");
+    asm volatile (__ex("invvpid %2, %1") CC_SET(na)
+                 : CC_OUT(na) (error) : "r"(ext), "m"(operand));
     BUG_ON(error);
 }
 
-static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
 {
        struct {
                u64 eptp, gpa;
        } operand = {eptp, gpa};
        bool error;
 
-       asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na)
-                     : CC_OUT(na) (error) : "a" (&operand), "c" (ext)
-                     : "memory");
+       asm volatile (__ex("invept %2, %1") CC_SET(na)
+                     : CC_OUT(na) (error) : "r"(ext), "m"(operand));
        BUG_ON(error);
 }
 
@@ -2120,9 +2143,8 @@ static void vmcs_clear(struct vmcs *vmcs)
        u64 phys_addr = __pa(vmcs);
        bool error;
 
-       asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na)
-                     : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
-                     : "memory");
+       asm volatile (__ex("vmclear %1") CC_SET(na)
+                     : CC_OUT(na) (error) : "m"(phys_addr));
        if (unlikely(error))
                printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
                       vmcs, phys_addr);
@@ -2145,9 +2167,8 @@ static void vmcs_load(struct vmcs *vmcs)
        if (static_branch_unlikely(&enable_evmcs))
                return evmcs_load(phys_addr);
 
-       asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na)
-                     : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
-                     : "memory");
+       asm volatile (__ex("vmptrld %1") CC_SET(na)
+                     : CC_OUT(na) (error) : "m"(phys_addr));
        if (unlikely(error))
                printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
                       vmcs, phys_addr);
@@ -2323,8 +2344,8 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
 {
        unsigned long value;
 
-       asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
-                     : "=a"(value) : "d"(field) : "cc");
+       asm volatile (__ex_clear("vmread %1, %0", "%k0")
+                     : "=r"(value) : "r"(field));
        return value;
 }
 
@@ -2375,8 +2396,8 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val
 {
        bool error;
 
-       asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na)
-                     : CC_OUT(na) (error) : "a"(value), "d"(field));
+       asm volatile (__ex("vmwrite %2, %1") CC_SET(na)
+                     : CC_OUT(na) (error) : "r"(field), "rm"(value));
        if (unlikely(error))
                vmwrite_error(field, value);
 }
@@ -2707,7 +2728,8 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
                u64 guest_val, u64 host_val)
 {
        vmcs_write64(guest_val_vmcs, guest_val);
-       vmcs_write64(host_val_vmcs, host_val);
+       if (host_val_vmcs != HOST_IA32_EFER)
+               vmcs_write64(host_val_vmcs, host_val);
        vm_entry_controls_setbit(vmx, entry);
        vm_exit_controls_setbit(vmx, exit);
 }
@@ -2805,8 +2827,6 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
                ignore_bits &= ~(u64)EFER_SCE;
 #endif
 
-       clear_atomic_switch_msr(vmx, MSR_EFER);
-
        /*
         * On EPT, we can't emulate NX, so we must switch EFER atomically.
         * On CPUs that support "load IA32_EFER", always switch EFER
@@ -2819,8 +2839,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
                if (guest_efer != host_efer)
                        add_atomic_switch_msr(vmx, MSR_EFER,
                                              guest_efer, host_efer, false);
+               else
+                       clear_atomic_switch_msr(vmx, MSR_EFER);
                return false;
        } else {
+               clear_atomic_switch_msr(vmx, MSR_EFER);
+
                guest_efer &= ~ignore_bits;
                guest_efer |= host_efer & ignore_bits;
 
@@ -3272,34 +3296,30 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        unsigned int nr = vcpu->arch.exception.nr;
+       bool has_payload = vcpu->arch.exception.has_payload;
+       unsigned long payload = vcpu->arch.exception.payload;
 
        if (nr == PF_VECTOR) {
                if (vcpu->arch.exception.nested_apf) {
                        *exit_qual = vcpu->arch.apf.nested_apf_token;
                        return 1;
                }
-               /*
-                * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
-                * The fix is to add the ancillary datum (CR2 or DR6) to structs
-                * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
-                * can be written only when inject_pending_event runs.  This should be
-                * conditional on a new capability---if the capability is disabled,
-                * kvm_multiple_exception would write the ancillary information to
-                * CR2 or DR6, for backwards ABI-compatibility.
-                */
                if (nested_vmx_is_page_fault_vmexit(vmcs12,
                                                    vcpu->arch.exception.error_code)) {
-                       *exit_qual = vcpu->arch.cr2;
-                       return 1;
-               }
-       } else {
-               if (vmcs12->exception_bitmap & (1u << nr)) {
-                       if (nr == DB_VECTOR)
-                               *exit_qual = vcpu->arch.dr6;
-                       else
-                               *exit_qual = 0;
+                       *exit_qual = has_payload ? payload : vcpu->arch.cr2;
                        return 1;
                }
+       } else if (vmcs12->exception_bitmap & (1u << nr)) {
+               if (nr == DB_VECTOR) {
+                       if (!has_payload) {
+                               payload = vcpu->arch.dr6;
+                               payload &= ~(DR6_FIXED_1 | DR6_BT);
+                               payload ^= DR6_RTM;
+                       }
+                       *exit_qual = payload;
+               } else
+                       *exit_qual = 0;
+               return 1;
        }
 
        return 0;
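Note: the #DB branch now prefers the queued exception payload when one is present; only when it falls back to vcpu->arch.dr6 does it strip DR6_FIXED_1 and DR6_BT and flip DR6_RTM. The reason (as I read the SDM's exit-qualification format for debug exceptions) is that DR6 carries always-one bits and an RTM flag with inverted polarity, neither of which belongs in the exit qualification L1 expects for an intercepted #DB.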
@@ -3326,6 +3346,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
        u32 error_code = vcpu->arch.exception.error_code;
        u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
+       kvm_deliver_exception_payload(vcpu);
+
        if (has_error_code) {
                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -4397,9 +4419,7 @@ static void kvm_cpu_vmxon(u64 addr)
        cr4_set_bits(X86_CR4_VMXE);
        intel_pt_handle_vmx(1);
 
-       asm volatile (ASM_VMX_VMXON_RAX
-                       : : "a"(&addr), "m"(addr)
-                       : "memory", "cc");
+       asm volatile ("vmxon %0" : : "m"(addr));
 }
 
 static int hardware_enable(void)
@@ -4468,7 +4488,7 @@ static void vmclear_local_loaded_vmcss(void)
  */
 static void kvm_cpu_vmxoff(void)
 {
-       asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+       asm volatile (__ex("vmxoff"));
 
        intel_pt_handle_vmx(0);
        cr4_clear_bits(X86_CR4_VMXE);
@@ -5112,9 +5132,10 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
                                bool invalidate_gpa)
 {
        if (enable_ept && (invalidate_gpa || !enable_vpid)) {
-               if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                        return;
-               ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
+               ept_sync_context(construct_eptp(vcpu,
+                                               vcpu->arch.mmu->root_hpa));
        } else {
                vpid_sync_context(vpid);
        }
@@ -5264,7 +5285,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long hw_cr0;
 
-       hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
+       hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
        if (enable_unrestricted_guest)
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
        else {
@@ -6339,6 +6360,9 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
                rdmsr(MSR_IA32_CR_PAT, low32, high32);
                vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
        }
+
+       if (cpu_has_load_ia32_efer)
+               vmcs_write64(HOST_IA32_EFER, host_efer);
 }
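Note: this pairs with the add_atomic_switch_msr_special() change further up. When the CPU supports the "load IA32_EFER" exit control, the host EFER value never changes at runtime, so it is written once here as constant host state and the atomic MSR-switch path no longer rewrites HOST_IA32_EFER on every update.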
 
 static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -6666,7 +6690,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
                vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
 
        if (enable_pml) {
-               ASSERT(vmx->pml_pg);
                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }
@@ -8067,35 +8090,39 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
 
 /*
  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
- * set the success or error code of an emulated VMX instruction, as specified
- * by Vol 2B, VMX Instruction Reference, "Conventions".
+ * set the success or error code of an emulated VMX instruction (as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
+ * instruction.
  */
-static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
 {
        vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
                            X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
 {
        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
                            X86_EFLAGS_SF | X86_EFLAGS_OF))
                        | X86_EFLAGS_CF);
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-                                       u32 vm_instruction_error)
+static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
+                               u32 vm_instruction_error)
 {
-       if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
-               /*
-                * failValid writes the error number to the current VMCS, which
-                * can't be done there isn't a current VMCS.
-                */
-               nested_vmx_failInvalid(vcpu);
-               return;
-       }
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       /*
+        * failValid writes the error number to the current VMCS, which
+        * can't be done if there isn't a current VMCS.
+        */
+       if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
+               return nested_vmx_failInvalid(vcpu);
+
        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
                            X86_EFLAGS_SF | X86_EFLAGS_OF))
@@ -8105,6 +8132,7 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
         * We don't need to force a shadow sync because
         * VM_INSTRUCTION_ERROR is not shadowed
         */
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
@@ -8292,6 +8320,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 
        vmx->nested.vpid02 = allocate_vpid();
 
+       vmx->nested.vmcs02_initialized = false;
        vmx->nested.vmxon = true;
        return 0;
 
@@ -8345,10 +8374,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                return 1;
        }
 
-       if (vmx->nested.vmxon) {
-               nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmx->nested.vmxon)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
 
        if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
                        != VMXON_NEEDED_FEATURES) {
@@ -8367,21 +8395,17 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
         * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
         * which replaces physical address width with 32
         */
-       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-               nested_vmx_failInvalid(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+               return nested_vmx_failInvalid(vcpu);
 
        page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-       if (is_error_page(page)) {
-               nested_vmx_failInvalid(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (is_error_page(page))
+               return nested_vmx_failInvalid(vcpu);
+
        if (*(u32 *)kmap(page) != VMCS12_REVISION) {
                kunmap(page);
                kvm_release_page_clean(page);
-               nested_vmx_failInvalid(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
+               return nested_vmx_failInvalid(vcpu);
        }
        kunmap(page);
        kvm_release_page_clean(page);
@@ -8391,8 +8415,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
        if (ret)
                return ret;
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 /*
@@ -8423,8 +8446,24 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 }
 
-static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
+static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!vmx->nested.hv_evmcs)
+               return;
+
+       kunmap(vmx->nested.hv_evmcs_page);
+       kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
+       vmx->nested.hv_evmcs_vmptr = -1ull;
+       vmx->nested.hv_evmcs_page = NULL;
+       vmx->nested.hv_evmcs = NULL;
+}
+
+static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
        if (vmx->nested.current_vmptr == -1ull)
                return;
 
@@ -8432,16 +8471,18 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
                /* copy to memory all shadowed fields in case
                   they were modified */
                copy_shadow_to_vmcs12(vmx);
-               vmx->nested.sync_shadow_vmcs = false;
+               vmx->nested.need_vmcs12_sync = false;
                vmx_disable_shadow_vmcs(vmx);
        }
        vmx->nested.posted_intr_nv = -1;
 
        /* Flush VMCS12 to guest memory */
-       kvm_vcpu_write_guest_page(&vmx->vcpu,
+       kvm_vcpu_write_guest_page(vcpu,
                                  vmx->nested.current_vmptr >> PAGE_SHIFT,
                                  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
 
+       kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
        vmx->nested.current_vmptr = -1ull;
 }
 
@@ -8449,8 +8490,10 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
  * just stops using VMX.
  */
-static void free_nested(struct vcpu_vmx *vmx)
+static void free_nested(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
        if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
                return;
 
@@ -8483,6 +8526,10 @@ static void free_nested(struct vcpu_vmx *vmx)
                vmx->nested.pi_desc = NULL;
        }
 
+       kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
+       nested_release_evmcs(vcpu);
+
        free_loaded_vmcs(&vmx->nested.vmcs02);
 }
 
@@ -8491,9 +8538,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
 {
        if (!nested_vmx_check_permission(vcpu))
                return 1;
-       free_nested(to_vmx(vcpu));
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       free_nested(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 /* Emulate the VMCLEAR instruction */
@@ -8509,25 +8555,28 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
        if (nested_vmx_get_vmptr(vcpu, &vmptr))
                return 1;
 
-       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-               nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMCLEAR_INVALID_ADDRESS);
 
-       if (vmptr == vmx->nested.vmxon_ptr) {
-               nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmptr == vmx->nested.vmxon_ptr)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMCLEAR_VMXON_POINTER);
 
-       if (vmptr == vmx->nested.current_vmptr)
-               nested_release_vmcs12(vmx);
+       if (vmx->nested.hv_evmcs_page) {
+               if (vmptr == vmx->nested.hv_evmcs_vmptr)
+                       nested_release_evmcs(vcpu);
+       } else {
+               if (vmptr == vmx->nested.current_vmptr)
+                       nested_release_vmcs12(vcpu);
 
-       kvm_vcpu_write_guest(vcpu,
-                       vmptr + offsetof(struct vmcs12, launch_state),
-                       &zero, sizeof(zero));
+               kvm_vcpu_write_guest(vcpu,
+                                    vmptr + offsetof(struct vmcs12,
+                                                     launch_state),
+                                    &zero, sizeof(zero));
+       }
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
@@ -8610,6 +8659,395 @@ static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
 
 }
 
+static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
+{
+       struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
+       struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+       vmcs12->hdr.revision_id = evmcs->revision_id;
+
+       /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
+       vmcs12->tpr_threshold = evmcs->tpr_threshold;
+       vmcs12->guest_rip = evmcs->guest_rip;
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
+               vmcs12->guest_rsp = evmcs->guest_rsp;
+               vmcs12->guest_rflags = evmcs->guest_rflags;
+               vmcs12->guest_interruptibility_info =
+                       evmcs->guest_interruptibility_info;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
+               vmcs12->cpu_based_vm_exec_control =
+                       evmcs->cpu_based_vm_exec_control;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
+               vmcs12->exception_bitmap = evmcs->exception_bitmap;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
+               vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
+               vmcs12->vm_entry_intr_info_field =
+                       evmcs->vm_entry_intr_info_field;
+               vmcs12->vm_entry_exception_error_code =
+                       evmcs->vm_entry_exception_error_code;
+               vmcs12->vm_entry_instruction_len =
+                       evmcs->vm_entry_instruction_len;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
+               vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
+               vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
+               vmcs12->host_cr0 = evmcs->host_cr0;
+               vmcs12->host_cr3 = evmcs->host_cr3;
+               vmcs12->host_cr4 = evmcs->host_cr4;
+               vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
+               vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
+               vmcs12->host_rip = evmcs->host_rip;
+               vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
+               vmcs12->host_es_selector = evmcs->host_es_selector;
+               vmcs12->host_cs_selector = evmcs->host_cs_selector;
+               vmcs12->host_ss_selector = evmcs->host_ss_selector;
+               vmcs12->host_ds_selector = evmcs->host_ds_selector;
+               vmcs12->host_fs_selector = evmcs->host_fs_selector;
+               vmcs12->host_gs_selector = evmcs->host_gs_selector;
+               vmcs12->host_tr_selector = evmcs->host_tr_selector;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
+               vmcs12->pin_based_vm_exec_control =
+                       evmcs->pin_based_vm_exec_control;
+               vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
+               vmcs12->secondary_vm_exec_control =
+                       evmcs->secondary_vm_exec_control;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
+               vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
+               vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
+               vmcs12->msr_bitmap = evmcs->msr_bitmap;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
+               vmcs12->guest_es_base = evmcs->guest_es_base;
+               vmcs12->guest_cs_base = evmcs->guest_cs_base;
+               vmcs12->guest_ss_base = evmcs->guest_ss_base;
+               vmcs12->guest_ds_base = evmcs->guest_ds_base;
+               vmcs12->guest_fs_base = evmcs->guest_fs_base;
+               vmcs12->guest_gs_base = evmcs->guest_gs_base;
+               vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
+               vmcs12->guest_tr_base = evmcs->guest_tr_base;
+               vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
+               vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
+               vmcs12->guest_es_limit = evmcs->guest_es_limit;
+               vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
+               vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
+               vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
+               vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
+               vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
+               vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
+               vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
+               vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
+               vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
+               vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
+               vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
+               vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
+               vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
+               vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
+               vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
+               vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
+               vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
+               vmcs12->guest_es_selector = evmcs->guest_es_selector;
+               vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
+               vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
+               vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
+               vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
+               vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
+               vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
+               vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
+               vmcs12->tsc_offset = evmcs->tsc_offset;
+               vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
+               vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
+               vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
+               vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
+               vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
+               vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
+               vmcs12->guest_cr0 = evmcs->guest_cr0;
+               vmcs12->guest_cr3 = evmcs->guest_cr3;
+               vmcs12->guest_cr4 = evmcs->guest_cr4;
+               vmcs12->guest_dr7 = evmcs->guest_dr7;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
+               vmcs12->host_fs_base = evmcs->host_fs_base;
+               vmcs12->host_gs_base = evmcs->host_gs_base;
+               vmcs12->host_tr_base = evmcs->host_tr_base;
+               vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
+               vmcs12->host_idtr_base = evmcs->host_idtr_base;
+               vmcs12->host_rsp = evmcs->host_rsp;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
+               vmcs12->ept_pointer = evmcs->ept_pointer;
+               vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
+       }
+
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
+               vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
+               vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
+               vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
+               vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
+               vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
+               vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
+               vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
+               vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
+               vmcs12->guest_pending_dbg_exceptions =
+                       evmcs->guest_pending_dbg_exceptions;
+               vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
+               vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
+               vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
+               vmcs12->guest_activity_state = evmcs->guest_activity_state;
+               vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
+       }
+
+       /*
+        * Not used?
+        * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
+        * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
+        * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
+        * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
+        * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
+        * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
+        * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
+        * vmcs12->page_fault_error_code_mask =
+        *              evmcs->page_fault_error_code_mask;
+        * vmcs12->page_fault_error_code_match =
+        *              evmcs->page_fault_error_code_match;
+        * vmcs12->cr3_target_count = evmcs->cr3_target_count;
+        * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
+        * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
+        * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
+        */
+
+       /*
+        * Read only fields:
+        * vmcs12->guest_physical_address = evmcs->guest_physical_address;
+        * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
+        * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
+        * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
+        * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
+        * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
+        * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
+        * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
+        * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
+        * vmcs12->exit_qualification = evmcs->exit_qualification;
+        * vmcs12->guest_linear_address = evmcs->guest_linear_address;
+        *
+        * Not present in struct vmcs12:
+        * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
+        * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
+        * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
+        * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
+        */
+
+       return 0;
+}
+
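Note: copy_enlightened_to_vmcs12() copies a field group only when its bit in hv_clean_fields is clear; a set bit is L1's statement that it has not touched that group since KVM last consumed the eVMCS, so the cached vmcs12 copy is still current. Stated as a predicate (hypothetical helper restating the checks above, not part of the patch):

static inline bool evmcs_group_dirty(const struct hv_enlightened_vmcs *evmcs,
				     u32 group_bit)
{
	/* A clean bit that is NOT set means L1 may have modified the group. */
	return !(evmcs->hv_clean_fields & group_bit);
}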
+static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
+{
+       struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
+       struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+       /*
+        * Should not be changed by KVM:
+        *
+        * evmcs->host_es_selector = vmcs12->host_es_selector;
+        * evmcs->host_cs_selector = vmcs12->host_cs_selector;
+        * evmcs->host_ss_selector = vmcs12->host_ss_selector;
+        * evmcs->host_ds_selector = vmcs12->host_ds_selector;
+        * evmcs->host_fs_selector = vmcs12->host_fs_selector;
+        * evmcs->host_gs_selector = vmcs12->host_gs_selector;
+        * evmcs->host_tr_selector = vmcs12->host_tr_selector;
+        * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
+        * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
+        * evmcs->host_cr0 = vmcs12->host_cr0;
+        * evmcs->host_cr3 = vmcs12->host_cr3;
+        * evmcs->host_cr4 = vmcs12->host_cr4;
+        * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
+        * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
+        * evmcs->host_rip = vmcs12->host_rip;
+        * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
+        * evmcs->host_fs_base = vmcs12->host_fs_base;
+        * evmcs->host_gs_base = vmcs12->host_gs_base;
+        * evmcs->host_tr_base = vmcs12->host_tr_base;
+        * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
+        * evmcs->host_idtr_base = vmcs12->host_idtr_base;
+        * evmcs->host_rsp = vmcs12->host_rsp;
+        * sync_vmcs12() doesn't read these:
+        * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
+        * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
+        * evmcs->msr_bitmap = vmcs12->msr_bitmap;
+        * evmcs->ept_pointer = vmcs12->ept_pointer;
+        * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
+        * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
+        * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
+        * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
+        * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
+        * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
+        * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
+        * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
+        * evmcs->tpr_threshold = vmcs12->tpr_threshold;
+        * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
+        * evmcs->exception_bitmap = vmcs12->exception_bitmap;
+        * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
+        * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
+        * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
+        * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
+        * evmcs->page_fault_error_code_mask =
+        *              vmcs12->page_fault_error_code_mask;
+        * evmcs->page_fault_error_code_match =
+        *              vmcs12->page_fault_error_code_match;
+        * evmcs->cr3_target_count = vmcs12->cr3_target_count;
+        * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
+        * evmcs->tsc_offset = vmcs12->tsc_offset;
+        * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
+        * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
+        * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
+        * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
+        * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
+        * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
+        * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
+        * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
+        *
+        * Not present in struct vmcs12:
+        * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
+        * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
+        * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
+        * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
+        */
+
+       evmcs->guest_es_selector = vmcs12->guest_es_selector;
+       evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
+       evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
+       evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
+       evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
+       evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
+       evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
+       evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
+
+       evmcs->guest_es_limit = vmcs12->guest_es_limit;
+       evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
+       evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
+       evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
+       evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
+       evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
+       evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
+       evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
+       evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
+       evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
+
+       evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
+       evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
+       evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
+       evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
+       evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
+       evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
+       evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
+       evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
+
+       evmcs->guest_es_base = vmcs12->guest_es_base;
+       evmcs->guest_cs_base = vmcs12->guest_cs_base;
+       evmcs->guest_ss_base = vmcs12->guest_ss_base;
+       evmcs->guest_ds_base = vmcs12->guest_ds_base;
+       evmcs->guest_fs_base = vmcs12->guest_fs_base;
+       evmcs->guest_gs_base = vmcs12->guest_gs_base;
+       evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
+       evmcs->guest_tr_base = vmcs12->guest_tr_base;
+       evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
+       evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
+
+       evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
+       evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
+
+       evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
+       evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
+       evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
+       evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
+
+       evmcs->guest_pending_dbg_exceptions =
+               vmcs12->guest_pending_dbg_exceptions;
+       evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
+       evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
+
+       evmcs->guest_activity_state = vmcs12->guest_activity_state;
+       evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
+
+       evmcs->guest_cr0 = vmcs12->guest_cr0;
+       evmcs->guest_cr3 = vmcs12->guest_cr3;
+       evmcs->guest_cr4 = vmcs12->guest_cr4;
+       evmcs->guest_dr7 = vmcs12->guest_dr7;
+
+       evmcs->guest_physical_address = vmcs12->guest_physical_address;
+
+       evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
+       evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
+       evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
+       evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
+       evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
+       evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
+       evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
+       evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
+
+       evmcs->exit_qualification = vmcs12->exit_qualification;
+
+       evmcs->guest_linear_address = vmcs12->guest_linear_address;
+       evmcs->guest_rsp = vmcs12->guest_rsp;
+       evmcs->guest_rflags = vmcs12->guest_rflags;
+
+       evmcs->guest_interruptibility_info =
+               vmcs12->guest_interruptibility_info;
+       evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
+       evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
+       evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
+       evmcs->vm_entry_exception_error_code =
+               vmcs12->vm_entry_exception_error_code;
+       evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
+
+       evmcs->guest_rip = vmcs12->guest_rip;
+
+       evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
+
+       return 0;
+}
+
 /*
  * Copy the writable VMCS shadow fields back to the VMCS12, in case
  * they have been modified by the L1 guest. Note that the "read-only"
@@ -8683,20 +9121,6 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
        vmcs_load(vmx->loaded_vmcs->vmcs);
 }
 
-/*
- * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
- * used before) all generate the same failure when it is missing.
- */
-static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       if (vmx->nested.current_vmptr == -1ull) {
-               nested_vmx_failInvalid(vcpu);
-               return 0;
-       }
-       return 1;
-}
-
 static int handle_vmread(struct kvm_vcpu *vcpu)
 {
        unsigned long field;
@@ -8709,8 +9133,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (!nested_vmx_check_vmcs12(vcpu))
-               return kvm_skip_emulated_instruction(vcpu);
+       if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
 
        if (!is_guest_mode(vcpu))
                vmcs12 = get_vmcs12(vcpu);
@@ -8719,20 +9143,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
                 * to shadowed-field sets the ALU flags for VMfailInvalid.
                 */
-               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
-                       nested_vmx_failInvalid(vcpu);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
+               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
+                       return nested_vmx_failInvalid(vcpu);
                vmcs12 = get_shadow_vmcs12(vcpu);
        }
 
        /* Decode instruction info and find the field to read */
        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
        /* Read the field, zero-extended to a u64 field_value */
-       if (vmcs12_read_any(vmcs12, field, &field_value) < 0) {
-               nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
        /*
         * Now copy part of this value to register or memory, as requested.
         * Note that the number of bits actually copied is 32 or 64 depending
@@ -8750,8 +9172,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                                            (is_long_mode(vcpu) ? 8 : 4), NULL);
        }
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
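
[Editor's note] A recurring change in the handle_vmread()/handle_vmwrite() hunks above is that the nested_vmx_failInvalid()/nested_vmx_failValid()/nested_vmx_succeed() helpers now return the result of kvm_skip_emulated_instruction() themselves, so callers can collapse the old two-statement "set status, then skip" pattern into a single return. The following is a minimal, self-contained sketch of that refactoring pattern only; the struct and function names are invented stand-ins, not the kernel's types.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the emulator state; not the kernel's structs. */
struct vcpu_model {
	unsigned long rip;
	int last_insn_len;
	const char *vmx_status;	/* models the VMX VMsucceed/VMfail* flags */
};

/* Models kvm_skip_emulated_instruction(): advance RIP, report "keep running". */
static int skip_emulated_instruction(struct vcpu_model *v)
{
	v->rip += v->last_insn_len;
	return 1;
}

/* New style: the helper records the status *and* returns the skip result. */
static int fail_valid(struct vcpu_model *v, const char *err)
{
	v->vmx_status = err;
	return skip_emulated_instruction(v);
}

static int succeed(struct vcpu_model *v)
{
	v->vmx_status = "VMsucceed";
	return skip_emulated_instruction(v);
}

/* A VMREAD-like handler can now bail out with a single statement. */
static int handle_vmread_model(struct vcpu_model *v, bool field_supported)
{
	if (!field_supported)
		return fail_valid(v, "VMfailValid: unsupported component");
	/* ... copy the field value to a register or memory ... */
	return succeed(v);
}

int main(void)
{
	struct vcpu_model v = { .rip = 0x1000, .last_insn_len = 3 };
	handle_vmread_model(&v, false);
	printf("status=%s rip=%#lx\n", v.vmx_status, v.rip);
	return 0;
}
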
 
 
@@ -8776,8 +9197,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (!nested_vmx_check_vmcs12(vcpu))
-               return kvm_skip_emulated_instruction(vcpu);
+       if (vmx->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
 
        if (vmx_instruction_info & (1u << 10))
                field_value = kvm_register_readl(vcpu,
@@ -8800,11 +9221,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
         * VMCS," then the "read-only" fields are actually read/write.
         */
        if (vmcs_field_readonly(field) &&
-           !nested_cpu_has_vmwrite_any_field(vcpu)) {
-               nested_vmx_failValid(vcpu,
+           !nested_cpu_has_vmwrite_any_field(vcpu))
+               return nested_vmx_failValid(vcpu,
                        VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
 
        if (!is_guest_mode(vcpu))
                vmcs12 = get_vmcs12(vcpu);
@@ -8813,18 +9232,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
                 * to shadowed-field sets the ALU flags for VMfailInvalid.
                 */
-               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
-                       nested_vmx_failInvalid(vcpu);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
+               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
+                       return nested_vmx_failInvalid(vcpu);
                vmcs12 = get_shadow_vmcs12(vcpu);
-
        }
 
-       if (vmcs12_write_any(vmcs12, field, field_value) < 0) {
-               nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmcs12_write_any(vmcs12, field, field_value) < 0)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 
        /*
         * Do not track vmcs12 dirty-state if in guest-mode
@@ -8846,8 +9261,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                }
        }
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
@@ -8858,7 +9272,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
                              SECONDARY_EXEC_SHADOW_VMCS);
                vmcs_write64(VMCS_LINK_POINTER,
                             __pa(vmx->vmcs01.shadow_vmcs));
-               vmx->nested.sync_shadow_vmcs = true;
+               vmx->nested.need_vmcs12_sync = true;
        }
        vmx->nested.dirty_vmcs12 = true;
 }
@@ -8875,36 +9289,37 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
        if (nested_vmx_get_vmptr(vcpu, &vmptr))
                return 1;
 
-       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-               nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMPTRLD_INVALID_ADDRESS);
 
-       if (vmptr == vmx->nested.vmxon_ptr) {
-               nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmptr == vmx->nested.vmxon_ptr)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMPTRLD_VMXON_POINTER);
+
+       /* Forbid normal VMPTRLD if Enlightened version was used */
+       if (vmx->nested.hv_evmcs)
+               return 1;
 
        if (vmx->nested.current_vmptr != vmptr) {
                struct vmcs12 *new_vmcs12;
                struct page *page;
                page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-               if (is_error_page(page)) {
-                       nested_vmx_failInvalid(vcpu);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
+               if (is_error_page(page))
+                       return nested_vmx_failInvalid(vcpu);
+
                new_vmcs12 = kmap(page);
                if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
                    (new_vmcs12->hdr.shadow_vmcs &&
                     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
                        kunmap(page);
                        kvm_release_page_clean(page);
-                       nested_vmx_failValid(vcpu,
+                       return nested_vmx_failValid(vcpu,
                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
-                       return kvm_skip_emulated_instruction(vcpu);
                }
 
-               nested_release_vmcs12(vmx);
+               nested_release_vmcs12(vcpu);
+
                /*
                 * Load VMCS12 from guest memory since it is not already
                 * cached.
@@ -8916,8 +9331,71 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
                set_current_vmptr(vmx, vmptr);
        }
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
+}
+
+/*
+ * This is the equivalent of the nested hypervisor executing the vmptrld
+ * instruction.
+ */
+static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
+                                                bool from_launch)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct hv_vp_assist_page assist_page;
+
+       if (likely(!vmx->nested.enlightened_vmcs_enabled))
+               return 1;
+
+       if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
+               return 1;
+
+       if (unlikely(!assist_page.enlighten_vmentry))
+               return 1;
+
+       if (unlikely(assist_page.current_nested_vmcs !=
+                    vmx->nested.hv_evmcs_vmptr)) {
+
+               if (!vmx->nested.hv_evmcs)
+                       vmx->nested.current_vmptr = -1ull;
+
+               nested_release_evmcs(vcpu);
+
+               vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
+                       vcpu, assist_page.current_nested_vmcs);
+
+               if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
+                       return 0;
+
+               vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
+
+               if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) {
+                       nested_release_evmcs(vcpu);
+                       return 0;
+               }
+
+               vmx->nested.dirty_vmcs12 = true;
+               /*
+                * As we only keep L2 state for one guest, the 'hv_clean_fields'
+                * mask can't be used when we switch between guests. Reset it
+                * here for simplicity.
+                */
+               vmx->nested.hv_evmcs->hv_clean_fields &=
+                       ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+               vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
+
+               /*
+                * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
+                * reloaded from the guest's memory (read-only fields, fields not
+                * present in struct hv_enlightened_vmcs, ...). Make sure there
+                * are no leftovers.
+                */
+               if (from_launch)
+                       memset(vmx->nested.cached_vmcs12, 0,
+                              sizeof(*vmx->nested.cached_vmcs12));
+
+       }
+       return 1;
 }
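
[Editor's note] The enlightened-VMPTRLD path above keys off the eVMCS "clean fields" bitmask: the guest sets a bit per field group that has not changed since the last VMLAUNCH/VMRESUME, and KVM may skip re-reading those groups. When a different eVMCS is mapped, the mask is deliberately cleared because any cached state belongs to another VMCS. Below is a simplified, self-contained model of that bookkeeping; the field-group names and bit values are invented for illustration.

#include <stdint.h>
#include <stdio.h>

/* Invented field-group bits; the real eVMCS defines many more. */
#define CLEAN_GUEST_GRP1	(1u << 0)
#define CLEAN_GUEST_GRP2	(1u << 1)
#define CLEAN_ALL		(CLEAN_GUEST_GRP1 | CLEAN_GUEST_GRP2)

struct evmcs_model {
	uint64_t gpa;		/* which guest page this eVMCS lives in */
	uint32_t clean_fields;	/* groups the guest promises are unchanged */
};

/* Copy only the groups whose clean bit is NOT set. */
static void sync_dirty_groups(const struct evmcs_model *e)
{
	if (!(e->clean_fields & CLEAN_GUEST_GRP1))
		printf("reload field group 1 from the eVMCS\n");
	if (!(e->clean_fields & CLEAN_GUEST_GRP2))
		printf("reload field group 2 from the eVMCS\n");
}

/* Switching to a different eVMCS invalidates any "clean" promises. */
static void switch_evmcs(struct evmcs_model *cur, uint64_t new_gpa)
{
	if (cur->gpa != new_gpa) {
		cur->gpa = new_gpa;
		cur->clean_fields &= ~CLEAN_ALL;	/* force a full reload */
	}
}

int main(void)
{
	struct evmcs_model e = { .gpa = 0x1000, .clean_fields = CLEAN_ALL };

	switch_evmcs(&e, 0x2000);	/* new eVMCS: nothing may be skipped */
	sync_dirty_groups(&e);
	return 0;
}
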
 
 /* Emulate the VMPTRST instruction */
@@ -8932,6 +9410,9 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
+       if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
+               return 1;
+
        if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
                return 1;
        /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
@@ -8940,8 +9421,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 /* Emulate the INVEPT instruction */
@@ -8971,11 +9451,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 
        types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
 
-       if (type >= 32 || !(types & (1 << type))) {
-               nested_vmx_failValid(vcpu,
+       if (type >= 32 || !(types & (1 << type)))
+               return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
 
        /* According to the Intel VMX instruction reference, the memory
         * operand is read even if it isn't needed (e.g., for type==global)
@@ -8997,14 +9475,20 @@ static int handle_invept(struct kvm_vcpu *vcpu)
        case VMX_EPT_EXTENT_CONTEXT:
                kvm_mmu_sync_roots(vcpu);
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-               nested_vmx_succeed(vcpu);
                break;
        default:
                BUG_ON(1);
                break;
        }
 
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
+}
+
+static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
 }
 
 static int handle_invvpid(struct kvm_vcpu *vcpu)
@@ -9018,6 +9502,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                u64 vpid;
                u64 gla;
        } operand;
+       u16 vpid02;
 
        if (!(vmx->nested.msrs.secondary_ctls_high &
              SECONDARY_EXEC_ENABLE_VPID) ||
@@ -9035,11 +9520,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
        types = (vmx->nested.msrs.vpid_caps &
                        VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
 
-       if (type >= 32 || !(types & (1 << type))) {
-               nested_vmx_failValid(vcpu,
+       if (type >= 32 || !(types & (1 << type)))
+               return nested_vmx_failValid(vcpu,
                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
 
        /* according to the intel vmx instruction reference, the memory
         * operand is read even if it isn't needed (e.g., for type==global)
@@ -9051,47 +9534,39 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
-       if (operand.vpid >> 16) {
-               nested_vmx_failValid(vcpu,
+       if (operand.vpid >> 16)
+               return nested_vmx_failValid(vcpu,
                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
 
+       vpid02 = nested_get_vpid02(vcpu);
        switch (type) {
        case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
                if (!operand.vpid ||
-                   is_noncanonical_address(operand.gla, vcpu)) {
-                       nested_vmx_failValid(vcpu,
+                   is_noncanonical_address(operand.gla, vcpu))
+                       return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
-               if (cpu_has_vmx_invvpid_individual_addr() &&
-                   vmx->nested.vpid02) {
+               if (cpu_has_vmx_invvpid_individual_addr()) {
                        __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
-                               vmx->nested.vpid02, operand.gla);
+                               vpid02, operand.gla);
                } else
-                       __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+                       __vmx_flush_tlb(vcpu, vpid02, false);
                break;
        case VMX_VPID_EXTENT_SINGLE_CONTEXT:
        case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
-               if (!operand.vpid) {
-                       nested_vmx_failValid(vcpu,
+               if (!operand.vpid)
+                       return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
-               __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+               __vmx_flush_tlb(vcpu, vpid02, false);
                break;
        case VMX_VPID_EXTENT_ALL_CONTEXT:
-               __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+               __vmx_flush_tlb(vcpu, vpid02, false);
                break;
        default:
                WARN_ON_ONCE(1);
                return kvm_skip_emulated_instruction(vcpu);
        }
 
-       nested_vmx_succeed(vcpu);
-
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
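
[Editor's note] handle_invvpid() above now validates the requested invalidation type against the advertised capability bits, rejects out-of-range operands with VMfailValid, and always flushes using the VPID that tags L2's TLB entries (vpid02 when one was allocated, otherwise falling back to L1's vpid via nested_get_vpid02()). The following is a rough standalone model of that control flow; the type numbers, capability bits, and helper names are invented, and the non-canonical-address check for the single-address type is omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented type numbers and capability bits, purely for the sketch. */
enum { INVVPID_ADDR = 0, INVVPID_SINGLE = 1, INVVPID_ALL = 2 };
#define CAP_ADDR	(1u << INVVPID_ADDR)
#define CAP_SINGLE	(1u << INVVPID_SINGLE)
#define CAP_ALL		(1u << INVVPID_ALL)

static void flush_tlb(uint16_t vpid)
{
	printf("flush TLB entries tagged with vpid %u\n", vpid);
}

static uint16_t get_vpid02(uint16_t vpid02, uint16_t vpid01)
{
	/* Prefer the dedicated L2 tag; fall back to L1's tag if none exists. */
	return vpid02 ? vpid02 : vpid01;
}

static const char *emulate_invvpid(uint32_t caps, unsigned type,
				   uint64_t operand_vpid,
				   uint16_t vpid01, uint16_t vpid02)
{
	if (type >= 32 || !(caps & (1u << type)))
		return "VMfailValid: invalid operand";
	if (operand_vpid >> 16)
		return "VMfailValid: vpid out of range";
	if (type != INVVPID_ALL && !operand_vpid)
		return "VMfailValid: vpid 0 not allowed for this type";

	flush_tlb(get_vpid02(vpid02, vpid01));
	return "VMsucceed";
}

int main(void)
{
	puts(emulate_invvpid(CAP_SINGLE | CAP_ALL, INVVPID_SINGLE, 5, 1, 7));
	puts(emulate_invvpid(CAP_SINGLE, INVVPID_ADDR, 5, 1, 0));
	return 0;
}
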
 
 static int handle_invpcid(struct kvm_vcpu *vcpu)
@@ -9162,11 +9637,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
                }
 
                for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
+                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
                            == operand.pcid)
                                roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 
-               kvm_mmu_free_roots(vcpu, roots_to_free);
+               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
                /*
                 * If neither the current cr3 nor any of the prev_roots use the
                 * given PCID, then nothing needs to be done here because a
@@ -9293,7 +9768,7 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
 
                kvm_mmu_unload(vcpu);
                mmu->ept_ad = accessed_dirty;
-               mmu->base_role.ad_disabled = !accessed_dirty;
+               mmu->mmu_role.base.ad_disabled = !accessed_dirty;
                vmcs12->ept_pointer = address;
                /*
                 * TODO: Check what's the correct approach in case
@@ -9652,9 +10127,6 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
                        return false;
                else if (is_page_fault(intr_info))
                        return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
-               else if (is_no_device(intr_info) &&
-                        !(vmcs12->guest_cr0 & X86_CR0_TS))
-                       return false;
                else if (is_debug(intr_info) &&
                         vcpu->guest_debug &
                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -10676,9 +11148,25 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                vmcs_write32(PLE_WINDOW, vmx->ple_window);
        }
 
-       if (vmx->nested.sync_shadow_vmcs) {
-               copy_vmcs12_to_shadow(vmx);
-               vmx->nested.sync_shadow_vmcs = false;
+       if (vmx->nested.need_vmcs12_sync) {
+               /*
+                * hv_evmcs may end up unmapped after migration (when L2 was
+                * running); map it here to make sure vmcs12 changes are
+                * properly reflected.
+                */
+               if (vmx->nested.enlightened_vmcs_enabled &&
+                   !vmx->nested.hv_evmcs)
+                       nested_vmx_handle_enlightened_vmptrld(vcpu, false);
+
+               if (vmx->nested.hv_evmcs) {
+                       copy_vmcs12_to_enlightened(vmx);
+                       /* All fields are clean */
+                       vmx->nested.hv_evmcs->hv_clean_fields |=
+                               HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+               } else {
+                       copy_vmcs12_to_shadow(vmx);
+               }
+               vmx->nested.need_vmcs12_sync = false;
        }
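
[Editor's note] The vmx_vcpu_run() hunk above replaces the old sync_shadow_vmcs flag with need_vmcs12_sync and picks the sync target at run time: if an enlightened VMCS is in use (re-mapping it first when migration left it unmapped), vmcs12 is copied there and every field group is marked clean; otherwise the shadow VMCS is used. A condensed model of that decision, with invented field names:

#include <stdbool.h>
#include <stdio.h>

struct nested_model {
	bool need_sync;		/* vmcs12 changed since the guest last saw it */
	bool evmcs_enabled;	/* L1 opted in to enlightened VMCS */
	bool evmcs_mapped;	/* the eVMCS page is currently mapped */
};

static void sync_vmcs12_to_guest(struct nested_model *n)
{
	if (!n->need_sync)
		return;

	if (n->evmcs_enabled && !n->evmcs_mapped)
		n->evmcs_mapped = true;	/* models re-mapping after migration */

	if (n->evmcs_mapped)
		printf("copy vmcs12 -> enlightened VMCS, mark all fields clean\n");
	else
		printf("copy vmcs12 -> shadow VMCS\n");

	n->need_sync = false;
}

int main(void)
{
	struct nested_model n = { .need_sync = true, .evmcs_enabled = true };

	sync_vmcs12_to_guest(&n);
	return 0;
}
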
 
        if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
@@ -10745,7 +11233,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
                "jmp 1f \n\t"
                "2: \n\t"
-               __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+               __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
                "1: \n\t"
                /* Reload cr2 if changed */
                "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
@@ -10777,9 +11265,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
                /* Enter guest mode */
                "jne 1f \n\t"
-               __ex(ASM_VMX_VMLAUNCH) "\n\t"
+               __ex("vmlaunch") "\n\t"
                "jmp 2f \n\t"
-               "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
+               "1: " __ex("vmresume") "\n\t"
                "2: "
                /* Save guest registers, load host registers, keep flags */
                "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
@@ -10801,6 +11289,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                "mov %%r13, %c[r13](%0) \n\t"
                "mov %%r14, %c[r14](%0) \n\t"
                "mov %%r15, %c[r15](%0) \n\t"
+               /*
+                * Clear host registers marked as clobbered to prevent
+                * speculative use.
+                */
                "xor %%r8d,  %%r8d \n\t"
                "xor %%r9d,  %%r9d \n\t"
                "xor %%r10d, %%r10d \n\t"
@@ -10958,6 +11450,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
        vmx->loaded_vmcs = vmcs;
        vmx_vcpu_load(vcpu, cpu);
        put_cpu();
+
+       vm_entry_controls_reset_shadow(vmx);
+       vm_exit_controls_reset_shadow(vmx);
+       vmx_segment_cache_clear(vmx);
 }
 
 /*
@@ -10966,12 +11462,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
  */
 static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       vcpu_load(vcpu);
-       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-       free_nested(vmx);
-       vcpu_put(vcpu);
+       vcpu_load(vcpu);
+       vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
+       free_nested(vcpu);
+       vcpu_put(vcpu);
 }
 
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
@@ -11334,28 +11828,28 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
        return get_vmcs12(vcpu)->ept_pointer;
 }
 
-static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
        WARN_ON(mmu_is_nested(vcpu));
-       if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
-               return 1;
 
+       vcpu->arch.mmu = &vcpu->arch.guest_mmu;
        kvm_init_shadow_ept_mmu(vcpu,
                        to_vmx(vcpu)->nested.msrs.ept_caps &
                        VMX_EPT_EXECUTE_ONLY_BIT,
                        nested_ept_ad_enabled(vcpu),
                        nested_ept_get_cr3(vcpu));
-       vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
-       vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
-       vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+       vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
+       vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
+       vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
+       vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
 
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
-       return 0;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+       vcpu->arch.mmu = &vcpu->arch.root_mmu;
+       vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 }
 
 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@ -11716,7 +12210,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
            !nested_exit_intr_ack_set(vcpu) ||
            (vmcs12->posted_intr_nv & 0xff00) ||
            (vmcs12->posted_intr_desc_addr & 0x3f) ||
-           (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr))))
+           (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
                return -EINVAL;
 
        /* tpr shadow is needed by all apicv features. */
@@ -11772,15 +12266,12 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
                                         struct vmcs12 *vmcs12)
 {
-       u64 address = vmcs12->pml_address;
-       int maxphyaddr = cpuid_maxphyaddr(vcpu);
+       if (!nested_cpu_has_pml(vmcs12))
+               return 0;
 
-       if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
-               if (!nested_cpu_has_ept(vmcs12) ||
-                   !IS_ALIGNED(address, 4096)  ||
-                   address >> maxphyaddr)
-                       return -EINVAL;
-       }
+       if (!nested_cpu_has_ept(vmcs12) ||
+           !page_address_valid(vcpu, vmcs12->pml_address))
+               return -EINVAL;
 
        return 0;
 }
@@ -11960,112 +12451,87 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
        return 0;
 }
 
-static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
+/*
+ * Returns true if KVM is able to configure the CPU to tag TLB entries
+ * populated by L2 differently than TLB entries populated
+ * by L1.
+ *
+ * If L1 uses EPT, then TLB entries are tagged with a different EPTP.
+ *
+ * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
+ * with a different VPID (L1 entries are tagged with vmx->vpid
+ * while L2 entries are tagged with vmx->nested.vpid02).
+ */
+static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
-       vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
-       vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
-       vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
-       vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
-       vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
-       vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
-       vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
-       vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
-       vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
-       vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
-       vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
-       vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
-       vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
-       vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
-       vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
-       vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
-       vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
-       vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
-       vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
-       vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
-       vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
-       vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
-       vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
-       vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
-       vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
-       vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
-       vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
-       vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
-       vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
-       vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
-
-       vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
-       vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
-               vmcs12->guest_pending_dbg_exceptions);
-       vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
-       vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+       return nested_cpu_has_ept(vmcs12) ||
+              (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
+}
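
[Editor's note] nested_has_guest_tlb_tag() above encodes the condition under which L1's and L2's TLB entries are distinguishable to the hardware, which in turn decides whether a flush is needed on every L1<->L2 transition. A tiny standalone model of that decision, with invented flag names:

#include <stdbool.h>
#include <stdio.h>

struct nested_cfg {
	bool l1_uses_ept;	/* L2 entries are tagged by a different EPTP */
	bool l1_uses_vpid;
	bool vpid02_allocated;	/* KVM gave L2 its own VPID tag */
};

/* True if hardware can tell L1's and L2's TLB entries apart. */
static bool guest_tlb_is_tagged(const struct nested_cfg *c)
{
	return c->l1_uses_ept || (c->l1_uses_vpid && c->vpid02_allocated);
}

int main(void)
{
	struct nested_cfg c = { .l1_uses_vpid = true, .vpid02_allocated = false };

	printf("flush on every L1<->L2 switch: %s\n",
	       guest_tlb_is_tagged(&c) ? "no" : "yes");
	return 0;
}
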
 
-       if (nested_cpu_has_xsaves(vmcs12))
-               vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
-       vmcs_write64(VMCS_LINK_POINTER, -1ull);
+static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+{
+       if (vmx->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
+               return vmcs12->guest_ia32_efer;
+       else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+               return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
+       else
+               return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
+}
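
[Editor's note] nested_vmx_calc_efer() above computes the EFER value L2 should start with: either the value L1 asked to load via VM_ENTRY_LOAD_IA32_EFER, or L1's current EFER with LMA/LME forced to match the IA-32e-mode entry control. A standalone sketch of the same calculation; the entry-control bit values here are placeholders, while EFER_LME/EFER_LMA use the architectural bit positions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EFER_LME	(1ull << 8)
#define EFER_LMA	(1ull << 10)

/* Placeholder entry-control bits for the sketch. */
#define ENTRY_LOAD_EFER		(1u << 0)
#define ENTRY_IA32E_MODE	(1u << 1)

static uint64_t calc_l2_efer(bool run_pending, uint32_t entry_ctls,
			     uint64_t vmcs12_guest_efer, uint64_t l1_efer)
{
	if (run_pending && (entry_ctls & ENTRY_LOAD_EFER))
		return vmcs12_guest_efer;
	if (entry_ctls & ENTRY_IA32E_MODE)
		return l1_efer | (EFER_LMA | EFER_LME);
	return l1_efer & ~(EFER_LMA | EFER_LME);
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)
	       calc_l2_efer(false, ENTRY_IA32E_MODE, 0, 0));
	return 0;
}
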
 
-       if (cpu_has_vmx_posted_intr())
-               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
+static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
+{
+       /*
+        * If vmcs02 hasn't been initialized, set the constant vmcs02 state
+        * according to L0's settings (vmcs12 is irrelevant here).  Host
+        * fields that come from L0 and are not constant, e.g. HOST_CR3,
+        * will be set as needed prior to VMLAUNCH/VMRESUME.
+        */
+       if (vmx->nested.vmcs02_initialized)
+               return;
+       vmx->nested.vmcs02_initialized = true;
 
        /*
-        * Whether page-faults are trapped is determined by a combination of
-        * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
-        * If enable_ept, L0 doesn't care about page faults and we should
-        * set all of these to L1's desires. However, if !enable_ept, L0 does
-        * care about (at least some) page faults, and because it is not easy
-        * (if at all possible?) to merge L0 and L1's desires, we simply ask
-        * to exit on each and every L2 page fault. This is done by setting
-        * MASK=MATCH=0 and (see below) EB.PF=1.
-        * Note that below we don't need special code to set EB.PF beyond the
-        * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
-        * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
-        * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+        * We don't care what the EPTP value is; we just need to guarantee
+        * it's valid so we don't get a false positive when doing early
+        * consistency checks.
         */
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
-               enable_ept ? vmcs12->page_fault_error_code_mask : 0);
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
-               enable_ept ? vmcs12->page_fault_error_code_match : 0);
+       if (enable_ept && nested_early_check)
+               vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
 
        /* All VMFUNCs are currently emulated through L0 vmexits.  */
        if (cpu_has_vmx_vmfunc())
                vmcs_write64(VM_FUNCTION_CONTROL, 0);
 
-       if (cpu_has_vmx_apicv()) {
-               vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
-               vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
-               vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
-               vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
-       }
+       if (cpu_has_vmx_posted_intr())
+               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
 
-       /*
-        * Set host-state according to L0's settings (vmcs12 is irrelevant here)
-        * Some constant fields are set here by vmx_set_constant_host_state().
-        * Other fields are different per CPU, and will be set later when
-        * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest()
-        * is called.
-        */
-       vmx_set_constant_host_state(vmx);
+       if (cpu_has_vmx_msr_bitmap())
+               vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+
+       if (enable_pml)
+               vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 
        /*
-        * Set the MSR load/store lists to match L0's settings.
+        * Set the MSR load/store lists to match L0's settings.  Only the
+        * addresses are constant (for vmcs02); the counts can change based
+        * on L2's behavior, e.g. switching to/from long mode.
         */
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
 
-       set_cr4_guest_host_mask(vmx);
+       vmx_set_constant_host_state(vmx);
+}
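
[Editor's note] prepare_vmcs02_constant_state() above writes the vmcs02 fields that never change for the life of the VMCS and guards itself with the new vmcs02_initialized flag so those writes happen exactly once. The pattern itself is a simple set-once guard; a generic sketch with a hypothetical state structure:

#include <stdbool.h>
#include <stdio.h>

struct vmcs02_model {
	bool initialized;
	unsigned long msr_bitmap_pa;	/* example of a constant field */
};

static void init_constant_state(struct vmcs02_model *v, unsigned long bitmap_pa)
{
	if (v->initialized)
		return;		/* constant fields are written exactly once */
	v->initialized = true;

	v->msr_bitmap_pa = bitmap_pa;
	/* ... other fields that depend only on L0's configuration ... */
}

int main(void)
{
	struct vmcs02_model v = { 0 };

	init_constant_state(&v, 0x5000);
	init_constant_state(&v, 0x6000);	/* second call is a no-op */
	printf("msr_bitmap_pa=%#lx\n", v.msr_bitmap_pa);
	return 0;
}
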
 
-       if (kvm_mpx_supported()) {
-               if (vmx->nested.nested_run_pending &&
-                       (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
-                       vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
-               else
-                       vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
-       }
+static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
+                                     struct vmcs12 *vmcs12)
+{
+       prepare_vmcs02_constant_state(vmx);
+
+       vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
        if (enable_vpid) {
                if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
@@ -12073,78 +12539,30 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                else
                        vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
        }
-
-       /*
-        * L1 may access the L2's PDPTR, so save them to construct vmcs12
-        */
-       if (enable_ept) {
-               vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
-               vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
-               vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
-               vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
-       }
-
-       if (cpu_has_vmx_msr_bitmap())
-               vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
 }
 
-/*
- * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
- * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
- * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
- * guest in a way that will both be appropriate to L1's requests, and our
- * needs. In addition to modifying the active vmcs (which is vmcs02), this
- * function also has additional necessary side-effects, like setting various
- * vcpu->arch fields.
- * Returns 0 on success, 1 on failure. Invalid state exit qualification code
- * is assigned to entry_failure_code on failure.
- */
-static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-                         u32 *entry_failure_code)
+static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exec_control, vmcs12_exec_ctrl;
+       u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
 
-       if (vmx->nested.dirty_vmcs12) {
-               prepare_vmcs02_full(vcpu, vmcs12);
-               vmx->nested.dirty_vmcs12 = false;
-       }
+       if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
+               prepare_vmcs02_early_full(vmx, vmcs12);
 
        /*
-        * First, the fields that are shadowed.  This must be kept in sync
-        * with vmx_shadow_fields.h.
+        * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
+        * entry, but only if the current (host) sp changed from the value
+        * we wrote last (vmx->host_rsp).  This cache is no longer relevant
+        * if we switch vmcs, and rather than hold a separate cache per vmcs,
+        * here we just force the write to happen on entry.  host_rsp will
+        * also be written unconditionally by nested_vmx_check_vmentry_hw()
+        * if we are doing early consistency checks via hardware.
         */
+       vmx->host_rsp = 0;
 
-       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
-       vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
-       vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
-       vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
-       vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
-
-       if (vmx->nested.nested_run_pending &&
-           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
-               kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
-               vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
-       } else {
-               kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
-               vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
-       }
-       if (vmx->nested.nested_run_pending) {
-               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                            vmcs12->vm_entry_intr_info_field);
-               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-                            vmcs12->vm_entry_exception_error_code);
-               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-                            vmcs12->vm_entry_instruction_len);
-               vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-                            vmcs12->guest_interruptibility_info);
-               vmx->loaded_vmcs->nmi_known_unmasked =
-                       !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
-       } else {
-               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
-       }
-       vmx_set_rflags(vcpu, vmcs12->guest_rflags);
-
+       /*
+        * PIN CONTROLS
+        */
        exec_control = vmcs12->pin_based_vm_exec_control;
 
        /* Preemption timer setting is computed directly in vmx_vcpu_run.  */
@@ -12159,13 +12577,43 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        } else {
                exec_control &= ~PIN_BASED_POSTED_INTR;
        }
-
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
 
-       vmx->nested.preemption_timer_expired = false;
-       if (nested_cpu_has_preemption_timer(vmcs12))
-               vmx_start_preemption_timer(vcpu);
+       /*
+        * EXEC CONTROLS
+        */
+       exec_control = vmx_exec_control(vmx); /* L0's desires */
+       exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+       exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+       exec_control &= ~CPU_BASED_TPR_SHADOW;
+       exec_control |= vmcs12->cpu_based_vm_exec_control;
+
+       /*
+        * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
+        * nested_get_vmcs12_pages can't fix it up, the illegal value
+        * will result in a VM entry failure.
+        */
+       if (exec_control & CPU_BASED_TPR_SHADOW) {
+               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+               vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+       } else {
+#ifdef CONFIG_X86_64
+               exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+                               CPU_BASED_CR8_STORE_EXITING;
+#endif
+       }
+
+       /*
+        * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
+        * for I/O port accesses.
+        */
+       exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+       exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
 
+       /*
+        * SECONDARY EXEC CONTROLS
+        */
        if (cpu_has_secondary_exec_ctrls()) {
                exec_control = vmx->secondary_exec_control;
 
@@ -12206,43 +12654,214 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        }
 
        /*
-        * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
-        * entry, but only if the current (host) sp changed from the value
-        * we wrote last (vmx->host_rsp). This cache is no longer relevant
-        * if we switch vmcs, and rather than hold a separate cache per vmcs,
-        * here we just force the write to happen on entry.
+        * ENTRY CONTROLS
+        *
+        * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
+        * are emulated by vmx_set_efer() in prepare_vmcs02(), but we speculate
+        * on the related bits (if supported by the CPU) in the hope that
+        * we can avoid VMWrites during vmx_set_efer().
+        */
+       exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) &
+                       ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
+       if (cpu_has_load_ia32_efer) {
+               if (guest_efer & EFER_LMA)
+                       exec_control |= VM_ENTRY_IA32E_MODE;
+               if (guest_efer != host_efer)
+                       exec_control |= VM_ENTRY_LOAD_IA32_EFER;
+       }
+       vm_entry_controls_init(vmx, exec_control);
+
+       /*
+        * EXIT CONTROLS
+        *
+        * L2->L1 exit controls are emulated - the hardware exit is to L0 so
+        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+        * bits may be modified by vmx_set_efer() in prepare_vmcs02().
         */
-       vmx->host_rsp = 0;
+       exec_control = vmcs_config.vmexit_ctrl;
+       if (cpu_has_load_ia32_efer && guest_efer != host_efer)
+               exec_control |= VM_EXIT_LOAD_IA32_EFER;
+       vm_exit_controls_init(vmx, exec_control);
 
-       exec_control = vmx_exec_control(vmx); /* L0's desires */
-       exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-       exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
-       exec_control &= ~CPU_BASED_TPR_SHADOW;
-       exec_control |= vmcs12->cpu_based_vm_exec_control;
+       /*
+        * Conceptually we want to copy the PML address and index from
+        * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
+        * since we always flush the log on each vmexit and never change
+        * the PML address (once set), this happens to be equivalent to
+        * simply resetting the index in vmcs02.
+        */
+       if (enable_pml)
+               vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 
        /*
-        * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
-        * nested_get_vmcs12_pages can't fix it up, the illegal value
-        * will result in a VM entry failure.
+        * Interrupt/Exception Fields
         */
-       if (exec_control & CPU_BASED_TPR_SHADOW) {
-               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
-               vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+       if (vmx->nested.nested_run_pending) {
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                            vmcs12->vm_entry_intr_info_field);
+               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+                            vmcs12->vm_entry_exception_error_code);
+               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+                            vmcs12->vm_entry_instruction_len);
+               vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+                            vmcs12->guest_interruptibility_info);
+               vmx->loaded_vmcs->nmi_known_unmasked =
+                       !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
        } else {
-#ifdef CONFIG_X86_64
-               exec_control |= CPU_BASED_CR8_LOAD_EXITING |
-                               CPU_BASED_CR8_STORE_EXITING;
-#endif
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+       }
+}
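
[Editor's note] prepare_vmcs02_early() above builds the primary execution controls by starting from L0's own requirements, dropping the bits KVM manages on L2's behalf (interrupt/NMI windows, TPR shadow), OR-ing in vmcs12's requests, and then forcing unconditional I/O exiting. A compact model of that merge; the bit positions below are placeholders for the sketch, not authoritative VMX encodings:

#include <stdint.h>
#include <stdio.h>

#define CTL_INTR_WINDOW		(1u << 2)
#define CTL_TPR_SHADOW		(1u << 21)
#define CTL_UNCOND_IO		(1u << 24)
#define CTL_USE_IO_BITMAP	(1u << 25)

static uint32_t merge_exec_controls(uint32_t l0_wants, uint32_t vmcs12_wants)
{
	uint32_t exec = l0_wants;

	/* These bits are managed by L0 on behalf of L2, so drop L0's copies... */
	exec &= ~(CTL_INTR_WINDOW | CTL_TPR_SHADOW);
	/* ...and take whatever L1 asked for. */
	exec |= vmcs12_wants;

	/* I/O accesses must always exit (to L1 or to L0 userspace). */
	exec &= ~CTL_USE_IO_BITMAP;
	exec |= CTL_UNCOND_IO;
	return exec;
}

int main(void)
{
	printf("%#x\n", merge_exec_controls(CTL_TPR_SHADOW | CTL_USE_IO_BITMAP,
					    CTL_INTR_WINDOW));
	return 0;
}
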
+
+static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+{
+       struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+
+       if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+                          HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+               vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+               vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+               vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+               vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+               vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+               vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+               vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+               vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+               vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+               vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+               vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+               vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+               vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+               vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+               vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+               vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+               vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+               vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+               vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+               vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+               vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+               vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+               vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+               vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+               vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+               vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+               vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+               vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+               vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+               vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+               vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+               vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+               vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+               vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+       }
+
+       if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+                          HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
+               vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+               vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+                           vmcs12->guest_pending_dbg_exceptions);
+               vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+               vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+               /*
+                * L1 may access L2's PDPTRs, so save them to construct
+                * vmcs12.
+                */
+               if (enable_ept) {
+                       vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+                       vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+                       vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+                       vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+               }
+       }
+
+       if (nested_cpu_has_xsaves(vmcs12))
+               vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
+
+       /*
+        * Whether page-faults are trapped is determined by a combination of
+        * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+        * If enable_ept, L0 doesn't care about page faults and we should
+        * set all of these to L1's desires. However, if !enable_ept, L0 does
+        * care about (at least some) page faults, and because it is not easy
+        * (if at all possible?) to merge L0 and L1's desires, we simply ask
+        * to exit on each and every L2 page fault. This is done by setting
+        * MASK=MATCH=0 and (see below) EB.PF=1.
+        * Note that below we don't need special code to set EB.PF beyond the
+        * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+        * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+        * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+        */
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+               enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+               enable_ept ? vmcs12->page_fault_error_code_match : 0);
+
+       if (cpu_has_vmx_apicv()) {
+               vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
+               vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
+               vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
+               vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
+       }
+
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+
+       set_cr4_guest_host_mask(vmx);
+
+       if (kvm_mpx_supported()) {
+               if (vmx->nested.nested_run_pending &&
+                       (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+                       vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+               else
+                       vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+       }
+}
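
[Editor's note] prepare_vmcs02_full() above keeps the long-standing page-fault trick described in its comment: with EPT disabled, L0 cannot merge its own PFEC mask/match settings with L1's, so it sets both to zero and relies on EB.PF being 1, which makes every L2 page fault exit. A small model of the VMX page-fault filtering rule as I understand it from the SDM (if (PFEC & mask) == match, bit PF of the exception bitmap decides whether to exit; otherwise its meaning is inverted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool pf_causes_vmexit(uint32_t pfec, uint32_t mask, uint32_t match,
			     bool eb_pf)
{
	bool matched = (pfec & mask) == match;

	return matched ? eb_pf : !eb_pf;
}

int main(void)
{
	/* MASK = MATCH = 0 and EB.PF = 1: every page fault exits. */
	for (uint32_t pfec = 0; pfec < 8; pfec++)
		printf("pfec=%u -> exit=%d\n", pfec,
		       pf_causes_vmexit(pfec, 0, 0, true));
	return 0;
}
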
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+                         u32 *entry_failure_code)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+
+       if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) {
+               prepare_vmcs02_full(vmx, vmcs12);
+               vmx->nested.dirty_vmcs12 = false;
        }
 
        /*
-        * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
-        * for I/O port accesses.
+        * First, the fields that are shadowed.  This must be kept in sync
+        * with vmx_shadow_fields.h.
         */
-       exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
-       exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+       if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+                          HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+               vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+               vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+       }
 
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+       if (vmx->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+               kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+               vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+       } else {
+               kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+               vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
+       }
+       vmx_set_rflags(vcpu, vmcs12->guest_rflags);
+
+       vmx->nested.preemption_timer_expired = false;
+       if (nested_cpu_has_preemption_timer(vmcs12))
+               vmx_start_preemption_timer(vcpu);
 
        /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
         * bitwise-or of what L1 wants to trap for L2, and what we want to
@@ -12252,20 +12871,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-       /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
-        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
-        * bits are further modified by vmx_set_efer() below.
-        */
-       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
-
-       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
-        * emulated by vmx_set_efer(), below.
-        */
-       vm_entry_controls_init(vmx, 
-               (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
-                       ~VM_ENTRY_IA32E_MODE) |
-               (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
-
        if (vmx->nested.nested_run_pending &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
                vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
@@ -12288,37 +12893,29 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                 * influence global bitmap(for vpid01 and vpid02 allocation)
                 * even if spawn a lot of nested vCPUs.
                 */
-               if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+               if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
                        if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
                                vmx->nested.last_vpid = vmcs12->virtual_processor_id;
-                               __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+                               __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
                        }
                } else {
-                       vmx_flush_tlb(vcpu, true);
+                       /*
+                        * If L1 uses EPT, then L0 needs to execute INVEPT on
+                        * EPTP02 instead of EPTP01. Therefore, delay TLB
+                        * flush until vmcs02->eptp is fully updated by
+                        * KVM_REQ_LOAD_CR3. Note that this assumes
+                        * KVM_REQ_TLB_FLUSH is evaluated after
+                        * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
+                        */
+                       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
                }
        }
 
-       if (enable_pml) {
-               /*
-                * Conceptually we want to copy the PML address and index from
-                * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
-                * since we always flush the log on each vmexit, this happens
-                * to be equivalent to simply resetting the fields in vmcs02.
-                */
-               ASSERT(vmx->pml_pg);
-               vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
-               vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
-       }
-
-       if (nested_cpu_has_ept(vmcs12)) {
-               if (nested_ept_init_mmu_context(vcpu)) {
-                       *entry_failure_code = ENTRY_FAIL_DEFAULT;
-                       return 1;
-               }
-       } else if (nested_cpu_has2(vmcs12,
-                                  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+       if (nested_cpu_has_ept(vmcs12))
+               nested_ept_init_mmu_context(vcpu);
+       else if (nested_cpu_has2(vmcs12,
+                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                vmx_flush_tlb(vcpu, true);
-       }
 
        /*
         * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
@@ -12334,14 +12931,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmx_set_cr4(vcpu, vmcs12->guest_cr4);
        vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
 
-       if (vmx->nested.nested_run_pending &&
-           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
-               vcpu->arch.efer = vmcs12->guest_ia32_efer;
-       else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
-               vcpu->arch.efer |= (EFER_LMA | EFER_LME);
-       else
-               vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
-       /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+       vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
+       /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
        vmx_set_efer(vcpu, vcpu->arch.efer);
 
        /*
@@ -12383,6 +12974,7 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
 static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       bool ia32e;
 
        if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
            vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
@@ -12457,6 +13049,21 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
 
        /*
+        * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+        * IA32_EFER MSR must be 0 in the field for that register. In addition,
+        * the values of the LMA and LME bits in the field must each be that of
+        * the host address-space size VM-exit control.
+        */
+       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+               ia32e = (vmcs12->vm_exit_controls &
+                        VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+               if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
+                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
+                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
+                       return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+       }
+
+       /*
         * From the Intel SDM, volume 3:
         * Fields relevant to VM-entry event injection must be set properly.
         * These fields are the VM-entry interruption-information field, the
@@ -12512,6 +13119,10 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                }
        }
 
+       if (nested_cpu_has_ept(vmcs12) &&
+           !valid_ept_address(vcpu, vmcs12->ept_pointer))
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
        return 0;
 }
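
The host IA32_EFER consistency check added to check_vmentry_prereqs() above reduces to a single predicate.  A minimal sketch of that reading follows; the wrapper name is invented for illustration, while kvm_valid_efer() and the EFER/VMX bit macros are the ones used in the hunk:

	/* Illustrative helper only, not part of the patch. */
	static bool nested_host_efer_valid(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
	{
		bool ia32e = (vmcs12->vm_exit_controls &
			      VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;

		return kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) &&
		       ia32e == !!(vmcs12->host_ia32_efer & EFER_LMA) &&
		       ia32e == !!(vmcs12->host_ia32_efer & EFER_LME);
	}
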
 
@@ -12532,94 +13143,192 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
        if (is_error_page(page))
                return -EINVAL;
 
-       r = 0;
-       shadow = kmap(page);
-       if (shadow->hdr.revision_id != VMCS12_REVISION ||
-           shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
-               r = -EINVAL;
-       kunmap(page);
-       kvm_release_page_clean(page);
-       return r;
-}
+       r = 0;
+       shadow = kmap(page);
+       if (shadow->hdr.revision_id != VMCS12_REVISION ||
+           shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
+               r = -EINVAL;
+       kunmap(page);
+       kvm_release_page_clean(page);
+       return r;
+}
+
+static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+                                 u32 *exit_qual)
+{
+       bool ia32e;
+
+       *exit_qual = ENTRY_FAIL_DEFAULT;
+
+       if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
+           !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
+               return 1;
+
+       if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
+               *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
+               return 1;
+       }
+
+       /*
+        * If the load IA32_EFER VM-entry control is 1, the following checks
+        * are performed on the field for the IA32_EFER MSR:
+        * - Bits reserved in the IA32_EFER MSR must be 0.
+        * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
+        *   the IA-32e mode guest VM-exit control. It must also be identical
+        *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
+        *   CR0.PG) is 1.
+        */
+       if (to_vmx(vcpu)->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
+               ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
+               if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
+                   ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
+                   ((vmcs12->guest_cr0 & X86_CR0_PG) &&
+                    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
+                       return 1;
+       }
+
+       if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
+               (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
+               (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
+                       return 1;
+
+       return 0;
+}
+
+static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long cr3, cr4;
+
+       if (!nested_early_check)
+               return 0;
+
+       if (vmx->msr_autoload.host.nr)
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+       if (vmx->msr_autoload.guest.nr)
+               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+
+       preempt_disable();
+
+       vmx_prepare_switch_to_guest(vcpu);
+
+       /*
+        * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
+        * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
+        * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
+        * there is no need to preserve other bits or save/restore the field.
+        */
+       vmcs_writel(GUEST_RFLAGS, 0);
+
+       vmcs_writel(HOST_RIP, vmx_early_consistency_check_return);
+
+       cr3 = __get_current_cr3_fast();
+       if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+               vmcs_writel(HOST_CR3, cr3);
+               vmx->loaded_vmcs->host_state.cr3 = cr3;
+       }
+
+       cr4 = cr4_read_shadow();
+       if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
+               vmcs_writel(HOST_CR4, cr4);
+               vmx->loaded_vmcs->host_state.cr4 = cr4;
+       }
+
+       vmx->__launched = vmx->loaded_vmcs->launched;
+
+       asm(
+               /* Set HOST_RSP */
+               __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
+               "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t"
+
+               /* Check if vmlaunch or vmresume is needed */
+               "cmpl $0, %c[launched](%0)\n\t"
+               "je 1f\n\t"
+               __ex("vmresume") "\n\t"
+               "jmp 2f\n\t"
+               "1: " __ex("vmlaunch") "\n\t"
+               "jmp 2f\n\t"
+               "2: "
+
+               /* Set vmx->fail accordingly */
+               "setbe %c[fail](%0)\n\t"
+
+               ".pushsection .rodata\n\t"
+               ".global vmx_early_consistency_check_return\n\t"
+               "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t"
+               ".popsection"
+             :
+             : "c"(vmx), "d"((unsigned long)HOST_RSP),
+               [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
+               [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+               [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp))
+             : "rax", "cc", "memory"
+       );
 
-static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-                                 u32 *exit_qual)
-{
-       bool ia32e;
+       vmcs_writel(HOST_RIP, vmx_return);
 
-       *exit_qual = ENTRY_FAIL_DEFAULT;
+       preempt_enable();
 
-       if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
-           !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
-               return 1;
+       if (vmx->msr_autoload.host.nr)
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       if (vmx->msr_autoload.guest.nr)
+               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 
-       if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
-               *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
+       if (vmx->fail) {
+               WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+                            VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               vmx->fail = 0;
                return 1;
        }
 
        /*
-        * If the load IA32_EFER VM-entry control is 1, the following checks
-        * are performed on the field for the IA32_EFER MSR:
-        * - Bits reserved in the IA32_EFER MSR must be 0.
-        * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
-        *   the IA-32e mode guest VM-exit control. It must also be identical
-        *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
-        *   CR0.PG) is 1.
+        * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
         */
-       if (to_vmx(vcpu)->nested.nested_run_pending &&
-           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
-               ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
-               if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
-                   ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
-                   ((vmcs12->guest_cr0 & X86_CR0_PG) &&
-                    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
-                       return 1;
-       }
+       local_irq_enable();
+       if (hw_breakpoint_active())
+               set_debugreg(__this_cpu_read(cpu_dr7), 7);
 
        /*
-        * If the load IA32_EFER VM-exit control is 1, bits reserved in the
-        * IA32_EFER MSR must be 0 in the field for that register. In addition,
-        * the values of the LMA and LME bits in the field must each be that of
-        * the host address-space size VM-exit control.
+        * A non-failing VMEntry means we somehow entered guest mode with
+        * an illegal RIP, and that's just the tip of the iceberg.  There
+        * is no telling what memory has been modified or what state has
+        * been exposed to unknown code.  Hitting this all but guarantees
+        * a (very critical) hardware issue.
         */
-       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
-               ia32e = (vmcs12->vm_exit_controls &
-                        VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
-               if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
-                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
-                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
-                       return 1;
-       }
-
-       if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
-               (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
-               (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
-                       return 1;
+       WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
+               VMX_EXIT_REASONS_FAILED_VMENTRY));
 
        return 0;
 }
+STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
+
+static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+                                  struct vmcs12 *vmcs12);
 
 /*
- * If exit_qual is NULL, this is being called from state restore (either RSM
+ * If from_vmentry is false, this is being called from state restore (either RSM
  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
+ *
+ * Returns:
+ *   0 - success, i.e. proceed with actual VMEnter
+ *   1 - consistency check VMExit
+ *  -1 - consistency check VMFail
  */
-static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
+static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+                                         bool from_vmentry)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       bool from_vmentry = !!exit_qual;
-       u32 dummy_exit_qual;
        bool evaluate_pending_interrupts;
-       int r = 0;
+       u32 exit_reason = EXIT_REASON_INVALID_STATE;
+       u32 exit_qual;
 
        evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
                (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
        if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
                evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
 
-       enter_guest_mode(vcpu);
-
        if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
                vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
        if (kvm_mpx_supported() &&
@@ -12627,24 +13336,35 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
                vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 
        vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
-       vmx_segment_cache_clear(vmx);
 
+       prepare_vmcs02_early(vmx, vmcs12);
+
+       if (from_vmentry) {
+               nested_get_vmcs12_pages(vcpu);
+
+               if (nested_vmx_check_vmentry_hw(vcpu)) {
+                       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+                       return -1;
+               }
+
+               if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+                       goto vmentry_fail_vmexit;
+       }
+
+       enter_guest_mode(vcpu);
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
                vcpu->arch.tsc_offset += vmcs12->tsc_offset;
 
-       r = EXIT_REASON_INVALID_STATE;
-       if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual))
-               goto fail;
+       if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
+               goto vmentry_fail_vmexit_guest_mode;
 
        if (from_vmentry) {
-               nested_get_vmcs12_pages(vcpu);
-
-               r = EXIT_REASON_MSR_LOAD_FAIL;
-               *exit_qual = nested_vmx_load_msr(vcpu,
-                                                vmcs12->vm_entry_msr_load_addr,
-                                                vmcs12->vm_entry_msr_load_count);
-               if (*exit_qual)
-                       goto fail;
+               exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
+               exit_qual = nested_vmx_load_msr(vcpu,
+                                               vmcs12->vm_entry_msr_load_addr,
+                                               vmcs12->vm_entry_msr_load_count);
+               if (exit_qual)
+                       goto vmentry_fail_vmexit_guest_mode;
        } else {
                /*
                 * The MMU is not initialized to point at the right entities yet and
@@ -12681,12 +13401,28 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
         */
        return 0;
 
-fail:
+       /*
+        * A failed consistency check that leads to a VMExit during L1's
+        * VMEnter to L2 is a variation of a normal VMexit, as explained in
+        * 26.7 "VM-entry failures during or after loading guest state".
+        */
+vmentry_fail_vmexit_guest_mode:
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
                vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
        leave_guest_mode(vcpu);
+
+vmentry_fail_vmexit:
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-       return r;
+
+       if (!from_vmentry)
+               return 1;
+
+       load_vmcs12_host_state(vcpu, vmcs12);
+       vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
+       vmcs12->exit_qualification = exit_qual;
+       if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
+               vmx->nested.need_vmcs12_sync = true;
+       return 1;
 }
 
 /*
@@ -12698,14 +13434,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        struct vmcs12 *vmcs12;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
-       u32 exit_qual;
        int ret;
 
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (!nested_vmx_check_vmcs12(vcpu))
-               goto out;
+       if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true))
+               return 1;
+
+       if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
 
        vmcs12 = get_vmcs12(vcpu);
 
@@ -12715,13 +13453,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         * rather than RFLAGS.ZF, and no error number is stored to the
         * VM-instruction error field.
         */
-       if (vmcs12->hdr.shadow_vmcs) {
-               nested_vmx_failInvalid(vcpu);
-               goto out;
-       }
+       if (vmcs12->hdr.shadow_vmcs)
+               return nested_vmx_failInvalid(vcpu);
 
-       if (enable_shadow_vmcs)
+       if (vmx->nested.hv_evmcs) {
+               copy_enlightened_to_vmcs12(vmx);
+               /* Enlightened VMCS doesn't have launch state */
+               vmcs12->launch_state = !launch;
+       } else if (enable_shadow_vmcs) {
                copy_shadow_to_vmcs12(vmx);
+       }
 
        /*
         * The nested entry process starts with enforcing various prerequisites
@@ -12733,59 +13474,37 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         * for misconfigurations which will anyway be caught by the processor
         * when using the merged vmcs02.
         */
-       if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
-               nested_vmx_failValid(vcpu,
-                                    VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
-               goto out;
-       }
+       if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
 
-       if (vmcs12->launch_state == launch) {
-               nested_vmx_failValid(vcpu,
+       if (vmcs12->launch_state == launch)
+               return nested_vmx_failValid(vcpu,
                        launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
                               : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
-               goto out;
-       }
 
        ret = check_vmentry_prereqs(vcpu, vmcs12);
-       if (ret) {
-               nested_vmx_failValid(vcpu, ret);
-               goto out;
-       }
-
-       /*
-        * After this point, the trap flag no longer triggers a singlestep trap
-        * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
-        * This is not 100% correct; for performance reasons, we delegate most
-        * of the checks on host state to the processor.  If those fail,
-        * the singlestep trap is missed.
-        */
-       skip_emulated_instruction(vcpu);
-
-       ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
-       if (ret) {
-               nested_vmx_entry_failure(vcpu, vmcs12,
-                                        EXIT_REASON_INVALID_STATE, exit_qual);
-               return 1;
-       }
+       if (ret)
+               return nested_vmx_failValid(vcpu, ret);
 
        /*
         * We're finally done with prerequisite checking, and can start with
         * the nested entry.
         */
-
        vmx->nested.nested_run_pending = 1;
-       ret = enter_vmx_non_root_mode(vcpu, &exit_qual);
-       if (ret) {
-               nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual);
-               vmx->nested.nested_run_pending = 0;
+       ret = nested_vmx_enter_non_root_mode(vcpu, true);
+       vmx->nested.nested_run_pending = !ret;
+       if (ret > 0)
                return 1;
-       }
+       else if (ret)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 
        /* Hide L1D cache contents from the nested guest.  */
        vmx->vcpu.arch.l1tf_flush_l1d = true;
 
        /*
-        * Must happen outside of enter_vmx_non_root_mode() as it will
+        * Must happen outside of nested_vmx_enter_non_root_mode() as it will
         * also be used as part of restoring nVMX state for
         * snapshot restore (migration).
         *
@@ -12806,9 +13525,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
                return kvm_vcpu_halt(vcpu);
        }
        return 1;
-
-out:
-       return kvm_skip_emulated_instruction(vcpu);
 }
 
 /*
@@ -13122,24 +13838,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        kvm_clear_interrupt_queue(vcpu);
 }
 
-static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu,
-                       struct vmcs12 *vmcs12)
-{
-       u32 entry_failure_code;
-
-       nested_ept_uninit_mmu_context(vcpu);
-
-       /*
-        * Only PDPTE load can fail as the value of cr3 was checked on entry and
-        * couldn't have changed.
-        */
-       if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
-               nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
-
-       if (!enable_ept)
-               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-}
-
 /*
  * A part of what we need to do when the nested L2 guest exits and we want to
  * run its L1 parent, is to reset L1's guest state to the host state specified
@@ -13153,6 +13851,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                                   struct vmcs12 *vmcs12)
 {
        struct kvm_segment seg;
+       u32 entry_failure_code;
 
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
                vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -13165,6 +13864,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
        kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
        vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
+       vmx_set_interrupt_shadow(vcpu, 0);
+
        /*
         * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
         * actually changed, because vmx_set_cr0 refers to efer set above.
@@ -13179,23 +13880,35 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
        vmx_set_cr4(vcpu, vmcs12->host_cr4);
 
-       load_vmcs12_mmu_host_state(vcpu, vmcs12);
+       nested_ept_uninit_mmu_context(vcpu);
+
+       /*
+        * Only PDPTE load can fail as the value of cr3 was checked on entry and
+        * couldn't have changed.
+        */
+       if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+               nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
+
+       if (!enable_ept)
+               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
 
        /*
-        * If vmcs01 don't use VPID, CPU flushes TLB on every
+        * If vmcs01 doesn't use VPID, CPU flushes TLB on every
         * VMEntry/VMExit. Thus, no need to flush TLB.
         *
-        * If vmcs12 uses VPID, TLB entries populated by L2 are
-        * tagged with vmx->nested.vpid02 while L1 entries are tagged
-        * with vmx->vpid. Thus, no need to flush TLB.
+        * If vmcs12 doesn't use VPID, L1 expects TLB to be
+        * flushed on every VMEntry/VMExit.
         *
-        * Therefore, flush TLB only in case vmcs01 uses VPID and
-        * vmcs12 don't use VPID as in this case L1 & L2 TLB entries
-        * are both tagged with vmx->vpid.
+        * Otherwise, we can preserve TLB entries as long as we are
+        * able to tag L1 TLB entries differently than L2 TLB entries.
+        *
+        * If vmcs12 uses EPT, we need to execute this flush on EPTP01
+        * and therefore we request the TLB flush to happen only after VMCS EPTP
+        * has been set by KVM_REQ_LOAD_CR3.
         */
        if (enable_vpid &&
-           !(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02)) {
-               vmx_flush_tlb(vcpu, true);
+           (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
+               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
        }
 
        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
@@ -13275,6 +13988,140 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
 }
 
+static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
+{
+       struct shared_msr_entry *efer_msr;
+       unsigned int i;
+
+       if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
+               return vmcs_read64(GUEST_IA32_EFER);
+
+       if (cpu_has_load_ia32_efer)
+               return host_efer;
+
+       for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
+               if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
+                       return vmx->msr_autoload.guest.val[i].value;
+       }
+
+       efer_msr = find_msr_entry(vmx, MSR_EFER);
+       if (efer_msr)
+               return efer_msr->data;
+
+       return host_efer;
+}
+
+static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct vmx_msr_entry g, h;
+       struct msr_data msr;
+       gpa_t gpa;
+       u32 i, j;
+
+       vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
+
+       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+               /*
+                * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
+                * as vmcs01.GUEST_DR7 contains a userspace defined value
+                * and vcpu->arch.dr7 is not squirreled away before the
+                * nested VMENTER (not worth adding a variable in nested_vmx).
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+                       kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+               else
+                       WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+       }
+
+       /*
+        * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+        * handle a variety of side effects to KVM's software model.
+        */
+       vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
+
+       vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+       vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
+
+       vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+       vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
+
+       nested_ept_uninit_mmu_context(vcpu);
+       vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+       __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+       /*
+        * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
+        * from vmcs01 (if necessary).  The PDPTRs are not loaded on
+        * VMFail, like everything else we just need to ensure our
+        * software model is up-to-date.
+        */
+       ept_save_pdptrs(vcpu);
+
+       kvm_mmu_reset_context(vcpu);
+
+       if (cpu_has_vmx_msr_bitmap())
+               vmx_update_msr_bitmap(vcpu);
+
+       /*
+        * This nasty bit of open coding is a compromise between blindly
+        * loading L1's MSRs using the exit load lists (incorrect emulation
+        * of VMFail), leaving the nested VM's MSRs in the software model
+        * (incorrect behavior) and snapshotting the modified MSRs (too
+        * expensive since the lists are unbound by hardware).  For each
+        * MSR that was (prematurely) loaded from the nested VMEntry load
+        * list, reload it from the exit load list if it exists and differs
+        * from the guest value.  The intent is to stuff host state as
+        * silently as possible, not to fully process the exit load list.
+        */
+       msr.host_initiated = false;
+       for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
+               gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
+               if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
+                       pr_debug_ratelimited(
+                               "%s read MSR index failed (%u, 0x%08llx)\n",
+                               __func__, i, gpa);
+                       goto vmabort;
+               }
+
+               for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
+                       gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
+                       if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
+                               pr_debug_ratelimited(
+                                       "%s read MSR failed (%u, 0x%08llx)\n",
+                                       __func__, j, gpa);
+                               goto vmabort;
+                       }
+                       if (h.index != g.index)
+                               continue;
+                       if (h.value == g.value)
+                               break;
+
+                       if (nested_vmx_load_msr_check(vcpu, &h)) {
+                               pr_debug_ratelimited(
+                                       "%s check failed (%u, 0x%x, 0x%x)\n",
+                                       __func__, j, h.index, h.reserved);
+                               goto vmabort;
+                       }
+
+                       msr.index = h.index;
+                       msr.data = h.value;
+                       if (kvm_set_msr(vcpu, &msr)) {
+                               pr_debug_ratelimited(
+                                       "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
+                                       __func__, j, h.index, h.value);
+                               goto vmabort;
+                       }
+               }
+       }
+
+       return;
+
+vmabort:
+       nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+}
+
 /*
  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
  * and modify vmcs12 to make it see what it would expect to see there if
@@ -13290,14 +14137,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
-       /*
-        * The only expected VM-instruction error is "VM entry with
-        * invalid control field(s)." Anything else indicates a
-        * problem with L0.
-        */
-       WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
-                                  VMXERR_ENTRY_INVALID_CONTROL_FIELD));
-
        leave_guest_mode(vcpu);
 
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
@@ -13324,12 +14163,19 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
                                         vmcs12->vm_exit_msr_store_count))
                        nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+       } else {
+               /*
+                * The only expected VM-instruction error is "VM entry with
+                * invalid control field(s)." Anything else indicates a
+                * problem with L0.  And we should never get here with a
+                * VMFail of any type if early consistency checks are enabled.
+                */
+               WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+                            VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               WARN_ON_ONCE(nested_early_check);
        }
 
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-       vm_entry_controls_reset_shadow(vmx);
-       vm_exit_controls_reset_shadow(vmx);
-       vmx_segment_cache_clear(vmx);
 
        /* Update any VMCS fields that might have changed while L2 ran */
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
@@ -13373,8 +14219,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
         */
        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
-       if (enable_shadow_vmcs && exit_reason != -1)
-               vmx->nested.sync_shadow_vmcs = true;
+       if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
+               vmx->nested.need_vmcs12_sync = true;
 
        /* in case we halted in L2 */
        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -13409,24 +14255,24 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
                return;
        }
-       
+
        /*
         * After an early L2 VM-entry failure, we're now back
         * in L1 which thinks it just finished a VMLAUNCH or
         * VMRESUME instruction, so we need to set the failure
         * flag and the VM-instruction error field of the VMCS
-        * accordingly.
+        * accordingly, and skip the emulated instruction.
         */
-       nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-
-       load_vmcs12_mmu_host_state(vcpu, vmcs12);
+       (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 
        /*
-        * The emulated instruction was already skipped in
-        * nested_vmx_run, but the updated RIP was never
-        * written back to the vmcs01.
+        * Restore L1's host state to KVM's software model.  We're here
+        * because a consistency check was caught by hardware, which
+        * means some amount of guest state has been propagated to KVM's
+        * model and needs to be unwound to the host's state.
         */
-       skip_emulated_instruction(vcpu);
+       nested_vmx_restore_host_state(vcpu);
+
        vmx->fail = 0;
 }
 
@@ -13439,26 +14285,7 @@ static void vmx_leave_nested(struct kvm_vcpu *vcpu)
                to_vmx(vcpu)->nested.nested_run_pending = 0;
                nested_vmx_vmexit(vcpu, -1, 0, 0);
        }
-       free_nested(to_vmx(vcpu));
-}
-
-/*
- * L1's failure to enter L2 is a subset of a normal exit, as explained in
- * 23.7 "VM-entry failures during or after loading guest state" (this also
- * lists the acceptable exit-reason and exit-qualification parameters).
- * It should only be called before L2 actually succeeded to run, and when
- * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss).
- */
-static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
-                       struct vmcs12 *vmcs12,
-                       u32 reason, unsigned long qualification)
-{
-       load_vmcs12_host_state(vcpu, vmcs12);
-       vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
-       vmcs12->exit_qualification = qualification;
-       nested_vmx_succeed(vcpu);
-       if (enable_shadow_vmcs)
-               to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
+       free_nested(vcpu);
 }
 
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -13884,7 +14711,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
 
        if (vmx->nested.smm.guest_mode) {
                vcpu->arch.hflags &= ~HF_SMM_MASK;
-               ret = enter_vmx_non_root_mode(vcpu, NULL);
+               ret = nested_vmx_enter_non_root_mode(vcpu, false);
                vcpu->arch.hflags |= HF_SMM_MASK;
                if (ret)
                        return ret;
@@ -13899,6 +14726,20 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       /*
+        * In case we do two consecutive get/set_nested_state()s while L2 was
+        * running hv_evmcs may end up not being mapped (we map it from
+        * running, hv_evmcs may end up not being mapped (we map it from
+        * have vmcs12 if it is true.
+        */
+       return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull ||
+               vmx->nested.hv_evmcs;
+}
+
 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
                                struct kvm_nested_state __user *user_kvm_nested_state,
                                u32 user_data_size)
@@ -13918,12 +14759,16 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 
        vmx = to_vmx(vcpu);
        vmcs12 = get_vmcs12(vcpu);
+
+       if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled)
+               kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
+
        if (nested_vmx_allowed(vcpu) &&
            (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
                kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
                kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
 
-               if (vmx->nested.current_vmptr != -1ull) {
+               if (vmx_has_valid_vmcs12(vcpu)) {
                        kvm_state.size += VMCS12_SIZE;
 
                        if (is_guest_mode(vcpu) &&
@@ -13952,20 +14797,24 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
        if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
                return -EFAULT;
 
-       if (vmx->nested.current_vmptr == -1ull)
+       if (!vmx_has_valid_vmcs12(vcpu))
                goto out;
 
        /*
         * When running L2, the authoritative vmcs12 state is in the
         * vmcs02. When running L1, the authoritative vmcs12 state is
-        * in the shadow vmcs linked to vmcs01, unless
-        * sync_shadow_vmcs is set, in which case, the authoritative
+        * in the shadow or enlightened vmcs linked to vmcs01, unless
+        * need_vmcs12_sync is set, in which case, the authoritative
         * vmcs12 state is in the vmcs12 already.
         */
-       if (is_guest_mode(vcpu))
+       if (is_guest_mode(vcpu)) {
                sync_vmcs12(vcpu, vmcs12);
-       else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
-               copy_shadow_to_vmcs12(vmx);
+       } else if (!vmx->nested.need_vmcs12_sync) {
+               if (vmx->nested.hv_evmcs)
+                       copy_enlightened_to_vmcs12(vmx);
+               else if (enable_shadow_vmcs)
+                       copy_shadow_to_vmcs12(vmx);
+       }
 
        if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
                return -EFAULT;
@@ -13993,6 +14842,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
        if (kvm_state->format != 0)
                return -EINVAL;
 
+       if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
+               nested_enable_evmcs(vcpu, NULL);
+
        if (!nested_vmx_allowed(vcpu))
                return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
 
@@ -14010,13 +14862,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
        if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
                return -EINVAL;
 
-       if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
-               return -EINVAL;
-
-       if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
-           !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
-               return -EINVAL;
-
        if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
            (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
                return -EINVAL;
@@ -14046,7 +14891,25 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
        if (ret)
                return ret;
 
-       set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+       /* Empty 'VMXON' state is permitted */
+       if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
+               return 0;
+
+       if (kvm_state->vmx.vmcs_pa != -1ull) {
+               if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
+                   !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
+                       return -EINVAL;
+
+               set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+       } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
+               /*
+                * Sync eVMCS upon entry as we may not have
+                * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
+                */
+               vmx->nested.need_vmcs12_sync = true;
+       } else {
+               return -EINVAL;
+       }
 
        if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
                vmx->nested.smm.vmxon = true;
@@ -14090,7 +14953,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
                return -EINVAL;
 
        vmx->nested.dirty_vmcs12 = true;
-       ret = enter_vmx_non_root_mode(vcpu, NULL);
+       ret = nested_vmx_enter_non_root_mode(vcpu, false);
        if (ret)
                return -EINVAL;
 
@@ -14242,6 +15105,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .pre_enter_smm = vmx_pre_enter_smm,
        .pre_leave_smm = vmx_pre_leave_smm,
        .enable_smi_window = enable_smi_window,
+
+       .nested_enable_evmcs = nested_enable_evmcs,
 };
 
 static void vmx_cleanup_l1d_flush(void)
index cd0c75f..132432f 100644 (file)
@@ -28,7 +28,6 @@
  */
 
 /* 16-bits */
-SHADOW_FIELD_RW(GUEST_CS_SELECTOR)
 SHADOW_FIELD_RW(GUEST_INTR_STATUS)
 SHADOW_FIELD_RW(GUEST_PML_INDEX)
 SHADOW_FIELD_RW(HOST_FS_SELECTOR)
@@ -47,8 +46,8 @@ SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE)
 SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD)
 SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN)
 SHADOW_FIELD_RW(TPR_THRESHOLD)
-SHADOW_FIELD_RW(GUEST_CS_LIMIT)
 SHADOW_FIELD_RW(GUEST_CS_AR_BYTES)
+SHADOW_FIELD_RW(GUEST_SS_AR_BYTES)
 SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO)
 SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE)
 
@@ -61,8 +60,6 @@ SHADOW_FIELD_RW(GUEST_CR0)
 SHADOW_FIELD_RW(GUEST_CR3)
 SHADOW_FIELD_RW(GUEST_CR4)
 SHADOW_FIELD_RW(GUEST_RFLAGS)
-SHADOW_FIELD_RW(GUEST_CS_BASE)
-SHADOW_FIELD_RW(GUEST_ES_BASE)
 SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK)
 SHADOW_FIELD_RW(CR0_READ_SHADOW)
 SHADOW_FIELD_RW(CR4_READ_SHADOW)
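
For context, this header is an X-macro table: the includer defines SHADOW_FIELD_RO()/SHADOW_FIELD_RW() and re-includes the file to build the shadow-field arrays used when copying between vmcs12 and the shadow VMCS.  A sketch of the consuming pattern (array name and element type may differ from the tree):

	#define SHADOW_FIELD_RO(x)		/* ignore read-only fields here */
	#define SHADOW_FIELD_RW(x)	x,
	static const unsigned long shadow_read_write_fields[] = {
	#include "vmx_shadow_fields.h"
	};
	#undef SHADOW_FIELD_RO
	#undef SHADOW_FIELD_RW
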
index ca71773..66d66d7 100644 (file)
@@ -136,7 +136,7 @@ static u32 __read_mostly tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 
 /* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 0;
+unsigned int __read_mostly lapic_timer_advance_ns = 1000;
 module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
 EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
 
@@ -400,9 +400,51 @@ static int exception_type(int vector)
        return EXCPT_FAULT;
 }
 
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
+{
+       unsigned nr = vcpu->arch.exception.nr;
+       bool has_payload = vcpu->arch.exception.has_payload;
+       unsigned long payload = vcpu->arch.exception.payload;
+
+       if (!has_payload)
+               return;
+
+       switch (nr) {
+       case DB_VECTOR:
+               /*
+                * "Certain debug exceptions may clear bit 0-3.  The
+                * remaining contents of the DR6 register are never
+                * cleared by the processor".
+                */
+               vcpu->arch.dr6 &= ~DR_TRAP_BITS;
+               /*
+                * DR6.RTM is set by all #DB exceptions that don't clear it.
+                */
+               vcpu->arch.dr6 |= DR6_RTM;
+               vcpu->arch.dr6 |= payload;
+               /*
+                * Bit 16 should be set in the payload whenever the #DB
+                * exception should clear DR6.RTM. This makes the payload
+                * compatible with the pending debug exceptions under VMX.
+                * Though not currently documented in the SDM, this also
+                * makes the payload compatible with the exit qualification
+                * for #DB exceptions under VMX.
+                */
+               vcpu->arch.dr6 ^= payload & DR6_RTM;
+               break;
+       case PF_VECTOR:
+               vcpu->arch.cr2 = payload;
+               break;
+       }
+
+       vcpu->arch.exception.has_payload = false;
+       vcpu->arch.exception.payload = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
+
 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
                unsigned nr, bool has_error, u32 error_code,
-               bool reinject)
+               bool has_payload, unsigned long payload, bool reinject)
 {
        u32 prev_nr;
        int class1, class2;
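
The #DB branch of kvm_deliver_exception_payload() above folds a payload in the VMX pending-debug-exceptions format into DR6.  A self-contained sketch of that transformation; the constants mirror the arch definitions and the function name is made up:

	#define DR_TRAP_BITS	0xfUL		/* B0-B3 */
	#define DR6_BS		(1UL << 14)
	#define DR6_RTM		(1UL << 16)

	static unsigned long fold_db_payload(unsigned long dr6, unsigned long payload)
	{
		dr6 &= ~DR_TRAP_BITS;		/* B0-B3 may be cleared */
		dr6 |= DR6_RTM;			/* set unless the payload clears it */
		dr6 |= payload;			/* e.g. DR6_BS for a single-step trap */
		dr6 ^= payload & DR6_RTM;	/* payload bit 16 clears DR6.RTM */
		return dr6;
	}

	/* fold_db_payload(0xffff0ff0, DR6_BS) == 0xffff4ff0 */
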
@@ -424,6 +466,14 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
                         */
                        WARN_ON_ONCE(vcpu->arch.exception.pending);
                        vcpu->arch.exception.injected = true;
+                       if (WARN_ON_ONCE(has_payload)) {
+                               /*
+                                * A reinjected event has already
+                                * delivered its payload.
+                                */
+                               has_payload = false;
+                               payload = 0;
+                       }
                } else {
                        vcpu->arch.exception.pending = true;
                        vcpu->arch.exception.injected = false;
@@ -431,6 +481,22 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
                vcpu->arch.exception.has_error_code = has_error;
                vcpu->arch.exception.nr = nr;
                vcpu->arch.exception.error_code = error_code;
+               vcpu->arch.exception.has_payload = has_payload;
+               vcpu->arch.exception.payload = payload;
+               /*
+                * In guest mode, payload delivery should be deferred,
+                * so that the L1 hypervisor can intercept #PF before
+                * CR2 is modified (or intercept #DB before DR6 is
+                * modified under nVMX).  However, for ABI
+                * compatibility with KVM_GET_VCPU_EVENTS and
+                * KVM_SET_VCPU_EVENTS, we can't delay payload
+                * delivery unless userspace has enabled this
+                * functionality via the per-VM capability,
+                * KVM_CAP_EXCEPTION_PAYLOAD.
+                */
+               if (!vcpu->kvm->arch.exception_payload_enabled ||
+                   !is_guest_mode(vcpu))
+                       kvm_deliver_exception_payload(vcpu);
                return;
        }
 
@@ -455,6 +521,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
                vcpu->arch.exception.has_error_code = true;
                vcpu->arch.exception.nr = DF_VECTOR;
                vcpu->arch.exception.error_code = 0;
+               vcpu->arch.exception.has_payload = false;
+               vcpu->arch.exception.payload = 0;
        } else
                /* replace previous exception with a new one in a hope
                   that instruction re-execution will regenerate lost
@@ -464,16 +532,29 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
-       kvm_multiple_exception(vcpu, nr, false, 0, false);
+       kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
-       kvm_multiple_exception(vcpu, nr, false, 0, true);
+       kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
+static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
+                                 unsigned long payload)
+{
+       kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
+}
+
+static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
+                                   u32 error_code, unsigned long payload)
+{
+       kvm_multiple_exception(vcpu, nr, true, error_code,
+                              true, payload, false);
+}
+
 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 {
        if (err)
@@ -490,11 +571,13 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
        ++vcpu->stat.pf_guest;
        vcpu->arch.exception.nested_apf =
                is_guest_mode(vcpu) && fault->async_page_fault;
-       if (vcpu->arch.exception.nested_apf)
+       if (vcpu->arch.exception.nested_apf) {
                vcpu->arch.apf.nested_apf_token = fault->address;
-       else
-               vcpu->arch.cr2 = fault->address;
-       kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+               kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+       } else {
+               kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
+                                       fault->address);
+       }
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
@@ -503,7 +586,7 @@ static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fau
        if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
                vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
        else
-               vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+               vcpu->arch.mmu->inject_page_fault(vcpu, fault);
 
        return fault->nested_page_fault;
 }
@@ -517,13 +600,13 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
-       kvm_multiple_exception(vcpu, nr, true, error_code, false);
+       kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
-       kvm_multiple_exception(vcpu, nr, true, error_code, true);
+       kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
 
@@ -602,7 +685,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
                if ((pdpte[i] & PT_PRESENT_MASK) &&
                    (pdpte[i] &
-                    vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
+                    vcpu->arch.mmu->guest_rsvd_check.rsvd_bits_mask[0][2])) {
                        ret = 0;
                        goto out;
                }
@@ -2477,7 +2560,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
                break;
        case MSR_KVM_PV_EOI_EN:
-               if (kvm_lapic_enable_pv_eoi(vcpu, data))
+               if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
                        return 1;
                break;
 
@@ -2912,6 +2995,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_HYPERV_VP_INDEX:
        case KVM_CAP_HYPERV_EVENTFD:
        case KVM_CAP_HYPERV_TLBFLUSH:
+       case KVM_CAP_HYPERV_SEND_IPI:
+       case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
        case KVM_CAP_PCI_SEGMENT:
        case KVM_CAP_DEBUGREGS:
        case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -2930,6 +3015,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_IMMEDIATE_EXIT:
        case KVM_CAP_GET_MSR_FEATURES:
        case KVM_CAP_MSR_PLATFORM_INFO:
+       case KVM_CAP_EXCEPTION_PAYLOAD:
                r = 1;
                break;
        case KVM_CAP_SYNC_REGS:
@@ -3362,19 +3448,33 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                                               struct kvm_vcpu_events *events)
 {
        process_nmi(vcpu);
+
        /*
-        * FIXME: pass injected and pending separately.  This is only
-        * needed for nested virtualization, whose state cannot be
-        * migrated yet.  For now we can combine them.
+        * The API doesn't provide the instruction length for software
+        * exceptions, so don't report them. As long as the guest RIP
+        * isn't advanced, we should expect to encounter the exception
+        * again.
         */
-       events->exception.injected =
-               (vcpu->arch.exception.pending ||
-                vcpu->arch.exception.injected) &&
-               !kvm_exception_is_soft(vcpu->arch.exception.nr);
+       if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
+               events->exception.injected = 0;
+               events->exception.pending = 0;
+       } else {
+               events->exception.injected = vcpu->arch.exception.injected;
+               events->exception.pending = vcpu->arch.exception.pending;
+               /*
+                * For ABI compatibility, deliberately conflate
+                * pending and injected exceptions when
+                * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
+                */
+               if (!vcpu->kvm->arch.exception_payload_enabled)
+                       events->exception.injected |=
+                               vcpu->arch.exception.pending;
+       }
        events->exception.nr = vcpu->arch.exception.nr;
        events->exception.has_error_code = vcpu->arch.exception.has_error_code;
-       events->exception.pad = 0;
        events->exception.error_code = vcpu->arch.exception.error_code;
+       events->exception_has_payload = vcpu->arch.exception.has_payload;
+       events->exception_payload = vcpu->arch.exception.payload;
 
        events->interrupt.injected =
                vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
@@ -3398,6 +3498,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
        events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
                         | KVM_VCPUEVENT_VALID_SHADOW
                         | KVM_VCPUEVENT_VALID_SMM);
+       if (vcpu->kvm->arch.exception_payload_enabled)
+               events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+
        memset(&events->reserved, 0, sizeof(events->reserved));
 }
 
@@ -3409,12 +3512,24 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
        if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
                              | KVM_VCPUEVENT_VALID_SIPI_VECTOR
                              | KVM_VCPUEVENT_VALID_SHADOW
-                             | KVM_VCPUEVENT_VALID_SMM))
+                             | KVM_VCPUEVENT_VALID_SMM
+                             | KVM_VCPUEVENT_VALID_PAYLOAD))
                return -EINVAL;
 
-       if (events->exception.injected &&
-           (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
-            is_guest_mode(vcpu)))
+       if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
+               if (!vcpu->kvm->arch.exception_payload_enabled)
+                       return -EINVAL;
+               if (events->exception.pending)
+                       events->exception.injected = 0;
+               else
+                       events->exception_has_payload = 0;
+       } else {
+               events->exception.pending = 0;
+               events->exception_has_payload = 0;
+       }
+
+       if ((events->exception.injected || events->exception.pending) &&
+           (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
                return -EINVAL;
 
        /* INITs are latched while in SMM */
@@ -3424,11 +3539,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                return -EINVAL;
 
        process_nmi(vcpu);
-       vcpu->arch.exception.injected = false;
-       vcpu->arch.exception.pending = events->exception.injected;
+       vcpu->arch.exception.injected = events->exception.injected;
+       vcpu->arch.exception.pending = events->exception.pending;
        vcpu->arch.exception.nr = events->exception.nr;
        vcpu->arch.exception.has_error_code = events->exception.has_error_code;
        vcpu->arch.exception.error_code = events->exception.error_code;
+       vcpu->arch.exception.has_payload = events->exception_has_payload;
+       vcpu->arch.exception.payload = events->exception_payload;
 
        vcpu->arch.interrupt.injected = events->interrupt.injected;
        vcpu->arch.interrupt.nr = events->interrupt.nr;
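
From userspace, the new fields let a VMM queue a pending #PF together with its CR2 payload once KVM_CAP_EXCEPTION_PAYLOAD is enabled (enable-cap handling is in the kvm_vm_ioctl_enable_cap() hunk further below).  A hedged sketch; the vCPU fd is assumed, error handling is trimmed, and the fault address and error code are arbitrary examples:

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static void queue_pf_with_payload(int vcpu_fd, __u64 fault_addr)
	{
		struct kvm_vcpu_events events = {};

		ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events);
		events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
		events.exception.pending = 1;		/* pending, not yet injected */
		events.exception.nr = 14;		/* #PF */
		events.exception.has_error_code = 1;
		events.exception.error_code = 0x2;	/* example: write fault */
		events.exception_has_payload = 1;
		events.exception_payload = fault_addr;	/* delivered to CR2 later */
		ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
	}
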
@@ -3694,6 +3811,10 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
                                     struct kvm_enable_cap *cap)
 {
+       int r;
+       uint16_t vmcs_version;
+       void __user *user_ptr;
+
        if (cap->flags)
                return -EINVAL;
 
@@ -3706,6 +3827,16 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
                        return -EINVAL;
                return kvm_hv_activate_synic(vcpu, cap->cap ==
                                             KVM_CAP_HYPERV_SYNIC2);
+       case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+               r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version);
+               if (!r) {
+                       user_ptr = (void __user *)(uintptr_t)cap->args[0];
+                       if (copy_to_user(user_ptr, &vmcs_version,
+                                        sizeof(vmcs_version)))
+                               r = -EFAULT;
+               }
+               return r;
+
        default:
                return -EINVAL;
        }
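A minimal userspace sketch of enabling KVM_CAP_HYPERV_ENLIGHTENED_VMCS as wired up above, assuming an already-created vCPU file descriptor (vcpu_fd and enable_evmcs are illustrative names, not part of this series); KVM writes the supported eVMCS version back through the user pointer passed in args[0]:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: enable Enlightened VMCS for a vCPU and read back the
 * eVMCS version that KVM stores through cap.args[0].
 */
static int enable_evmcs(int vcpu_fd, uint16_t *evmcs_version)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS;
	cap.args[0] = (uintptr_t)evmcs_version;

	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}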
@@ -4047,11 +4178,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                        break;
 
                if (kvm_state.flags &
-                   ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
+                   ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
+                     | KVM_STATE_NESTED_EVMCS))
                        break;
 
                /* nested_run_pending implies guest_mode.  */
-               if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
+               if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
+                   && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
                        break;
 
                r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
@@ -4363,6 +4496,10 @@ split_irqchip_unlock:
                kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
                r = 0;
                break;
+       case KVM_CAP_EXCEPTION_PAYLOAD:
+               kvm->arch.exception_payload_enabled = cap->args[0];
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
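KVM_CAP_EXCEPTION_PAYLOAD, by contrast, is enabled per VM; a hedged sketch of the userspace side (vm_fd is assumed to be an existing VM file descriptor):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: opt the VM in to exception payload reporting. */
static int enable_exception_payload(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_EXCEPTION_PAYLOAD;
	cap.args[0] = 1;

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}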
@@ -4803,7 +4940,7 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
 
        /* NPT walks are always user-walks */
        access |= PFERR_USER_MASK;
-       t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
+       t_gpa  = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
 
        return t_gpa;
 }
@@ -5889,7 +6026,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
        if (WARN_ON_ONCE(is_guest_mode(vcpu)))
                return false;
 
-       if (!vcpu->arch.mmu.direct_map) {
+       if (!vcpu->arch.mmu->direct_map) {
                /*
                 * Write permission should be allowed since only
                 * write access need to be emulated.
@@ -5922,7 +6059,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
        kvm_release_pfn_clean(pfn);
 
        /* The instructions are well-emulated on direct mmu. */
-       if (vcpu->arch.mmu.direct_map) {
+       if (vcpu->arch.mmu->direct_map) {
                unsigned int indirect_shadow_pages;
 
                spin_lock(&vcpu->kvm->mmu_lock);
@@ -5989,7 +6126,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
        vcpu->arch.last_retry_eip = ctxt->eip;
        vcpu->arch.last_retry_addr = cr2;
 
-       if (!vcpu->arch.mmu.direct_map)
+       if (!vcpu->arch.mmu->direct_map)
                gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
 
        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -6049,14 +6186,7 @@ static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
                *r = EMULATE_USER_EXIT;
        } else {
-               /*
-                * "Certain debug exceptions may clear bit 0-3.  The
-                * remaining contents of the DR6 register are never
-                * cleared by the processor".
-                */
-               vcpu->arch.dr6 &= ~15;
-               vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
-               kvm_queue_exception(vcpu, DB_VECTOR);
+               kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
        }
 }
 
@@ -6995,10 +7125,22 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
                        __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
                                             X86_EFLAGS_RF);
 
-               if (vcpu->arch.exception.nr == DB_VECTOR &&
-                   (vcpu->arch.dr7 & DR7_GD)) {
-                       vcpu->arch.dr7 &= ~DR7_GD;
-                       kvm_update_dr7(vcpu);
+               if (vcpu->arch.exception.nr == DB_VECTOR) {
+                       /*
+                        * This code assumes that nSVM doesn't use
+                        * check_nested_events(). If it does, the
+                        * DR6/DR7 changes should happen before L1
+                        * gets a #VMEXIT for an intercepted #DB in
+                        * L2.  (Under VMX, on the other hand, the
+                        * DR6/DR7 changes should not happen in the
+                        * event of a VM-exit to L1 for an intercepted
+                        * #DB in L2.)
+                        */
+                       kvm_deliver_exception_payload(vcpu);
+                       if (vcpu->arch.dr7 & DR7_GD) {
+                               vcpu->arch.dr7 &= ~DR7_GD;
+                               kvm_update_dr7(vcpu);
+                       }
                }
 
                kvm_x86_ops->queue_exception(vcpu);
@@ -8478,7 +8620,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        kvm_vcpu_mtrr_init(vcpu);
        vcpu_load(vcpu);
        kvm_vcpu_reset(vcpu, false);
-       kvm_mmu_setup(vcpu);
+       kvm_init_mmu(vcpu, false);
        vcpu_put(vcpu);
        return 0;
 }
@@ -9327,7 +9469,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 {
        int r;
 
-       if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
+       if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
              work->wakeup_all)
                return;
 
@@ -9335,11 +9477,11 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
        if (unlikely(r))
                return;
 
-       if (!vcpu->arch.mmu.direct_map &&
-             work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
+       if (!vcpu->arch.mmu->direct_map &&
+             work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
                return;
 
-       vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
+       vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true);
 }
 
 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
@@ -9463,6 +9605,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
                        vcpu->arch.exception.nr = 0;
                        vcpu->arch.exception.has_error_code = false;
                        vcpu->arch.exception.error_code = 0;
+                       vcpu->arch.exception.has_payload = false;
+                       vcpu->arch.exception.payload = 0;
                } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
                        fault.vector = PF_VECTOR;
                        fault.error_code_valid = true;
index 67b9568..224cd0a 100644 (file)
@@ -266,6 +266,8 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu,
 
 int handle_ud(struct kvm_vcpu *vcpu);
 
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu);
+
 void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
 u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
index c60395b..83e6d99 100644 (file)
@@ -372,6 +372,14 @@ config S390_CCW_IOMMU
          Enables bits of IOMMU API required by VFIO. The iommu_ops
          is not implemented as it is not necessary for VFIO.
 
+config S390_AP_IOMMU
+       bool "S390 AP IOMMU Support"
+       depends on S390 && ZCRYPT
+       select IOMMU_API
+       help
+         Enables bits of IOMMU API required by VFIO. The iommu_ops
+         is not implemented as it is not necessary for VFIO.
+
 config MTK_IOMMU
        bool "MTK IOMMU Support"
        depends on ARM || ARM64
index fd5e215..6ccd93d 100644 (file)
@@ -15,3 +15,7 @@ obj-$(CONFIG_ZCRYPT) += zcrypt_cex2c.o zcrypt_cex2a.o zcrypt_cex4.o
 # pkey kernel module
 pkey-objs := pkey_api.o
 obj-$(CONFIG_PKEY) += pkey.o
+
+# adjunct processor matrix
+vfio_ap-objs := vfio_ap_drv.o vfio_ap_ops.o
+obj-$(CONFIG_VFIO_AP) += vfio_ap.o
diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c
new file mode 100644 (file)
index 0000000..7667b38
--- /dev/null
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VFIO based AP device driver
+ *
+ * Copyright IBM Corp. 2018
+ *
+ * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include "vfio_ap_private.h"
+
+#define VFIO_AP_ROOT_NAME "vfio_ap"
+#define VFIO_AP_DEV_TYPE_NAME "ap_matrix"
+#define VFIO_AP_DEV_NAME "matrix"
+
+MODULE_AUTHOR("IBM Corporation");
+MODULE_DESCRIPTION("VFIO AP device driver, Copyright IBM Corp. 2018");
+MODULE_LICENSE("GPL v2");
+
+static struct ap_driver vfio_ap_drv;
+
+static struct device_type vfio_ap_dev_type = {
+       .name = VFIO_AP_DEV_TYPE_NAME,
+};
+
+struct ap_matrix_dev *matrix_dev;
+
+/* Only type 10 adapters (CEX4 and later) are supported
+ * by the AP matrix device driver
+ */
+static struct ap_device_id ap_queue_ids[] = {
+       { .dev_type = AP_DEVICE_TYPE_CEX4,
+         .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
+       { .dev_type = AP_DEVICE_TYPE_CEX5,
+         .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
+       { .dev_type = AP_DEVICE_TYPE_CEX6,
+         .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
+       { /* end of sibling */ },
+};
+
+MODULE_DEVICE_TABLE(vfio_ap, ap_queue_ids);
+
+static int vfio_ap_queue_dev_probe(struct ap_device *apdev)
+{
+       return 0;
+}
+
+static void vfio_ap_queue_dev_remove(struct ap_device *apdev)
+{
+       /* Nothing to do yet */
+}
+
+static void vfio_ap_matrix_dev_release(struct device *dev)
+{
+       struct ap_matrix_dev *matrix_dev = dev_get_drvdata(dev);
+
+       kfree(matrix_dev);
+}
+
+static int vfio_ap_matrix_dev_create(void)
+{
+       int ret;
+       struct device *root_device;
+
+       root_device = root_device_register(VFIO_AP_ROOT_NAME);
+       if (IS_ERR(root_device))
+               return PTR_ERR(root_device);
+
+       matrix_dev = kzalloc(sizeof(*matrix_dev), GFP_KERNEL);
+       if (!matrix_dev) {
+               ret = -ENOMEM;
+               goto matrix_alloc_err;
+       }
+
+       /* Fill in config info via PQAP(QCI), if available */
+       if (test_facility(12)) {
+               ret = ap_qci(&matrix_dev->info);
+               if (ret)
+                       goto matrix_alloc_err;
+       }
+
+       mutex_init(&matrix_dev->lock);
+       INIT_LIST_HEAD(&matrix_dev->mdev_list);
+
+       matrix_dev->device.type = &vfio_ap_dev_type;
+       dev_set_name(&matrix_dev->device, "%s", VFIO_AP_DEV_NAME);
+       matrix_dev->device.parent = root_device;
+       matrix_dev->device.release = vfio_ap_matrix_dev_release;
+       matrix_dev->device.driver = &vfio_ap_drv.driver;
+
+       ret = device_register(&matrix_dev->device);
+       if (ret)
+               goto matrix_reg_err;
+
+       return 0;
+
+matrix_reg_err:
+       put_device(&matrix_dev->device);
+matrix_alloc_err:
+       root_device_unregister(root_device);
+
+       return ret;
+}
+
+static void vfio_ap_matrix_dev_destroy(void)
+{
+       device_unregister(&matrix_dev->device);
+       root_device_unregister(matrix_dev->device.parent);
+}
+
+static int __init vfio_ap_init(void)
+{
+       int ret;
+
+       /* If there are no AP instructions, there is nothing to pass through. */
+       if (!ap_instructions_available())
+               return -ENODEV;
+
+       ret = vfio_ap_matrix_dev_create();
+       if (ret)
+               return ret;
+
+       memset(&vfio_ap_drv, 0, sizeof(vfio_ap_drv));
+       vfio_ap_drv.probe = vfio_ap_queue_dev_probe;
+       vfio_ap_drv.remove = vfio_ap_queue_dev_remove;
+       vfio_ap_drv.ids = ap_queue_ids;
+
+       ret = ap_driver_register(&vfio_ap_drv, THIS_MODULE, VFIO_AP_DRV_NAME);
+       if (ret) {
+               vfio_ap_matrix_dev_destroy();
+               return ret;
+       }
+
+       ret = vfio_ap_mdev_register();
+       if (ret) {
+               ap_driver_unregister(&vfio_ap_drv);
+               vfio_ap_matrix_dev_destroy();
+
+               return ret;
+       }
+
+       return 0;
+}
+
+static void __exit vfio_ap_exit(void)
+{
+       vfio_ap_mdev_unregister();
+       ap_driver_unregister(&vfio_ap_drv);
+       vfio_ap_matrix_dev_destroy();
+}
+
+module_init(vfio_ap_init);
+module_exit(vfio_ap_exit);
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
new file mode 100644 (file)
index 0000000..272ef42
--- /dev/null
@@ -0,0 +1,939 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Adjunct processor matrix VFIO device driver callbacks.
+ *
+ * Copyright IBM Corp. 2018
+ *
+ * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
+ *           Halil Pasic <pasic@linux.ibm.com>
+ *           Pierre Morel <pmorel@linux.ibm.com>
+ */
+#include <linux/string.h>
+#include <linux/vfio.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/bitops.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <asm/kvm.h>
+#include <asm/zcrypt.h>
+
+#include "vfio_ap_private.h"
+
+#define VFIO_AP_MDEV_TYPE_HWVIRT "passthrough"
+#define VFIO_AP_MDEV_NAME_HWVIRT "VFIO AP Passthrough Device"
+
+static void vfio_ap_matrix_init(struct ap_config_info *info,
+                               struct ap_matrix *matrix)
+{
+       matrix->apm_max = info->apxa ? info->Na : 63;
+       matrix->aqm_max = info->apxa ? info->Nd : 15;
+       matrix->adm_max = info->apxa ? info->Nd : 15;
+}
+
+static int vfio_ap_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+       struct ap_matrix_mdev *matrix_mdev;
+
+       if ((atomic_dec_if_positive(&matrix_dev->available_instances) < 0))
+               return -EPERM;
+
+       matrix_mdev = kzalloc(sizeof(*matrix_mdev), GFP_KERNEL);
+       if (!matrix_mdev) {
+               atomic_inc(&matrix_dev->available_instances);
+               return -ENOMEM;
+       }
+
+       vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix);
+       mdev_set_drvdata(mdev, matrix_mdev);
+       mutex_lock(&matrix_dev->lock);
+       list_add(&matrix_mdev->node, &matrix_dev->mdev_list);
+       mutex_unlock(&matrix_dev->lock);
+
+       return 0;
+}
+
+static int vfio_ap_mdev_remove(struct mdev_device *mdev)
+{
+       struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+       if (matrix_mdev->kvm)
+               return -EBUSY;
+
+       mutex_lock(&matrix_dev->lock);
+       list_del(&matrix_mdev->node);
+       mutex_unlock(&matrix_dev->lock);
+
+       kfree(matrix_mdev);
+       mdev_set_drvdata(mdev, NULL);
+       atomic_inc(&matrix_dev->available_instances);
+
+       return 0;
+}
+
+static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+       return sprintf(buf, "%s\n", VFIO_AP_MDEV_NAME_HWVIRT);
+}
+
+static MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t available_instances_show(struct kobject *kobj,
+                                       struct device *dev, char *buf)
+{
+       return sprintf(buf, "%d\n",
+                      atomic_read(&matrix_dev->available_instances));
+}
+
+static MDEV_TYPE_ATTR_RO(available_instances);
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+                              char *buf)
+{
+       return sprintf(buf, "%s\n", VFIO_DEVICE_API_AP_STRING);
+}
+
+static MDEV_TYPE_ATTR_RO(device_api);
+
+static struct attribute *vfio_ap_mdev_type_attrs[] = {
+       &mdev_type_attr_name.attr,
+       &mdev_type_attr_device_api.attr,
+       &mdev_type_attr_available_instances.attr,
+       NULL,
+};
+
+static struct attribute_group vfio_ap_mdev_hwvirt_type_group = {
+       .name = VFIO_AP_MDEV_TYPE_HWVIRT,
+       .attrs = vfio_ap_mdev_type_attrs,
+};
+
+static struct attribute_group *vfio_ap_mdev_type_groups[] = {
+       &vfio_ap_mdev_hwvirt_type_group,
+       NULL,
+};
+
+struct vfio_ap_queue_reserved {
+       unsigned long *apid;
+       unsigned long *apqi;
+       bool reserved;
+};
+
+/**
+ * vfio_ap_has_queue
+ *
+ * @dev: an AP queue device
+ * @data: a struct vfio_ap_queue_reserved reference
+ *
+ * Flags whether the AP queue device (@dev) has a queue ID containing the APQN,
+ * apid or apqi specified in @data:
+ *
+ * - If @data contains both an apid and apqi value, then @data will be flagged
+ *   as reserved if the APID and APQI fields for the AP queue device matches
+ *
+ * - If @data contains only an apid value, @data will be flagged as
+ *   reserved if the APID field in the AP queue device matches
+ *
+ * - If @data contains only an apqi value, @data will be flagged as
+ *   reserved if the APQI field in the AP queue device matches
+ *
+ * Returns 0 to indicate the input to the function succeeded. Returns -EINVAL if
+ * @data does not contain either an apid or apqi.
+ */
+static int vfio_ap_has_queue(struct device *dev, void *data)
+{
+       struct vfio_ap_queue_reserved *qres = data;
+       struct ap_queue *ap_queue = to_ap_queue(dev);
+       ap_qid_t qid;
+       unsigned long id;
+
+       if (qres->apid && qres->apqi) {
+               qid = AP_MKQID(*qres->apid, *qres->apqi);
+               if (qid == ap_queue->qid)
+                       qres->reserved = true;
+       } else if (qres->apid && !qres->apqi) {
+               id = AP_QID_CARD(ap_queue->qid);
+               if (id == *qres->apid)
+                       qres->reserved = true;
+       } else if (!qres->apid && qres->apqi) {
+               id = AP_QID_QUEUE(ap_queue->qid);
+               if (id == *qres->apqi)
+                       qres->reserved = true;
+       } else {
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/**
+ * vfio_ap_verify_queue_reserved
+ *
+ * @apid: an AP adapter ID
+ * @apqi: an AP queue index
+ *
+ * Verifies that the AP queue with @apid/@apqi is reserved by the VFIO AP device
+ * driver according to the following rules:
+ *
+ * - If both @apid and @apqi are not NULL, then there must be an AP queue
+ *   device bound to the vfio_ap driver with the APQN identified by @apid and
+ *   @apqi
+ *
+ * - If only @apid is not NULL, then there must be an AP queue device bound
+ *   to the vfio_ap driver with an APQN containing @apid
+ *
+ * - If only @apqi is not NULL, then there must be an AP queue device bound
+ *   to the vfio_ap driver with an APQN containing @apqi
+ *
+ * Returns 0 if the AP queue is reserved; otherwise, returns -EADDRNOTAVAIL.
+ */
+static int vfio_ap_verify_queue_reserved(unsigned long *apid,
+                                        unsigned long *apqi)
+{
+       int ret;
+       struct vfio_ap_queue_reserved qres;
+
+       qres.apid = apid;
+       qres.apqi = apqi;
+       qres.reserved = false;
+
+       ret = driver_for_each_device(matrix_dev->device.driver, NULL, &qres,
+                                    vfio_ap_has_queue);
+       if (ret)
+               return ret;
+
+       if (qres.reserved)
+               return 0;
+
+       return -EADDRNOTAVAIL;
+}
+
+static int
+vfio_ap_mdev_verify_queues_reserved_for_apid(struct ap_matrix_mdev *matrix_mdev,
+                                            unsigned long apid)
+{
+       int ret;
+       unsigned long apqi;
+       unsigned long nbits = matrix_mdev->matrix.aqm_max + 1;
+
+       if (find_first_bit_inv(matrix_mdev->matrix.aqm, nbits) >= nbits)
+               return vfio_ap_verify_queue_reserved(&apid, NULL);
+
+       for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm, nbits) {
+               ret = vfio_ap_verify_queue_reserved(&apid, &apqi);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/**
+ * vfio_ap_mdev_verify_no_sharing
+ *
+ * Verifies that the APQNs derived from the cross product of the AP adapter IDs
+ * and AP queue indexes comprising the AP matrix are not configured for another
+ * mediated device. AP queue sharing is not allowed.
+ *
+ * @matrix_mdev: the mediated matrix device
+ *
+ * Returns 0 if the APQNs are not shared; otherwise, returns -EADDRINUSE.
+ */
+static int vfio_ap_mdev_verify_no_sharing(struct ap_matrix_mdev *matrix_mdev)
+{
+       struct ap_matrix_mdev *lstdev;
+       DECLARE_BITMAP(apm, AP_DEVICES);
+       DECLARE_BITMAP(aqm, AP_DOMAINS);
+
+       list_for_each_entry(lstdev, &matrix_dev->mdev_list, node) {
+               if (matrix_mdev == lstdev)
+                       continue;
+
+               memset(apm, 0, sizeof(apm));
+               memset(aqm, 0, sizeof(aqm));
+
+               /*
+                * We work on full longs, as we can only exclude the leftover
+                * bits in non-inverse order. The leftover is all zeros.
+                */
+               if (!bitmap_and(apm, matrix_mdev->matrix.apm,
+                               lstdev->matrix.apm, AP_DEVICES))
+                       continue;
+
+               if (!bitmap_and(aqm, matrix_mdev->matrix.aqm,
+                               lstdev->matrix.aqm, AP_DOMAINS))
+                       continue;
+
+               return -EADDRINUSE;
+       }
+
+       return 0;
+}
+
+/**
+ * assign_adapter_store
+ *
+ * @dev:       the matrix device
+ * @attr:      the mediated matrix device's assign_adapter attribute
+ * @buf:       a buffer containing the AP adapter number (APID) to
+ *             be assigned
+ * @count:     the number of bytes in @buf
+ *
+ * Parses the APID from @buf and sets the corresponding bit in the mediated
+ * matrix device's APM.
+ *
+ * Returns the number of bytes processed if the APID is valid; otherwise,
+ * returns one of the following errors:
+ *
+ *     1. -EINVAL
+ *        The APID is not a valid number
+ *
+ *     2. -ENODEV
+ *        The APID exceeds the maximum value configured for the system
+ *
+ *     3. -EADDRNOTAVAIL
+ *        An APQN derived from the cross product of the APID being assigned
+ *        and the APQIs previously assigned is not bound to the vfio_ap device
+ *        driver; or, if no APQIs have yet been assigned, the APID is not
+ *        contained in an APQN bound to the vfio_ap device driver.
+ *
+ *     4. -EADDRINUSE
+ *        An APQN derived from the cross product of the APID being assigned
+ *        and the APQIs previously assigned is being used by another mediated
+ *        matrix device
+ */
+static ssize_t assign_adapter_store(struct device *dev,
+                                   struct device_attribute *attr,
+                                   const char *buf, size_t count)
+{
+       int ret;
+       unsigned long apid;
+       struct mdev_device *mdev = mdev_from_dev(dev);
+       struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+       /* If the guest is running, disallow assignment of adapter */
+       if (matrix_mdev->kvm)
+               return -EBUSY;
+
+       ret = kstrtoul(buf, 0, &apid);
+       if (ret)
+               return ret;
+
+       if (apid > matrix_mdev->matrix.apm_max)
+               return -ENODEV;
+
+       /*
+        * Set the bit in the AP mask (APM) corresponding to the AP adapter
+        * number (APID). The bits in the mask, from most significant to least
+        * significant bit, correspond to APIDs 0-255.
+        */
+       mutex_lock(&matrix_dev->lock);
+
+       ret = vfio_ap_mdev_verify_queues_reserved_for_apid(matrix_mdev, apid);
+       if (ret)
+               goto done;
+
+       set_bit_inv(apid, matrix_mdev->matrix.apm);
+
+       ret = vfio_ap_mdev_verify_no_sharing(matrix_mdev);
+       if (ret)
+               goto share_err;
+
+       ret = count;
+       goto done;
+
+share_err:
+       clear_bit_inv(apid, matrix_mdev->matrix.apm);
+done:
+       mutex_unlock(&matrix_dev->lock);
+
+       return ret;
+}
+static DEVICE_ATTR_WO(assign_adapter);
+
+/**
+ * unassign_adapter_store
+ *
+ * @dev:       the matrix device
+ * @attr:      the mediated matrix device's unassign_adapter attribute
+ * @buf:       a buffer containing the adapter number (APID) to be unassigned
+ * @count:     the number of bytes in @buf
+ *
+ * Parses the APID from @buf and clears the corresponding bit in the mediated
+ * matrix device's APM.
+ *
+ * Returns the number of bytes processed if the APID is valid; otherwise,
+ * returns one of the following errors:
+ *     -EINVAL if the APID is not a number
+ *	-ENODEV if the APID exceeds the maximum value configured for the
+ *             system
+ */
+static ssize_t unassign_adapter_store(struct device *dev,
+                                     struct device_attribute *attr,
+                                     const char *buf, size_t count)
+{
+       int ret;
+       unsigned long apid;
+       struct mdev_device *mdev = mdev_from_dev(dev);
+       struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+       /* If the guest is running, disallow un-assignment of adapter */
+       if (matrix_mdev->kvm)
+               return -EBUSY;
+
+       ret = kstrtoul(buf, 0, &apid);
+       if (ret)
+               return ret;
+
+       if (apid > matrix_mdev->matrix.apm_max)
+               return -ENODEV;
+
+       mutex_lock(&matrix_dev->lock);
+       clear_bit_inv((unsigned long)apid, matrix_mdev->matrix.apm);
+       mutex_unlock(&matrix_dev->lock);
+
+       return count;
+}
+static DEVICE_ATTR_WO(unassign_adapter);
+
+static int
+vfio_ap_mdev_verify_queues_reserved_for_apqi(struct ap_matrix_mdev *matrix_mdev,
+                                            unsigned long apqi)
+{
+       int ret;
+       unsigned long apid;
+       unsigned long nbits = matrix_mdev->matrix.apm_max + 1;
+
+       if (find_first_bit_inv(matrix_mdev->matrix.apm, nbits) >= nbits)
+               return vfio_ap_verify_queue_reserved(NULL, &apqi);
+
+       for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, nbits) {
+               ret = vfio_ap_verify_queue_reserved(&apid, &apqi);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/**
+ * assign_domain_store
+ *
+ * @dev:       the matrix device
+ * @attr:      the mediated matrix device's assign_domain attribute
+ * @buf:       a buffer containing the AP queue index (APQI) of the domain to
+ *             be assigned
+ * @count:     the number of bytes in @buf
+ *
+ * Parses the APQI from @buf and sets the corresponding bit in the mediated
+ * matrix device's AQM.
+ *
+ * Returns the number of bytes processed if the APQI is valid; otherwise, returns
+ * one of the following errors:
+ *
+ *     1. -EINVAL
+ *        The APQI is not a valid number
+ *
+ *     2. -ENODEV
+ *        The APQI exceeds the maximum value configured for the system
+ *
+ *     3. -EADDRNOTAVAIL
+ *        An APQN derived from the cross product of the APQI being assigned
+ *        and the APIDs previously assigned is not bound to the vfio_ap device
+ *        driver; or, if no APIDs have yet been assigned, the APQI is not
+ *        contained in an APQN bound to the vfio_ap device driver.
+ *
+ *     4. -EADDRINUSE
+ *        An APQN derived from the cross product of the APQI being assigned
+ *        and the APIDs previously assigned is being used by another mediated
+ *        matrix device
+ */
+static ssize_t assign_domain_store(struct device *dev,
+                                  struct device_attribute *attr,
+                                  const char *buf, size_t count)
+{
+       int ret;
+       unsigned long apqi;
+       struct mdev_device *mdev = mdev_from_dev(dev);
+       struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+       unsigned long max_apqi = matrix_mdev->matrix.aqm_max;
+
+       /* If the guest is running, disallow assignment of domain */
+       if (matrix_mdev->kvm)
+               return -EBUSY;
+
+       ret = kstrtoul(buf, 0, &apqi);
+       if (ret)
+               return ret;
+       if (apqi > max_apqi)
+               return -ENODEV;
+
+       mutex_lock(&matrix_dev->lock);
+
+       ret = vfio_ap_mdev_verify_queues_reserved_for_apqi(matrix_mdev, apqi);
+       if (ret)
+               goto done;
+
+       set_bit_inv(apqi, matrix_mdev->matrix.aqm);
+
+       ret = vfio_ap_mdev_verify_no_sharing(matrix_mdev);
+       if (ret)
+               goto share_err;
+
+       ret = count;
+       goto done;
+
+share_err:
+       clear_bit_inv(apqi, matrix_mdev->matrix.aqm);
+done:
+       mutex_unlock(&matrix_dev->lock);
+
+       return ret;
+}
+static DEVICE_ATTR_WO(assign_domain);
+
+
+/**
+ * unassign_domain_store
+ *
+ * @dev:       the matrix device
+ * @attr:      the mediated matrix device's unassign_domain attribute
+ * @buf:       a buffer containing the AP queue index (APQI) of the domain to
+ *             be unassigned
+ * @count:     the number of bytes in @buf
+ *
+ * Parses the APQI from @buf and clears the corresponding bit in the
+ * mediated matrix device's AQM.
+ *
+ * Returns the number of bytes processed if the APQI is valid; otherwise,
+ * returns one of the following errors:
+ *     -EINVAL if the APQI is not a number
+ *     -ENODEV if the APQI exceeds the maximum value configured for the system
+ */
+static ssize_t unassign_domain_store(struct device *dev,
+                                    struct device_attribute *attr,
+                                    const char *buf, size_t count)
+{
+       int ret;
+       unsigned long apqi;
+       struct mdev_device *mdev = mdev_from_dev(dev);
+       struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+       /* If the guest is running, disallow un-assignment of domain */
+       if (matrix_mdev->kvm)
+               return -EBUSY;
+
+       ret = kstrtoul(buf, 0, &apqi);
+       if (ret)
+               return ret;
+
+       if (apqi > matrix_mdev->matrix.aqm_max)
+               return -ENODEV;
+
+       mutex_lock(&matrix_dev->lock);
+       clear_bit_inv((unsigned long)apqi, matrix_mdev->matrix.aqm);
+       mutex_unlock(&matrix_dev->lock);
+
+       return count;
+}
+static DEVICE_ATTR_WO(unassign_domain);
+
+/**
+ * assign_control_domain_store
+ *
+ * @dev:       the matrix device
+ * @attr:      the mediated matrix device's assign_control_domain attribute
+ * @buf:       a buffer containing the domain ID to be assigned
+ * @count:     the number of bytes in @buf
+ *
+ * Parses the domain ID from @buf and sets the corresponding bit in the mediated
+ * matrix device's ADM.
+ *
+ * Returns the number of bytes processed if the domain ID is valid; otherwise,
+ * returns one of the following errors:
+ *     -EINVAL if the ID is not a number
+ *     -ENODEV if the ID exceeds the maximum value configured for the system
+ */
+static ssize_t assign_control_domain_store(struct device *dev,
+                                          struct device_attribute *attr,
+                                          const char *buf, size_t count)
+{
+       int ret;
+       unsigned long id;
+       struct mdev_device *mdev = mdev_from_dev(dev);
+       struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+       /* If the guest is running, disallow assignment of control domain */
+       if (matrix_mdev->kvm)
+               return -EBUSY;
+
+       ret = kstrtoul(buf, 0, &id);
+       if (ret)
+               return ret;
+
+       if (id > matrix_mdev->matrix.adm_max)
+               return -ENODEV;
+
+       /* Set the bit in the ADM (bitmask) corresponding to the AP control
+        * domain number (id). The bits in the mask, from most significant to
+	 * least significant, correspond to IDs 0 up to one less than the
+        * number of control domains that can be assigned.
+        */
+       mutex_lock(&matrix_dev->lock);
+       set_bit_inv(id, matrix_mdev->matrix.adm);
+       mutex_unlock(&matrix_dev->lock);
+
+       return count;
+}
+static DEVICE_ATTR_WO(assign_control_domain);
+
+/**
+ * unassign_control_domain_store
+ *
+ * @dev:       the matrix device
+ * @attr:      the mediated matrix device's unassign_control_domain attribute
+ * @buf:       a buffer containing the domain ID to be unassigned
+ * @count:     the number of bytes in @buf
+ *
+ * Parses the domain ID from @buf and clears the corresponding bit in the
+ * mediated matrix device's ADM.
+ *
+ * Returns the number of bytes processed if the domain ID is valid; otherwise,
+ * returns one of the following errors:
+ *     -EINVAL if the ID is not a number
+ *     -ENODEV if the ID exceeds the maximum value configured for the system
+ */
+static ssize_t unassign_control_domain_store(struct device *dev,
+                                            struct device_attribute *attr,
+                                            const char *buf, size_t count)
+{
+       int ret;
+       unsigned long domid;
+       struct mdev_device *mdev = mdev_from_dev(dev);
+       struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+       unsigned long max_domid =  matrix_mdev->matrix.adm_max;
+
+       /* If the guest is running, disallow un-assignment of control domain */
+       if (matrix_mdev->kvm)
+               return -EBUSY;
+
+       ret = kstrtoul(buf, 0, &domid);
+       if (ret)
+               return ret;
+       if (domid > max_domid)
+               return -ENODEV;
+
+       mutex_lock(&matrix_dev->lock);
+       clear_bit_inv(domid, matrix_mdev->matrix.adm);
+       mutex_unlock(&matrix_dev->lock);
+
+       return count;
+}
+static DEVICE_ATTR_WO(unassign_control_domain);
+
+static ssize_t control_domains_show(struct device *dev,
+                                   struct device_attribute *dev_attr,
+                                   char *buf)
+{
+       unsigned long id;
+       int nchars = 0;
+       int n;
+       char *bufpos = buf;
+       struct mdev_device *mdev = mdev_from_dev(dev);
+       struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+       unsigned long max_domid = matrix_mdev->matrix.adm_max;
+
+       mutex_lock(&matrix_dev->lock);
+       for_each_set_bit_inv(id, matrix_mdev->matrix.adm, max_domid + 1) {
+               n = sprintf(bufpos, "%04lx\n", id);
+               bufpos += n;
+               nchars += n;
+       }
+       mutex_unlock(&matrix_dev->lock);
+
+       return nchars;
+}
+static DEVICE_ATTR_RO(control_domains);
+
+static ssize_t matrix_show(struct device *dev, struct device_attribute *attr,
+                          char *buf)
+{
+       struct mdev_device *mdev = mdev_from_dev(dev);
+       struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+       char *bufpos = buf;
+       unsigned long apid;
+       unsigned long apqi;
+       unsigned long apid1;
+       unsigned long apqi1;
+       unsigned long napm_bits = matrix_mdev->matrix.apm_max + 1;
+       unsigned long naqm_bits = matrix_mdev->matrix.aqm_max + 1;
+       int nchars = 0;
+       int n;
+
+       apid1 = find_first_bit_inv(matrix_mdev->matrix.apm, napm_bits);
+       apqi1 = find_first_bit_inv(matrix_mdev->matrix.aqm, naqm_bits);
+
+       mutex_lock(&matrix_dev->lock);
+
+       if ((apid1 < napm_bits) && (apqi1 < naqm_bits)) {
+               for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, napm_bits) {
+                       for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm,
+                                            naqm_bits) {
+                               n = sprintf(bufpos, "%02lx.%04lx\n", apid,
+                                           apqi);
+                               bufpos += n;
+                               nchars += n;
+                       }
+               }
+       } else if (apid1 < napm_bits) {
+               for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, napm_bits) {
+                       n = sprintf(bufpos, "%02lx.\n", apid);
+                       bufpos += n;
+                       nchars += n;
+               }
+       } else if (apqi1 < naqm_bits) {
+               for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm, naqm_bits) {
+                       n = sprintf(bufpos, ".%04lx\n", apqi);
+                       bufpos += n;
+                       nchars += n;
+               }
+       }
+
+       mutex_unlock(&matrix_dev->lock);
+
+       return nchars;
+}
+static DEVICE_ATTR_RO(matrix);
+
+static struct attribute *vfio_ap_mdev_attrs[] = {
+       &dev_attr_assign_adapter.attr,
+       &dev_attr_unassign_adapter.attr,
+       &dev_attr_assign_domain.attr,
+       &dev_attr_unassign_domain.attr,
+       &dev_attr_assign_control_domain.attr,
+       &dev_attr_unassign_control_domain.attr,
+       &dev_attr_control_domains.attr,
+       &dev_attr_matrix.attr,
+       NULL,
+};
+
+static struct attribute_group vfio_ap_mdev_attr_group = {
+       .attrs = vfio_ap_mdev_attrs
+};
+
+static const struct attribute_group *vfio_ap_mdev_attr_groups[] = {
+       &vfio_ap_mdev_attr_group,
+       NULL
+};
+
+/**
+ * vfio_ap_mdev_set_kvm
+ *
+ * @matrix_mdev: a mediated matrix device
+ * @kvm: reference to KVM instance
+ *
+ * Verifies no other mediated matrix device has @kvm and sets a reference to
+ * it in @matrix_mdev->kvm.
+ *
+ * Returns 0 if no other mediated matrix device has a reference to @kvm;
+ * otherwise, returns -EPERM.
+ */
+static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
+                               struct kvm *kvm)
+{
+       struct ap_matrix_mdev *m;
+
+       mutex_lock(&matrix_dev->lock);
+
+       list_for_each_entry(m, &matrix_dev->mdev_list, node) {
+               if ((m != matrix_mdev) && (m->kvm == kvm)) {
+                       mutex_unlock(&matrix_dev->lock);
+                       return -EPERM;
+               }
+       }
+
+       matrix_mdev->kvm = kvm;
+       mutex_unlock(&matrix_dev->lock);
+
+       return 0;
+}
+
+static int vfio_ap_mdev_group_notifier(struct notifier_block *nb,
+                                      unsigned long action, void *data)
+{
+       int ret;
+       struct ap_matrix_mdev *matrix_mdev;
+
+       if (action != VFIO_GROUP_NOTIFY_SET_KVM)
+               return NOTIFY_OK;
+
+       matrix_mdev = container_of(nb, struct ap_matrix_mdev, group_notifier);
+
+       if (!data) {
+               matrix_mdev->kvm = NULL;
+               return NOTIFY_OK;
+       }
+
+       ret = vfio_ap_mdev_set_kvm(matrix_mdev, data);
+       if (ret)
+               return NOTIFY_DONE;
+
+       /* If there is no CRYCB pointer, then we can't copy the masks */
+       if (!matrix_mdev->kvm->arch.crypto.crycbd)
+               return NOTIFY_DONE;
+
+       kvm_arch_crypto_set_masks(matrix_mdev->kvm, matrix_mdev->matrix.apm,
+                                 matrix_mdev->matrix.aqm,
+                                 matrix_mdev->matrix.adm);
+
+       return NOTIFY_OK;
+}
+
+static int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi,
+                                   unsigned int retry)
+{
+       struct ap_queue_status status;
+
+       do {
+               status = ap_zapq(AP_MKQID(apid, apqi));
+               switch (status.response_code) {
+               case AP_RESPONSE_NORMAL:
+                       return 0;
+               case AP_RESPONSE_RESET_IN_PROGRESS:
+               case AP_RESPONSE_BUSY:
+                       msleep(20);
+                       break;
+               default:
+                       /* things are really broken, give up */
+                       return -EIO;
+               }
+       } while (retry--);
+
+       return -EBUSY;
+}
+
+static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev)
+{
+       int ret;
+       int rc = 0;
+       unsigned long apid, apqi;
+       struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+       for_each_set_bit_inv(apid, matrix_mdev->matrix.apm,
+                            matrix_mdev->matrix.apm_max + 1) {
+               for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm,
+                                    matrix_mdev->matrix.aqm_max + 1) {
+                       ret = vfio_ap_mdev_reset_queue(apid, apqi, 1);
+                       /*
+			 * Regardless of whether a queue turns out to be busy, or
+                        * is not operational, we need to continue resetting
+                        * the remaining queues.
+                        */
+                       if (ret)
+                               rc = ret;
+               }
+       }
+
+       return rc;
+}
+
+static int vfio_ap_mdev_open(struct mdev_device *mdev)
+{
+       struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+       unsigned long events;
+       int ret;
+
+
+       if (!try_module_get(THIS_MODULE))
+               return -ENODEV;
+
+       matrix_mdev->group_notifier.notifier_call = vfio_ap_mdev_group_notifier;
+       events = VFIO_GROUP_NOTIFY_SET_KVM;
+
+       ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
+                                    &events, &matrix_mdev->group_notifier);
+       if (ret) {
+               module_put(THIS_MODULE);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void vfio_ap_mdev_release(struct mdev_device *mdev)
+{
+       struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
+
+       if (matrix_mdev->kvm)
+               kvm_arch_crypto_clear_masks(matrix_mdev->kvm);
+
+       vfio_ap_mdev_reset_queues(mdev);
+       vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
+                                &matrix_mdev->group_notifier);
+       matrix_mdev->kvm = NULL;
+       module_put(THIS_MODULE);
+}
+
+static int vfio_ap_mdev_get_device_info(unsigned long arg)
+{
+       unsigned long minsz;
+       struct vfio_device_info info;
+
+       minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+       if (copy_from_user(&info, (void __user *)arg, minsz))
+               return -EFAULT;
+
+       if (info.argsz < minsz)
+               return -EINVAL;
+
+       info.flags = VFIO_DEVICE_FLAGS_AP | VFIO_DEVICE_FLAGS_RESET;
+       info.num_regions = 0;
+       info.num_irqs = 0;
+
+       return copy_to_user((void __user *)arg, &info, minsz);
+}
+
+static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev,
+                                   unsigned int cmd, unsigned long arg)
+{
+       int ret;
+
+       switch (cmd) {
+       case VFIO_DEVICE_GET_INFO:
+               ret = vfio_ap_mdev_get_device_info(arg);
+               break;
+       case VFIO_DEVICE_RESET:
+               ret = vfio_ap_mdev_reset_queues(mdev);
+               break;
+       default:
+               ret = -EOPNOTSUPP;
+               break;
+       }
+
+       return ret;
+}
+
+static const struct mdev_parent_ops vfio_ap_matrix_ops = {
+       .owner                  = THIS_MODULE,
+       .supported_type_groups  = vfio_ap_mdev_type_groups,
+       .mdev_attr_groups       = vfio_ap_mdev_attr_groups,
+       .create                 = vfio_ap_mdev_create,
+       .remove                 = vfio_ap_mdev_remove,
+       .open                   = vfio_ap_mdev_open,
+       .release                = vfio_ap_mdev_release,
+       .ioctl                  = vfio_ap_mdev_ioctl,
+};
+
+int vfio_ap_mdev_register(void)
+{
+       atomic_set(&matrix_dev->available_instances, MAX_ZDEV_ENTRIES_EXT);
+
+       return mdev_register_device(&matrix_dev->device, &vfio_ap_matrix_ops);
+}
+
+void vfio_ap_mdev_unregister(void)
+{
+       mdev_unregister_device(&matrix_dev->device);
+}
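For context, the mdev attributes above are driven from userspace with plain sysfs writes; a non-authoritative sketch follows (the paths and the <uuid> placeholder are illustrative and depend on how the mediated device was created):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Sketch only: write one value into a vfio_ap mdev sysfs attribute
 * such as assign_adapter, assign_domain or assign_control_domain.
 */
static int vfio_ap_sysfs_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

/*
 * Illustrative usage, e.g. assign adapter 5 and usage domain 4:
 *
 *   vfio_ap_sysfs_write("/sys/devices/vfio_ap/matrix/<uuid>/assign_adapter", "5");
 *   vfio_ap_sysfs_write("/sys/devices/vfio_ap/matrix/<uuid>/assign_domain", "4");
 */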
diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
new file mode 100644 (file)
index 0000000..5675492
--- /dev/null
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Private data and functions for adjunct processor VFIO matrix driver.
+ *
+ * Author(s): Tony Krowiak <akrowiak@linux.ibm.com>
+ *           Halil Pasic <pasic@linux.ibm.com>
+ *
+ * Copyright IBM Corp. 2018
+ */
+
+#ifndef _VFIO_AP_PRIVATE_H_
+#define _VFIO_AP_PRIVATE_H_
+
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/mdev.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+
+#include "ap_bus.h"
+
+#define VFIO_AP_MODULE_NAME "vfio_ap"
+#define VFIO_AP_DRV_NAME "vfio_ap"
+
+/**
+ * ap_matrix_dev - the AP matrix device structure
+ * @device:    generic device structure associated with the AP matrix device
+ * @available_instances: number of mediated matrix devices that can be created
+ * @info:      the struct containing the output from the PQAP(QCI) instruction
+ * @mdev_list: the list of mediated matrix devices created
+ * @lock:      mutex for locking the AP matrix device. This lock will be
+ *             taken every time we fiddle with state managed by the vfio_ap
+ *             driver, be it using @mdev_list or writing the state of a
+ *             single ap_matrix_mdev device. It's quite coarse but we don't
+ *             expect much contention.
+ */
+struct ap_matrix_dev {
+       struct device device;
+       atomic_t available_instances;
+       struct ap_config_info info;
+       struct list_head mdev_list;
+       struct mutex lock;
+};
+
+extern struct ap_matrix_dev *matrix_dev;
+
+/**
+ * The AP matrix is comprised of three bit masks identifying the adapters,
+ * queues (domains) and control domains that belong to an AP matrix. The bits in
+ * each mask, from least significant to most significant bit, correspond to IDs
+ * 0 to 255. When a bit is set, the corresponding ID belongs to the matrix.
+ *
+ * @apm_max: max adapter number in @apm
+ * @apm: identifies the AP adapters in the matrix
+ * @aqm_max: max domain number in @aqm
+ * @aqm: identifies the AP queues (domains) in the matrix
+ * @adm_max: max domain number in @adm
+ * @adm: identifies the AP control domains in the matrix
+ */
+struct ap_matrix {
+       unsigned long apm_max;
+       DECLARE_BITMAP(apm, 256);
+       unsigned long aqm_max;
+       DECLARE_BITMAP(aqm, 256);
+       unsigned long adm_max;
+       DECLARE_BITMAP(adm, 256);
+};
+
+/**
+ * struct ap_matrix_mdev - the mediated matrix device structure
+ * @node:      allows the ap_matrix_mdev struct to be added to a list
+ * @matrix:    the adapters, usage domains and control domains assigned to the
+ *             mediated matrix device.
+ * @group_notifier: notifier block used for specifying callback function for
+ *                 handling the VFIO_GROUP_NOTIFY_SET_KVM event
+ * @kvm:       the struct holding guest's state
+ */
+struct ap_matrix_mdev {
+       struct list_head node;
+       struct ap_matrix matrix;
+       struct notifier_block group_notifier;
+       struct kvm *kvm;
+};
+
+extern int vfio_ap_mdev_register(void);
+extern void vfio_ap_mdev_unregister(void);
+
+#endif /* _VFIO_AP_PRIVATE_H_ */
index 96721b1..b30926e 100644 (file)
@@ -444,7 +444,7 @@ static void tce_iommu_unuse_page_v2(struct tce_container *container,
        struct mm_iommu_table_group_mem_t *mem = NULL;
        int ret;
        unsigned long hpa = 0;
-       __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+       __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 
        if (!pua)
                return;
@@ -467,8 +467,27 @@ static int tce_iommu_clear(struct tce_container *container,
        unsigned long oldhpa;
        long ret;
        enum dma_data_direction direction;
+       unsigned long lastentry = entry + pages;
+
+       for ( ; entry < lastentry; ++entry) {
+               if (tbl->it_indirect_levels && tbl->it_userspace) {
+                       /*
+                        * For multilevel tables, we can take a shortcut here
+                        * and skip some TCEs as we know that the userspace
+			 * address cache is a mirror of the real TCE table
+                        * and if it is missing some indirect levels, then
+                        * the hardware table does not have them allocated
+                        * either and therefore does not require updating.
+                        */
+                       __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
+                                       entry);
+                       if (!pua) {
+                               /* align to level_size which is power of two */
+                               entry |= tbl->it_level_size - 1;
+                               continue;
+                       }
+               }
 
-       for ( ; pages; --pages, ++entry) {
                cond_resched();
 
                direction = DMA_NONE;
index c2a7b86..071b4cb 100644 (file)
 #define GITS_CBASER_RaWaWt     GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt)
 #define GITS_CBASER_RaWaWb     GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb)
 
+#define GITS_CBASER_ADDRESS(cbaser)    ((cbaser) & GENMASK_ULL(51, 12))
+
 #define GITS_BASER_NR_REGS             8
 
 #define GITS_BASER_VALID                       (1ULL << 63)
 #define GITS_BASER_ENTRY_SIZE_MASK     GENMASK_ULL(52, 48)
 #define GITS_BASER_PHYS_52_to_48(phys)                                 \
        (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12)
+#define GITS_BASER_ADDR_48_to_52(baser)                                        \
+       (((baser) & GENMASK_ULL(47, 16)) | (((baser) >> 12) & 0xf) << 48)
+
 #define GITS_BASER_SHAREABILITY_SHIFT  (10)
 #define GITS_BASER_InnerShareable                                      \
        GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)
index 251be35..2b7a652 100644 (file)
@@ -420,13 +420,19 @@ struct kvm_run {
 struct kvm_coalesced_mmio_zone {
        __u64 addr;
        __u32 size;
-       __u32 pad;
+       union {
+               __u32 pad;
+               __u32 pio;
+       };
 };
 
 struct kvm_coalesced_mmio {
        __u64 phys_addr;
        __u32 len;
-       __u32 pad;
+       union {
+               __u32 pad;
+               __u32 pio;
+       };
        __u8  data[8];
 };
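The new pio union member rides on the existing coalesced-MMIO registration path; a hedged sketch, assuming KVM_CAP_COALESCED_PIO is reported and vm_fd is an existing VM file descriptor (coalesce_com2 and the port range are illustrative):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: coalesce guest writes to I/O ports 0x2f8-0x2ff. */
static int coalesce_com2(int vm_fd)
{
	struct kvm_coalesced_mmio_zone zone;

	memset(&zone, 0, sizeof(zone));
	zone.addr = 0x2f8;	/* hypothetical port range */
	zone.size = 8;
	zone.pio  = 1;		/* new union member above */

	return ioctl(vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);
}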
 
@@ -719,6 +725,7 @@ struct kvm_ppc_one_seg_page_size {
 
 #define KVM_PPC_PAGE_SIZES_REAL                0x00000001
 #define KVM_PPC_1T_SEGMENTS            0x00000002
+#define KVM_PPC_NO_HASH                        0x00000004
 
 struct kvm_ppc_smmu_info {
        __u64 flags;
@@ -751,6 +758,15 @@ struct kvm_ppc_resize_hpt {
 #define KVM_S390_SIE_PAGE_OFFSET 1
 
 /*
+ * On arm64, machine type can be used to request the physical
+ * address size for the VM. Bits[7-0] are reserved for the guest
+ * PA size shift (i.e., log2(PA_Size)). For backward compatibility,
+ * value 0 implies the default IPA size, 40bits.
+ */
+#define KVM_VM_TYPE_ARM_IPA_SIZE_MASK  0xffULL
+#define KVM_VM_TYPE_ARM_IPA_SIZE(x)            \
+       ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+/*
  * ioctls for /dev/kvm fds:
  */
 #define KVM_GET_API_VERSION       _IO(KVMIO,   0x00)
@@ -953,6 +969,12 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_NESTED_STATE 157
 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158
 #define KVM_CAP_MSR_PLATFORM_INFO 159
+#define KVM_CAP_PPC_NESTED_HV 160
+#define KVM_CAP_HYPERV_SEND_IPI 161
+#define KVM_CAP_COALESCED_PIO 162
+#define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163
+#define KVM_CAP_EXCEPTION_PAYLOAD 164
+#define KVM_CAP_ARM_VM_IPA_SIZE 165
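KVM_CAP_ARM_VM_IPA_SIZE pairs with the KVM_VM_TYPE_ARM_IPA_SIZE() machine-type encoding above; a hedged sketch of requesting a larger guest IPA space (kvm_fd is assumed to be an open /dev/kvm descriptor, and the value should first be checked against what the capability reports):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: create an arm64 VM with a 48-bit guest IPA space,
 * assuming KVM_CHECK_EXTENSION(KVM_CAP_ARM_VM_IPA_SIZE) returned >= 48.
 */
static int create_vm_ipa48(int kvm_fd)
{
	return ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48));
}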
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index 1aa7b82..f378b98 100644 (file)
@@ -200,6 +200,7 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)    /* vfio-platform device */
 #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)       /* vfio-amba device */
 #define VFIO_DEVICE_FLAGS_CCW  (1 << 4)        /* vfio-ccw device */
+#define VFIO_DEVICE_FLAGS_AP   (1 << 5)        /* vfio-ap device */
        __u32   num_regions;    /* Max region index + 1 */
        __u32   num_irqs;       /* Max IRQ index + 1 */
 };
@@ -215,6 +216,7 @@ struct vfio_device_info {
 #define VFIO_DEVICE_API_PLATFORM_STRING                "vfio-platform"
 #define VFIO_DEVICE_API_AMBA_STRING            "vfio-amba"
 #define VFIO_DEVICE_API_CCW_STRING             "vfio-ccw"
+#define VFIO_DEVICE_API_AP_STRING              "vfio-ap"
 
 /**
  * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
index fd23d57..8a6eff9 100644 (file)
@@ -288,6 +288,7 @@ struct kvm_reinject_control {
 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR        0x00000002
 #define KVM_VCPUEVENT_VALID_SHADOW     0x00000004
 #define KVM_VCPUEVENT_VALID_SMM                0x00000008
+#define KVM_VCPUEVENT_VALID_PAYLOAD    0x00000010
 
 /* Interrupt shadow states */
 #define KVM_X86_SHADOW_INT_MOV_SS      0x01
@@ -299,7 +300,10 @@ struct kvm_vcpu_events {
                __u8 injected;
                __u8 nr;
                __u8 has_error_code;
-               __u8 pad;
+               union {
+                       __u8 pad;
+                       __u8 pending;
+               };
                __u32 error_code;
        } exception;
        struct {
@@ -322,7 +326,9 @@ struct kvm_vcpu_events {
                __u8 smm_inside_nmi;
                __u8 latched_init;
        } smi;
-       __u32 reserved[9];
+       __u8 reserved[27];
+       __u8 exception_has_payload;
+       __u64 exception_payload;
 };
 
 /* for KVM_GET/SET_DEBUGREGS */
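With the payload fields in place, userspace can hand a still-pending exception and its payload back to KVM; a hedged sketch (vcpu_fd and the fault address are illustrative, and KVM_CAP_EXCEPTION_PAYLOAD must already be enabled on the VM):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: restore a pending #PF whose CR2 payload has not been
 * delivered yet.
 */
static int set_pending_pf(int vcpu_fd, __u64 fault_addr)
{
	struct kvm_vcpu_events events;

	memset(&events, 0, sizeof(events));
	events.flags = KVM_VCPUEVENT_VALID_PAYLOAD;
	events.exception.pending = 1;
	events.exception.nr = 14;		/* #PF */
	events.exception.has_error_code = 1;
	events.exception.error_code = 0;	/* hypothetical */
	events.exception_has_payload = 1;
	events.exception_payload = fault_addr;	/* becomes CR2 on delivery */

	return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}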
index 251be35..2875ce8 100644 (file)
@@ -719,6 +719,7 @@ struct kvm_ppc_one_seg_page_size {
 
 #define KVM_PPC_PAGE_SIZES_REAL                0x00000001
 #define KVM_PPC_1T_SEGMENTS            0x00000002
+#define KVM_PPC_NO_HASH                        0x00000004
 
 struct kvm_ppc_smmu_info {
        __u64 flags;
@@ -953,6 +954,10 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_NESTED_STATE 157
 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158
 #define KVM_CAP_MSR_PLATFORM_INFO 159
+#define KVM_CAP_PPC_NESTED_HV 160
+#define KVM_CAP_HYPERV_SEND_IPI 161
+#define KVM_CAP_COALESCED_PIO 162
+#define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index 853b95d..2011376 100644 (file)
@@ -15,7 +15,6 @@
        {0x400, "INST_STORAGE"}, \
        {0x480, "INST_SEGMENT"}, \
        {0x500, "EXTERNAL"}, \
-       {0x501, "EXTERNAL_LEVEL"}, \
        {0x502, "EXTERNAL_HV"}, \
        {0x600, "ALIGNMENT"}, \
        {0x700, "PROGRAM"}, \
index 5c34752..6210ba4 100644 (file)
@@ -1,6 +1,8 @@
-cr4_cpuid_sync_test
-platform_info_test
-set_sregs_test
-sync_regs_test
-vmx_tsc_adjust_test
-state_test
+/x86_64/cr4_cpuid_sync_test
+/x86_64/evmcs_test
+/x86_64/platform_info_test
+/x86_64/set_sregs_test
+/x86_64/sync_regs_test
+/x86_64/vmx_tsc_adjust_test
+/x86_64/state_test
+/dirty_log_test
index ec32dad..01a2192 100644 (file)
@@ -1,26 +1,30 @@
 all:
 
-top_srcdir = ../../../../
+top_srcdir = ../../../..
 UNAME_M := $(shell uname -m)
 
-LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c
-LIBKVM_x86_64 = lib/x86.c lib/vmx.c
-
-TEST_GEN_PROGS_x86_64 = platform_info_test
-TEST_GEN_PROGS_x86_64 += set_sregs_test
-TEST_GEN_PROGS_x86_64 += sync_regs_test
-TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test
-TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test
-TEST_GEN_PROGS_x86_64 += state_test
+LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/ucall.c lib/sparsebit.c
+LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c
+LIBKVM_aarch64 = lib/aarch64/processor.c
+
+TEST_GEN_PROGS_x86_64 = x86_64/platform_info_test
+TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
+TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
+TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
+TEST_GEN_PROGS_x86_64 += x86_64/state_test
+TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
 
+TEST_GEN_PROGS_aarch64 += dirty_log_test
+
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
 LIBKVM += $(LIBKVM_$(UNAME_M))
 
 INSTALL_HDR_PATH = $(top_srcdir)/usr
 LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
-LINUX_TOOL_INCLUDE = $(top_srcdir)tools/include
-CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -I..
+LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include
+CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -Iinclude/$(UNAME_M) -I..
 LDFLAGS += -pthread
 
 # After inclusion, $(OUTPUT) is defined and
@@ -29,7 +33,7 @@ include ../lib.mk
 
 STATIC_LIBS := $(OUTPUT)/libkvm.a
 LIBKVM_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM))
-EXTRA_CLEAN += $(LIBKVM_OBJ) $(STATIC_LIBS)
+EXTRA_CLEAN += $(LIBKVM_OBJ) $(STATIC_LIBS) cscope.*
 
 x := $(shell mkdir -p $(sort $(dir $(LIBKVM_OBJ))))
 $(LIBKVM_OBJ): $(OUTPUT)/%.o: %.c
@@ -41,3 +45,12 @@ $(OUTPUT)/libkvm.a: $(LIBKVM_OBJ)
 all: $(STATIC_LIBS)
 $(TEST_GEN_PROGS): $(STATIC_LIBS)
 $(STATIC_LIBS):| khdr
+
+cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib ..
+cscope:
+       $(RM) cscope.*
+       (find $(include_paths) -name '*.h' \
+               -exec realpath --relative-base=$(PWD) {} \;; \
+       find . -name '*.c' \
+               -exec realpath --relative-base=$(PWD) {} \;) | sort -u > cscope.files
+       cscope -b
index 0c2cdc1..d59820c 100644 (file)
@@ -5,6 +5,8 @@
  * Copyright (C) 2018, Red Hat, Inc.
  */
 
+#define _GNU_SOURCE /* for program_invocation_name */
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 
 #include "test_util.h"
 #include "kvm_util.h"
+#include "processor.h"
+
+#define DEBUG printf
 
-#define  DEBUG                 printf
+#define VCPU_ID                                1
 
-#define  VCPU_ID                        1
 /* The memory slot index to track dirty pages */
-#define  TEST_MEM_SLOT_INDEX            1
-/*
- * GPA offset of the testing memory slot. Must be bigger than the
- * default vm mem slot, which is DEFAULT_GUEST_PHY_PAGES.
- */
-#define  TEST_MEM_OFFSET                (1ULL << 30) /* 1G */
-/* Size of the testing memory slot */
-#define  TEST_MEM_PAGES                 (1ULL << 18) /* 1G for 4K pages */
+#define TEST_MEM_SLOT_INDEX            1
+
+/* Default guest test memory offset, 1G */
+#define DEFAULT_GUEST_TEST_MEM         0x40000000
+
 /* How many pages to dirty for each guest loop */
-#define  TEST_PAGES_PER_LOOP            1024
+#define TEST_PAGES_PER_LOOP            1024
+
 /* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */
-#define  TEST_HOST_LOOP_N               32
+#define TEST_HOST_LOOP_N               32
+
 /* Interval for each host loop (ms) */
-#define  TEST_HOST_LOOP_INTERVAL        10
+#define TEST_HOST_LOOP_INTERVAL                10
+
+/*
+ * Guest/Host shared variables. Ensure addr_gva2hva() and/or
+ * sync_global_to/from_guest() are used when accessing from
+ * the host. READ/WRITE_ONCE() should also be used with anything
+ * that may change.
+ */
+static uint64_t host_page_size;
+static uint64_t guest_page_size;
+static uint64_t guest_num_pages;
+static uint64_t random_array[TEST_PAGES_PER_LOOP];
+static uint64_t iteration;
 
 /*
- * Guest variables.  We use these variables to share data between host
- * and guest.  There are two copies of the variables, one in host memory
- * (which is unused) and one in guest memory.  When the host wants to
- * access these variables, it needs to call addr_gva2hva() to access the
- * guest copy.
+ * GPA offset of the testing memory slot. Must be bigger than
+ * DEFAULT_GUEST_PHY_PAGES.
  */
-uint64_t guest_random_array[TEST_PAGES_PER_LOOP];
-uint64_t guest_iteration;
-uint64_t guest_page_size;
+static uint64_t guest_test_mem = DEFAULT_GUEST_TEST_MEM;
 
 /*
- * Writes to the first byte of a random page within the testing memory
- * region continuously.
+ * Continuously write to the first 8 bytes of random pages within
+ * the testing memory region.
  */
-void guest_code(void)
+static void guest_code(void)
 {
-       int i = 0;
-       uint64_t volatile *array = guest_random_array;
-       uint64_t volatile *guest_addr;
+       int i;
 
        while (true) {
                for (i = 0; i < TEST_PAGES_PER_LOOP; i++) {
-                       /*
-                        * Write to the first 8 bytes of a random page
-                        * on the testing memory region.
-                        */
-                       guest_addr = (uint64_t *)
-                           (TEST_MEM_OFFSET +
-                            (array[i] % TEST_MEM_PAGES) * guest_page_size);
-                       *guest_addr = guest_iteration;
+                       uint64_t addr = guest_test_mem;
+                       addr += (READ_ONCE(random_array[i]) % guest_num_pages)
+                               * guest_page_size;
+                       addr &= ~(host_page_size - 1);
+                       *(uint64_t *)addr = READ_ONCE(iteration);
                }
+
                /* Tell the host that we need more random numbers */
                GUEST_SYNC(1);
        }
 }
 
-/*
- * Host variables.  These variables should only be used by the host
- * rather than the guest.
- */
-bool host_quit;
+/* Host variables */
+static bool host_quit;
 
 /* Points to the test VM memory region on which we track dirty logs */
-void *host_test_mem;
+static void *host_test_mem;
+static uint64_t host_num_pages;
 
 /* For statistics only */
-uint64_t host_dirty_count;
-uint64_t host_clear_count;
-uint64_t host_track_next_count;
+static uint64_t host_dirty_count;
+static uint64_t host_clear_count;
+static uint64_t host_track_next_count;
 
 /*
  * We use this bitmap to track some pages that should have its dirty
@@ -93,40 +97,34 @@ uint64_t host_track_next_count;
  * page bit is cleared in the latest bitmap, then the system must
  * report that write in the next get dirty log call.
  */
-unsigned long *host_bmap_track;
+static unsigned long *host_bmap_track;
 
-void generate_random_array(uint64_t *guest_array, uint64_t size)
+static void generate_random_array(uint64_t *guest_array, uint64_t size)
 {
        uint64_t i;
 
-       for (i = 0; i < size; i++) {
+       for (i = 0; i < size; i++)
                guest_array[i] = random();
-       }
 }
 
-void *vcpu_worker(void *data)
+static void *vcpu_worker(void *data)
 {
        int ret;
-       uint64_t loops, *guest_array, pages_count = 0;
        struct kvm_vm *vm = data;
+       uint64_t *guest_array;
+       uint64_t pages_count = 0;
        struct kvm_run *run;
-       struct guest_args args;
+       struct ucall uc;
 
        run = vcpu_state(vm, VCPU_ID);
 
-       /* Retrieve the guest random array pointer and cache it */
-       guest_array = addr_gva2hva(vm, (vm_vaddr_t)guest_random_array);
-
-       DEBUG("VCPU starts\n");
-
+       guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array);
        generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
 
        while (!READ_ONCE(host_quit)) {
-               /* Let the guest to dirty these random pages */
+               /* Let the guest dirty the random pages */
                ret = _vcpu_run(vm, VCPU_ID);
-               guest_args_read(vm, VCPU_ID, &args);
-               if (run->exit_reason == KVM_EXIT_IO &&
-                   args.port == GUEST_PORT_SYNC) {
+               if (get_ucall(vm, VCPU_ID, &uc) == UCALL_SYNC) {
                        pages_count += TEST_PAGES_PER_LOOP;
                        generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
                } else {
@@ -137,18 +135,20 @@ void *vcpu_worker(void *data)
                }
        }
 
-       DEBUG("VCPU exits, dirtied %"PRIu64" pages\n", pages_count);
+       DEBUG("Dirtied %"PRIu64" pages\n", pages_count);
 
        return NULL;
 }
 
-void vm_dirty_log_verify(unsigned long *bmap, uint64_t iteration)
+static void vm_dirty_log_verify(unsigned long *bmap)
 {
        uint64_t page;
-       uint64_t volatile *value_ptr;
+       uint64_t *value_ptr;
+       uint64_t step = host_page_size >= guest_page_size ? 1 :
+                               guest_page_size / host_page_size;
 
-       for (page = 0; page < TEST_MEM_PAGES; page++) {
-               value_ptr = host_test_mem + page * getpagesize();
+       for (page = 0; page < host_num_pages; page += step) {
+               value_ptr = host_test_mem + page * host_page_size;
 
                /* If this is a special page that we were tracking... */
                if (test_and_clear_bit(page, host_bmap_track)) {
@@ -208,88 +208,117 @@ void vm_dirty_log_verify(unsigned long *bmap, uint64_t iteration)
        }
 }
 
-void help(char *name)
+static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
+                               uint64_t extra_mem_pages, void *guest_code)
 {
-       puts("");
-       printf("usage: %s [-i iterations] [-I interval] [-h]\n", name);
-       puts("");
-       printf(" -i: specify iteration counts (default: %"PRIu64")\n",
-              TEST_HOST_LOOP_N);
-       printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n",
-              TEST_HOST_LOOP_INTERVAL);
-       puts("");
-       exit(0);
+       struct kvm_vm *vm;
+       uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;
+
+       vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
+       kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+#ifdef __x86_64__
+       vm_create_irqchip(vm);
+#endif
+       vm_vcpu_add_default(vm, vcpuid, guest_code);
+       return vm;
 }
 
-int main(int argc, char *argv[])
+static void run_test(enum vm_guest_mode mode, unsigned long iterations,
+                    unsigned long interval, bool top_offset)
 {
+       unsigned int guest_pa_bits, guest_page_shift;
        pthread_t vcpu_thread;
        struct kvm_vm *vm;
-       uint64_t volatile *psize, *iteration;
-       unsigned long *bmap, iterations = TEST_HOST_LOOP_N,
-           interval = TEST_HOST_LOOP_INTERVAL;
-       int opt;
-
-       while ((opt = getopt(argc, argv, "hi:I:")) != -1) {
-               switch (opt) {
-               case 'i':
-                       iterations = strtol(optarg, NULL, 10);
-                       break;
-               case 'I':
-                       interval = strtol(optarg, NULL, 10);
-                       break;
-               case 'h':
-               default:
-                       help(argv[0]);
-                       break;
-               }
+       uint64_t max_gfn;
+       unsigned long *bmap;
+
+       switch (mode) {
+       case VM_MODE_P52V48_4K:
+               guest_pa_bits = 52;
+               guest_page_shift = 12;
+               break;
+       case VM_MODE_P52V48_64K:
+               guest_pa_bits = 52;
+               guest_page_shift = 16;
+               break;
+       case VM_MODE_P40V48_4K:
+               guest_pa_bits = 40;
+               guest_page_shift = 12;
+               break;
+       case VM_MODE_P40V48_64K:
+               guest_pa_bits = 40;
+               guest_page_shift = 16;
+               break;
+       default:
+               TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode);
        }
 
-       TEST_ASSERT(iterations > 2, "Iteration must be bigger than zero\n");
-       TEST_ASSERT(interval > 0, "Interval must be bigger than zero");
+       DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode));
 
-       DEBUG("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
-             iterations, interval);
+       max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1;
+       guest_page_size = (1ul << guest_page_shift);
+       /* 1G of guest page sized pages */
+       guest_num_pages = (1ul << (30 - guest_page_shift));
+       host_page_size = getpagesize();
+       host_num_pages = (guest_num_pages * guest_page_size) / host_page_size +
+                        !!((guest_num_pages * guest_page_size) % host_page_size);
 
-       srandom(time(0));
+       if (top_offset) {
+               guest_test_mem = (max_gfn - guest_num_pages) * guest_page_size;
+               guest_test_mem &= ~(host_page_size - 1);
+       }
 
-       bmap = bitmap_alloc(TEST_MEM_PAGES);
-       host_bmap_track = bitmap_alloc(TEST_MEM_PAGES);
+       DEBUG("guest test mem offset: 0x%lx\n", guest_test_mem);
 
-       vm = vm_create_default(VCPU_ID, TEST_MEM_PAGES, guest_code);
+       bmap = bitmap_alloc(host_num_pages);
+       host_bmap_track = bitmap_alloc(host_num_pages);
+
+       vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code);
 
        /* Add an extra memory slot for testing dirty logging */
        vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
-                                   TEST_MEM_OFFSET,
+                                   guest_test_mem,
                                    TEST_MEM_SLOT_INDEX,
-                                   TEST_MEM_PAGES,
+                                   guest_num_pages,
                                    KVM_MEM_LOG_DIRTY_PAGES);
-       /* Cache the HVA pointer of the region */
-       host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)TEST_MEM_OFFSET);
 
        /* Do 1:1 mapping for the dirty track memory slot */
-       virt_map(vm, TEST_MEM_OFFSET, TEST_MEM_OFFSET,
-                TEST_MEM_PAGES * getpagesize(), 0);
+       virt_map(vm, guest_test_mem, guest_test_mem,
+                guest_num_pages * guest_page_size, 0);
+
+       /* Cache the HVA pointer of the region */
+       host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_mem);
 
+#ifdef __x86_64__
        vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+#endif
+#ifdef __aarch64__
+       ucall_init(vm, UCALL_MMIO, NULL);
+#endif
 
-       /* Tell the guest about the page size on the system */
-       psize = addr_gva2hva(vm, (vm_vaddr_t)&guest_page_size);
-       *psize = getpagesize();
+       /* Export the shared variables to the guest */
+       sync_global_to_guest(vm, host_page_size);
+       sync_global_to_guest(vm, guest_page_size);
+       sync_global_to_guest(vm, guest_test_mem);
+       sync_global_to_guest(vm, guest_num_pages);
 
        /* Start the iterations */
-       iteration = addr_gva2hva(vm, (vm_vaddr_t)&guest_iteration);
-       *iteration = 1;
+       iteration = 1;
+       sync_global_to_guest(vm, iteration);
+       host_quit = false;
+       host_dirty_count = 0;
+       host_clear_count = 0;
+       host_track_next_count = 0;
 
-       /* Start dirtying pages */
        pthread_create(&vcpu_thread, NULL, vcpu_worker, vm);
 
-       while (*iteration < iterations) {
+       while (iteration < iterations) {
                /* Give the vcpu thread some time to dirty some pages */
                usleep(interval * 1000);
                kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
-               vm_dirty_log_verify(bmap, *iteration);
-               (*iteration)++;
+               vm_dirty_log_verify(bmap);
+               iteration++;
+               sync_global_to_guest(vm, iteration);
        }
 
        /* Tell the vcpu thread to quit */
@@ -302,7 +331,118 @@ int main(int argc, char *argv[])
 
        free(bmap);
        free(host_bmap_track);
+       ucall_uninit(vm);
        kvm_vm_free(vm);
+}
+
+static struct vm_guest_modes {
+       enum vm_guest_mode mode;
+       bool supported;
+       bool enabled;
+} vm_guest_modes[NUM_VM_MODES] = {
+#if defined(__x86_64__)
+       { VM_MODE_P52V48_4K,    1, 1, },
+       { VM_MODE_P52V48_64K,   0, 0, },
+       { VM_MODE_P40V48_4K,    0, 0, },
+       { VM_MODE_P40V48_64K,   0, 0, },
+#elif defined(__aarch64__)
+       { VM_MODE_P52V48_4K,    0, 0, },
+       { VM_MODE_P52V48_64K,   0, 0, },
+       { VM_MODE_P40V48_4K,    1, 1, },
+       { VM_MODE_P40V48_64K,   1, 1, },
+#endif
+};
+
+static void help(char *name)
+{
+       int i;
+
+       puts("");
+       printf("usage: %s [-h] [-i iterations] [-I interval] "
+              "[-o offset] [-t] [-m mode]\n", name);
+       puts("");
+       printf(" -i: specify iteration counts (default: %"PRIu64")\n",
+              TEST_HOST_LOOP_N);
+       printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n",
+              TEST_HOST_LOOP_INTERVAL);
+       printf(" -o: guest test memory offset (default: 0x%lx)\n",
+              DEFAULT_GUEST_TEST_MEM);
+       printf(" -t: map guest test memory at the top of the allowed "
+              "physical address range\n");
+       printf(" -m: specify the guest mode ID to test "
+              "(default: test all supported modes)\n"
+              "     This option may be used multiple times.\n"
+              "     Guest mode IDs:\n");
+       for (i = 0; i < NUM_VM_MODES; ++i) {
+               printf("         %d:    %s%s\n",
+                      vm_guest_modes[i].mode,
+                      vm_guest_mode_string(vm_guest_modes[i].mode),
+                      vm_guest_modes[i].supported ? " (supported)" : "");
+       }
+       puts("");
+       exit(0);
+}
+
+int main(int argc, char *argv[])
+{
+       unsigned long iterations = TEST_HOST_LOOP_N;
+       unsigned long interval = TEST_HOST_LOOP_INTERVAL;
+       bool mode_selected = false;
+       bool top_offset = false;
+       unsigned int mode;
+       int opt, i;
+
+       while ((opt = getopt(argc, argv, "hi:I:o:tm:")) != -1) {
+               switch (opt) {
+               case 'i':
+                       iterations = strtol(optarg, NULL, 10);
+                       break;
+               case 'I':
+                       interval = strtol(optarg, NULL, 10);
+                       break;
+               case 'o':
+                       guest_test_mem = strtoull(optarg, NULL, 0);
+                       break;
+               case 't':
+                       top_offset = true;
+                       break;
+               case 'm':
+                       if (!mode_selected) {
+                               for (i = 0; i < NUM_VM_MODES; ++i)
+                                       vm_guest_modes[i].enabled = 0;
+                               mode_selected = true;
+                       }
+                       mode = strtoul(optarg, NULL, 10);
+                       TEST_ASSERT(mode < NUM_VM_MODES,
+                                   "Guest mode ID %d too big", mode);
+                       vm_guest_modes[mode].enabled = 1;
+                       break;
+               case 'h':
+               default:
+                       help(argv[0]);
+                       break;
+               }
+       }
+
+       TEST_ASSERT(iterations > 2, "Iterations must be greater than two");
+       TEST_ASSERT(interval > 0, "Interval must be greater than zero");
+       TEST_ASSERT(!top_offset || guest_test_mem == DEFAULT_GUEST_TEST_MEM,
+                   "Cannot use both -o [offset] and -t at the same time");
+
+       DEBUG("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
+             iterations, interval);
+
+       srandom(time(0));
+
+       for (i = 0; i < NUM_VM_MODES; ++i) {
+               if (!vm_guest_modes[i].enabled)
+                       continue;
+               TEST_ASSERT(vm_guest_modes[i].supported,
+                           "Guest mode ID %d (%s) not supported.",
+                           vm_guest_modes[i].mode,
+                           vm_guest_mode_string(vm_guest_modes[i].mode));
+               run_test(vm_guest_modes[i].mode, iterations, interval, top_offset);
+       }
 
        return 0;
 }
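
The reworked test sizes the memory slot in guest pages but walks the dirty bitmap in host pages, so vm_dirty_log_verify() has to step over several host pages when a guest page is larger than a host page. The standalone sketch below reuses the formulas from run_test() and vm_dirty_log_verify() with illustrative numbers (64K guest pages on a 4K host); it is not part of the selftest itself.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Illustrative sizes: 64K guest pages, 4K host pages, 1G test slot. */
	uint64_t guest_page_size = 1ULL << 16;
	uint64_t host_page_size  = 1ULL << 12;
	uint64_t guest_num_pages = 1ULL << (30 - 16);

	/* host_num_pages rounds the slot size up to whole host pages. */
	uint64_t bytes = guest_num_pages * guest_page_size;
	uint64_t host_num_pages = bytes / host_page_size +
				  !!(bytes % host_page_size);

	/* vm_dirty_log_verify() advances this many host pages per check. */
	uint64_t step = host_page_size >= guest_page_size ?
			1 : guest_page_size / host_page_size;

	printf("host pages: %llu, verify step: %llu\n",
	       (unsigned long long)host_num_pages,
	       (unsigned long long)step);
	return 0;
}

With these values host_num_pages is 262144 and the verify loop advances 16 host pages per step, so each 64K guest page is visited exactly once.
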
diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h
new file mode 100644
index 0000000..9ef2ab1
--- /dev/null
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AArch64 processor specific defines
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+#ifndef SELFTEST_KVM_PROCESSOR_H
+#define SELFTEST_KVM_PROCESSOR_H
+
+#include "kvm_util.h"
+
+
+#define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
+                          KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
+
+#define CPACR_EL1      3, 0,  1, 0, 2
+#define TCR_EL1                3, 0,  2, 0, 2
+#define MAIR_EL1       3, 0, 10, 2, 0
+#define TTBR0_EL1      3, 0,  2, 0, 0
+#define SCTLR_EL1      3, 0,  1, 0, 0
+
+/*
+ * Default MAIR
+ *                  index   attribute
+ * DEVICE_nGnRnE      0     0000:0000
+ * DEVICE_nGnRE       1     0000:0100
+ * DEVICE_GRE         2     0000:1100
+ * NORMAL_NC          3     0100:0100
+ * NORMAL             4     1111:1111
+ * NORMAL_WT          5     1011:1011
+ */
+#define DEFAULT_MAIR_EL1 ((0x00ul << (0 * 8)) | \
+                         (0x04ul << (1 * 8)) | \
+                         (0x0cul << (2 * 8)) | \
+                         (0x44ul << (3 * 8)) | \
+                         (0xfful << (4 * 8)) | \
+                         (0xbbul << (5 * 8)))
+
+static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t *addr)
+{
+       struct kvm_one_reg reg;
+       reg.id = id;
+       reg.addr = (uint64_t)addr;
+       vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, &reg);
+}
+
+static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t val)
+{
+       struct kvm_one_reg reg;
+       reg.id = id;
+       reg.addr = (uint64_t)&val;
+       vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, &reg);
+}
+
+#endif /* SELFTEST_KVM_PROCESSOR_H */
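
The register helpers above are the whole aarch64 interface: everything goes through KVM_GET/SET_ONE_REG, with core registers encoded by ARM64_CORE_REG() and the CPACR_EL1/TCR_EL1/... tuples laid out as (op0, op1, CRn, CRm, op2) arguments for the uapi sysreg encoding. A hedged usage sketch follows, assuming a VM and vCPU already created by the library and TEST_ASSERT() provided via the library's test_util.h; set_guest_pc() and the readback check are illustrative, not part of the header.

#include "kvm_util.h"
#include "processor.h"

/* Point the vCPU's program counter at a guest entry point and read it back. */
static void set_guest_pc(struct kvm_vm *vm, uint32_t vcpuid, uint64_t entry)
{
	uint64_t pc;

	set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), entry);
	get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc);
	TEST_ASSERT(pc == entry, "PC readback mismatch: 0x%lx vs 0x%lx",
		    pc, entry);
}
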
diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/evmcs.h
new file mode 100644
index 0000000..4059014
--- /dev/null
@@ -0,0 +1,1098 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * tools/testing/selftests/kvm/include/evmcs.h
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ */
+
+#ifndef SELFTEST_KVM_EVMCS_H
+#define SELFTEST_KVM_EVMCS_H
+
+#include <stdint.h>
+#include "vmx.h"
+
+#define u16 uint16_t
+#define u32 uint32_t
+#define u64 uint64_t
+
+extern bool enable_evmcs;
+
+struct hv_vp_assist_page {
+       __u32 apic_assist;
+       __u32 reserved;
+       __u64 vtl_control[2];
+       __u64 nested_enlightenments_control[2];
+       __u32 enlighten_vmentry;
+       __u64 current_nested_vmcs;
+};
+
+struct hv_enlightened_vmcs {
+       u32 revision_id;
+       u32 abort;
+
+       u16 host_es_selector;
+       u16 host_cs_selector;
+       u16 host_ss_selector;
+       u16 host_ds_selector;
+       u16 host_fs_selector;
+       u16 host_gs_selector;
+       u16 host_tr_selector;
+
+       u64 host_ia32_pat;
+       u64 host_ia32_efer;
+
+       u64 host_cr0;
+       u64 host_cr3;
+       u64 host_cr4;
+
+       u64 host_ia32_sysenter_esp;
+       u64 host_ia32_sysenter_eip;
+       u64 host_rip;
+       u32 host_ia32_sysenter_cs;
+
+       u32 pin_based_vm_exec_control;
+       u32 vm_exit_controls;
+       u32 secondary_vm_exec_control;
+
+       u64 io_bitmap_a;
+       u64 io_bitmap_b;
+       u64 msr_bitmap;
+
+       u16 guest_es_selector;
+       u16 guest_cs_selector;
+       u16 guest_ss_selector;
+       u16 guest_ds_selector;
+       u16 guest_fs_selector;
+       u16 guest_gs_selector;
+       u16 guest_ldtr_selector;
+       u16 guest_tr_selector;
+
+       u32 guest_es_limit;
+       u32 guest_cs_limit;
+       u32 guest_ss_limit;
+       u32 guest_ds_limit;
+       u32 guest_fs_limit;
+       u32 guest_gs_limit;
+       u32 guest_ldtr_limit;
+       u32 guest_tr_limit;
+       u32 guest_gdtr_limit;
+       u32 guest_idtr_limit;
+
+       u32 guest_es_ar_bytes;
+       u32 guest_cs_ar_bytes;
+       u32 guest_ss_ar_bytes;
+       u32 guest_ds_ar_bytes;
+       u32 guest_fs_ar_bytes;
+       u32 guest_gs_ar_bytes;
+       u32 guest_ldtr_ar_bytes;
+       u32 guest_tr_ar_bytes;
+
+       u64 guest_es_base;
+       u64 guest_cs_base;
+       u64 guest_ss_base;
+       u64 guest_ds_base;
+       u64 guest_fs_base;
+       u64 guest_gs_base;
+       u64 guest_ldtr_base;
+       u64 guest_tr_base;
+       u64 guest_gdtr_base;
+       u64 guest_idtr_base;
+
+       u64 padding64_1[3];
+
+       u64 vm_exit_msr_store_addr;
+       u64 vm_exit_msr_load_addr;
+       u64 vm_entry_msr_load_addr;
+
+       u64 cr3_target_value0;
+       u64 cr3_target_value1;
+       u64 cr3_target_value2;
+       u64 cr3_target_value3;
+
+       u32 page_fault_error_code_mask;
+       u32 page_fault_error_code_match;
+
+       u32 cr3_target_count;
+       u32 vm_exit_msr_store_count;
+       u32 vm_exit_msr_load_count;
+       u32 vm_entry_msr_load_count;
+
+       u64 tsc_offset;
+       u64 virtual_apic_page_addr;
+       u64 vmcs_link_pointer;
+
+       u64 guest_ia32_debugctl;
+       u64 guest_ia32_pat;
+       u64 guest_ia32_efer;
+
+       u64 guest_pdptr0;
+       u64 guest_pdptr1;
+       u64 guest_pdptr2;
+       u64 guest_pdptr3;
+
+       u64 guest_pending_dbg_exceptions;
+       u64 guest_sysenter_esp;
+       u64 guest_sysenter_eip;
+
+       u32 guest_activity_state;
+       u32 guest_sysenter_cs;
+
+       u64 cr0_guest_host_mask;
+       u64 cr4_guest_host_mask;
+       u64 cr0_read_shadow;
+       u64 cr4_read_shadow;
+       u64 guest_cr0;
+       u64 guest_cr3;
+       u64 guest_cr4;
+       u64 guest_dr7;
+
+       u64 host_fs_base;
+       u64 host_gs_base;
+       u64 host_tr_base;
+       u64 host_gdtr_base;
+       u64 host_idtr_base;
+       u64 host_rsp;
+
+       u64 ept_pointer;
+
+       u16 virtual_processor_id;
+       u16 padding16[3];
+
+       u64 padding64_2[5];
+       u64 guest_physical_address;
+
+       u32 vm_instruction_error;
+       u32 vm_exit_reason;
+       u32 vm_exit_intr_info;
+       u32 vm_exit_intr_error_code;
+       u32 idt_vectoring_info_field;
+       u32 idt_vectoring_error_code;
+       u32 vm_exit_instruction_len;
+       u32 vmx_instruction_info;
+
+       u64 exit_qualification;
+       u64 exit_io_instruction_ecx;
+       u64 exit_io_instruction_esi;
+       u64 exit_io_instruction_edi;
+       u64 exit_io_instruction_eip;
+
+       u64 guest_linear_address;
+       u64 guest_rsp;
+       u64 guest_rflags;
+
+       u32 guest_interruptibility_info;
+       u32 cpu_based_vm_exec_control;
+       u32 exception_bitmap;
+       u32 vm_entry_controls;
+       u32 vm_entry_intr_info_field;
+       u32 vm_entry_exception_error_code;
+       u32 vm_entry_instruction_len;
+       u32 tpr_threshold;
+
+       u64 guest_rip;
+
+       u32 hv_clean_fields;
+       u32 hv_padding_32;
+       u32 hv_synthetic_controls;
+       struct {
+               u32 nested_flush_hypercall:1;
+               u32 msr_bitmap:1;
+               u32 reserved:30;
+       } hv_enlightenments_control;
+       u32 hv_vp_id;
+
+       u64 hv_vm_id;
+       u64 partition_assist_page;
+       u64 padding64_4[4];
+       u64 guest_bndcfgs;
+       u64 padding64_5[7];
+       u64 xss_exit_bitmap;
+       u64 padding64_6[7];
+};
+
+#define HV_X64_MSR_VP_ASSIST_PAGE              0x40000073
+#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE       0x00000001
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT        12
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \
+               (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+struct hv_enlightened_vmcs *current_evmcs;
+struct hv_vp_assist_page *current_vp_assist;
+
+static inline int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist)
+{
+       u64 val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) |
+               HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+
+       wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, val);
+
+       current_vp_assist = vp_assist;
+
+       enable_evmcs = true;
+
+       return 0;
+}
+
+static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs)
+{
+       current_vp_assist->current_nested_vmcs = vmcs_pa;
+       current_vp_assist->enlighten_vmentry = 1;
+
+       current_evmcs = vmcs;
+
+       return 0;
+}
+
+static inline int evmcs_vmptrst(uint64_t *value)
+{
+       *value = current_vp_assist->current_nested_vmcs &
+               ~HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+
+       return 0;
+}
+
+static inline int evmcs_vmread(uint64_t encoding, uint64_t *value)
+{
+       switch (encoding) {
+       case GUEST_RIP:
+               *value = current_evmcs->guest_rip;
+               break;
+       case GUEST_RSP:
+               *value = current_evmcs->guest_rsp;
+               break;
+       case GUEST_RFLAGS:
+               *value = current_evmcs->guest_rflags;
+               break;
+       case HOST_IA32_PAT:
+               *value = current_evmcs->host_ia32_pat;
+               break;
+       case HOST_IA32_EFER:
+               *value = current_evmcs->host_ia32_efer;
+               break;
+       case HOST_CR0:
+               *value = current_evmcs->host_cr0;
+               break;
+       case HOST_CR3:
+               *value = current_evmcs->host_cr3;
+               break;
+       case HOST_CR4:
+               *value = current_evmcs->host_cr4;
+               break;
+       case HOST_IA32_SYSENTER_ESP:
+               *value = current_evmcs->host_ia32_sysenter_esp;
+               break;
+       case HOST_IA32_SYSENTER_EIP:
+               *value = current_evmcs->host_ia32_sysenter_eip;
+               break;
+       case HOST_RIP:
+               *value = current_evmcs->host_rip;
+               break;
+       case IO_BITMAP_A:
+               *value = current_evmcs->io_bitmap_a;
+               break;
+       case IO_BITMAP_B:
+               *value = current_evmcs->io_bitmap_b;
+               break;
+       case MSR_BITMAP:
+               *value = current_evmcs->msr_bitmap;
+               break;
+       case GUEST_ES_BASE:
+               *value = current_evmcs->guest_es_base;
+               break;
+       case GUEST_CS_BASE:
+               *value = current_evmcs->guest_cs_base;
+               break;
+       case GUEST_SS_BASE:
+               *value = current_evmcs->guest_ss_base;
+               break;
+       case GUEST_DS_BASE:
+               *value = current_evmcs->guest_ds_base;
+               break;
+       case GUEST_FS_BASE:
+               *value = current_evmcs->guest_fs_base;
+               break;
+       case GUEST_GS_BASE:
+               *value = current_evmcs->guest_gs_base;
+               break;
+       case GUEST_LDTR_BASE:
+               *value = current_evmcs->guest_ldtr_base;
+               break;
+       case GUEST_TR_BASE:
+               *value = current_evmcs->guest_tr_base;
+               break;
+       case GUEST_GDTR_BASE:
+               *value = current_evmcs->guest_gdtr_base;
+               break;
+       case GUEST_IDTR_BASE:
+               *value = current_evmcs->guest_idtr_base;
+               break;
+       case TSC_OFFSET:
+               *value = current_evmcs->tsc_offset;
+               break;
+       case VIRTUAL_APIC_PAGE_ADDR:
+               *value = current_evmcs->virtual_apic_page_addr;
+               break;
+       case VMCS_LINK_POINTER:
+               *value = current_evmcs->vmcs_link_pointer;
+               break;
+       case GUEST_IA32_DEBUGCTL:
+               *value = current_evmcs->guest_ia32_debugctl;
+               break;
+       case GUEST_IA32_PAT:
+               *value = current_evmcs->guest_ia32_pat;
+               break;
+       case GUEST_IA32_EFER:
+               *value = current_evmcs->guest_ia32_efer;
+               break;
+       case GUEST_PDPTR0:
+               *value = current_evmcs->guest_pdptr0;
+               break;
+       case GUEST_PDPTR1:
+               *value = current_evmcs->guest_pdptr1;
+               break;
+       case GUEST_PDPTR2:
+               *value = current_evmcs->guest_pdptr2;
+               break;
+       case GUEST_PDPTR3:
+               *value = current_evmcs->guest_pdptr3;
+               break;
+       case GUEST_PENDING_DBG_EXCEPTIONS:
+               *value = current_evmcs->guest_pending_dbg_exceptions;
+               break;
+       case GUEST_SYSENTER_ESP:
+               *value = current_evmcs->guest_sysenter_esp;
+               break;
+       case GUEST_SYSENTER_EIP:
+               *value = current_evmcs->guest_sysenter_eip;
+               break;
+       case CR0_GUEST_HOST_MASK:
+               *value = current_evmcs->cr0_guest_host_mask;
+               break;
+       case CR4_GUEST_HOST_MASK:
+               *value = current_evmcs->cr4_guest_host_mask;
+               break;
+       case CR0_READ_SHADOW:
+               *value = current_evmcs->cr0_read_shadow;
+               break;
+       case CR4_READ_SHADOW:
+               *value = current_evmcs->cr4_read_shadow;
+               break;
+       case GUEST_CR0:
+               *value = current_evmcs->guest_cr0;
+               break;
+       case GUEST_CR3:
+               *value = current_evmcs->guest_cr3;
+               break;
+       case GUEST_CR4:
+               *value = current_evmcs->guest_cr4;
+               break;
+       case GUEST_DR7:
+               *value = current_evmcs->guest_dr7;
+               break;
+       case HOST_FS_BASE:
+               *value = current_evmcs->host_fs_base;
+               break;
+       case HOST_GS_BASE:
+               *value = current_evmcs->host_gs_base;
+               break;
+       case HOST_TR_BASE:
+               *value = current_evmcs->host_tr_base;
+               break;
+       case HOST_GDTR_BASE:
+               *value = current_evmcs->host_gdtr_base;
+               break;
+       case HOST_IDTR_BASE:
+               *value = current_evmcs->host_idtr_base;
+               break;
+       case HOST_RSP:
+               *value = current_evmcs->host_rsp;
+               break;
+       case EPT_POINTER:
+               *value = current_evmcs->ept_pointer;
+               break;
+       case GUEST_BNDCFGS:
+               *value = current_evmcs->guest_bndcfgs;
+               break;
+       case XSS_EXIT_BITMAP:
+               *value = current_evmcs->xss_exit_bitmap;
+               break;
+       case GUEST_PHYSICAL_ADDRESS:
+               *value = current_evmcs->guest_physical_address;
+               break;
+       case EXIT_QUALIFICATION:
+               *value = current_evmcs->exit_qualification;
+               break;
+       case GUEST_LINEAR_ADDRESS:
+               *value = current_evmcs->guest_linear_address;
+               break;
+       case VM_EXIT_MSR_STORE_ADDR:
+               *value = current_evmcs->vm_exit_msr_store_addr;
+               break;
+       case VM_EXIT_MSR_LOAD_ADDR:
+               *value = current_evmcs->vm_exit_msr_load_addr;
+               break;
+       case VM_ENTRY_MSR_LOAD_ADDR:
+               *value = current_evmcs->vm_entry_msr_load_addr;
+               break;
+       case CR3_TARGET_VALUE0:
+               *value = current_evmcs->cr3_target_value0;
+               break;
+       case CR3_TARGET_VALUE1:
+               *value = current_evmcs->cr3_target_value1;
+               break;
+       case CR3_TARGET_VALUE2:
+               *value = current_evmcs->cr3_target_value2;
+               break;
+       case CR3_TARGET_VALUE3:
+               *value = current_evmcs->cr3_target_value3;
+               break;
+       case TPR_THRESHOLD:
+               *value = current_evmcs->tpr_threshold;
+               break;
+       case GUEST_INTERRUPTIBILITY_INFO:
+               *value = current_evmcs->guest_interruptibility_info;
+               break;
+       case CPU_BASED_VM_EXEC_CONTROL:
+               *value = current_evmcs->cpu_based_vm_exec_control;
+               break;
+       case EXCEPTION_BITMAP:
+               *value = current_evmcs->exception_bitmap;
+               break;
+       case VM_ENTRY_CONTROLS:
+               *value = current_evmcs->vm_entry_controls;
+               break;
+       case VM_ENTRY_INTR_INFO_FIELD:
+               *value = current_evmcs->vm_entry_intr_info_field;
+               break;
+       case VM_ENTRY_EXCEPTION_ERROR_CODE:
+               *value = current_evmcs->vm_entry_exception_error_code;
+               break;
+       case VM_ENTRY_INSTRUCTION_LEN:
+               *value = current_evmcs->vm_entry_instruction_len;
+               break;
+       case HOST_IA32_SYSENTER_CS:
+               *value = current_evmcs->host_ia32_sysenter_cs;
+               break;
+       case PIN_BASED_VM_EXEC_CONTROL:
+               *value = current_evmcs->pin_based_vm_exec_control;
+               break;
+       case VM_EXIT_CONTROLS:
+               *value = current_evmcs->vm_exit_controls;
+               break;
+       case SECONDARY_VM_EXEC_CONTROL:
+               *value = current_evmcs->secondary_vm_exec_control;
+               break;
+       case GUEST_ES_LIMIT:
+               *value = current_evmcs->guest_es_limit;
+               break;
+       case GUEST_CS_LIMIT:
+               *value = current_evmcs->guest_cs_limit;
+               break;
+       case GUEST_SS_LIMIT:
+               *value = current_evmcs->guest_ss_limit;
+               break;
+       case GUEST_DS_LIMIT:
+               *value = current_evmcs->guest_ds_limit;
+               break;
+       case GUEST_FS_LIMIT:
+               *value = current_evmcs->guest_fs_limit;
+               break;
+       case GUEST_GS_LIMIT:
+               *value = current_evmcs->guest_gs_limit;
+               break;
+       case GUEST_LDTR_LIMIT:
+               *value = current_evmcs->guest_ldtr_limit;
+               break;
+       case GUEST_TR_LIMIT:
+               *value = current_evmcs->guest_tr_limit;
+               break;
+       case GUEST_GDTR_LIMIT:
+               *value = current_evmcs->guest_gdtr_limit;
+               break;
+       case GUEST_IDTR_LIMIT:
+               *value = current_evmcs->guest_idtr_limit;
+               break;
+       case GUEST_ES_AR_BYTES:
+               *value = current_evmcs->guest_es_ar_bytes;
+               break;
+       case GUEST_CS_AR_BYTES:
+               *value = current_evmcs->guest_cs_ar_bytes;
+               break;
+       case GUEST_SS_AR_BYTES:
+               *value = current_evmcs->guest_ss_ar_bytes;
+               break;
+       case GUEST_DS_AR_BYTES:
+               *value = current_evmcs->guest_ds_ar_bytes;
+               break;
+       case GUEST_FS_AR_BYTES:
+               *value = current_evmcs->guest_fs_ar_bytes;
+               break;
+       case GUEST_GS_AR_BYTES:
+               *value = current_evmcs->guest_gs_ar_bytes;
+               break;
+       case GUEST_LDTR_AR_BYTES:
+               *value = current_evmcs->guest_ldtr_ar_bytes;
+               break;
+       case GUEST_TR_AR_BYTES:
+               *value = current_evmcs->guest_tr_ar_bytes;
+               break;
+       case GUEST_ACTIVITY_STATE:
+               *value = current_evmcs->guest_activity_state;
+               break;
+       case GUEST_SYSENTER_CS:
+               *value = current_evmcs->guest_sysenter_cs;
+               break;
+       case VM_INSTRUCTION_ERROR:
+               *value = current_evmcs->vm_instruction_error;
+               break;
+       case VM_EXIT_REASON:
+               *value = current_evmcs->vm_exit_reason;
+               break;
+       case VM_EXIT_INTR_INFO:
+               *value = current_evmcs->vm_exit_intr_info;
+               break;
+       case VM_EXIT_INTR_ERROR_CODE:
+               *value = current_evmcs->vm_exit_intr_error_code;
+               break;
+       case IDT_VECTORING_INFO_FIELD:
+               *value = current_evmcs->idt_vectoring_info_field;
+               break;
+       case IDT_VECTORING_ERROR_CODE:
+               *value = current_evmcs->idt_vectoring_error_code;
+               break;
+       case VM_EXIT_INSTRUCTION_LEN:
+               *value = current_evmcs->vm_exit_instruction_len;
+               break;
+       case VMX_INSTRUCTION_INFO:
+               *value = current_evmcs->vmx_instruction_info;
+               break;
+       case PAGE_FAULT_ERROR_CODE_MASK:
+               *value = current_evmcs->page_fault_error_code_mask;
+               break;
+       case PAGE_FAULT_ERROR_CODE_MATCH:
+               *value = current_evmcs->page_fault_error_code_match;
+               break;
+       case CR3_TARGET_COUNT:
+               *value = current_evmcs->cr3_target_count;
+               break;
+       case VM_EXIT_MSR_STORE_COUNT:
+               *value = current_evmcs->vm_exit_msr_store_count;
+               break;
+       case VM_EXIT_MSR_LOAD_COUNT:
+               *value = current_evmcs->vm_exit_msr_load_count;
+               break;
+       case VM_ENTRY_MSR_LOAD_COUNT:
+               *value = current_evmcs->vm_entry_msr_load_count;
+               break;
+       case HOST_ES_SELECTOR:
+               *value = current_evmcs->host_es_selector;
+               break;
+       case HOST_CS_SELECTOR:
+               *value = current_evmcs->host_cs_selector;
+               break;
+       case HOST_SS_SELECTOR:
+               *value = current_evmcs->host_ss_selector;
+               break;
+       case HOST_DS_SELECTOR:
+               *value = current_evmcs->host_ds_selector;
+               break;
+       case HOST_FS_SELECTOR:
+               *value = current_evmcs->host_fs_selector;
+               break;
+       case HOST_GS_SELECTOR:
+               *value = current_evmcs->host_gs_selector;
+               break;
+       case HOST_TR_SELECTOR:
+               *value = current_evmcs->host_tr_selector;
+               break;
+       case GUEST_ES_SELECTOR:
+               *value = current_evmcs->guest_es_selector;
+               break;
+       case GUEST_CS_SELECTOR:
+               *value = current_evmcs->guest_cs_selector;
+               break;
+       case GUEST_SS_SELECTOR:
+               *value = current_evmcs->guest_ss_selector;
+               break;
+       case GUEST_DS_SELECTOR:
+               *value = current_evmcs->guest_ds_selector;
+               break;
+       case GUEST_FS_SELECTOR:
+               *value = current_evmcs->guest_fs_selector;
+               break;
+       case GUEST_GS_SELECTOR:
+               *value = current_evmcs->guest_gs_selector;
+               break;
+       case GUEST_LDTR_SELECTOR:
+               *value = current_evmcs->guest_ldtr_selector;
+               break;
+       case GUEST_TR_SELECTOR:
+               *value = current_evmcs->guest_tr_selector;
+               break;
+       case VIRTUAL_PROCESSOR_ID:
+               *value = current_evmcs->virtual_processor_id;
+               break;
+       default: return 1;
+       }
+
+       return 0;
+}
+
+static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value)
+{
+       switch (encoding) {
+       case GUEST_RIP:
+               current_evmcs->guest_rip = value;
+               break;
+       case GUEST_RSP:
+               current_evmcs->guest_rsp = value;
+               break;
+       case GUEST_RFLAGS:
+               current_evmcs->guest_rflags = value;
+               break;
+       case HOST_IA32_PAT:
+               current_evmcs->host_ia32_pat = value;
+               break;
+       case HOST_IA32_EFER:
+               current_evmcs->host_ia32_efer = value;
+               break;
+       case HOST_CR0:
+               current_evmcs->host_cr0 = value;
+               break;
+       case HOST_CR3:
+               current_evmcs->host_cr3 = value;
+               break;
+       case HOST_CR4:
+               current_evmcs->host_cr4 = value;
+               break;
+       case HOST_IA32_SYSENTER_ESP:
+               current_evmcs->host_ia32_sysenter_esp = value;
+               break;
+       case HOST_IA32_SYSENTER_EIP:
+               current_evmcs->host_ia32_sysenter_eip = value;
+               break;
+       case HOST_RIP:
+               current_evmcs->host_rip = value;
+               break;
+       case IO_BITMAP_A:
+               current_evmcs->io_bitmap_a = value;
+               break;
+       case IO_BITMAP_B:
+               current_evmcs->io_bitmap_b = value;
+               break;
+       case MSR_BITMAP:
+               current_evmcs->msr_bitmap = value;
+               break;
+       case GUEST_ES_BASE:
+               current_evmcs->guest_es_base = value;
+               break;
+       case GUEST_CS_BASE:
+               current_evmcs->guest_cs_base = value;
+               break;
+       case GUEST_SS_BASE:
+               current_evmcs->guest_ss_base = value;
+               break;
+       case GUEST_DS_BASE:
+               current_evmcs->guest_ds_base = value;
+               break;
+       case GUEST_FS_BASE:
+               current_evmcs->guest_fs_base = value;
+               break;
+       case GUEST_GS_BASE:
+               current_evmcs->guest_gs_base = value;
+               break;
+       case GUEST_LDTR_BASE:
+               current_evmcs->guest_ldtr_base = value;
+               break;
+       case GUEST_TR_BASE:
+               current_evmcs->guest_tr_base = value;
+               break;
+       case GUEST_GDTR_BASE:
+               current_evmcs->guest_gdtr_base = value;
+               break;
+       case GUEST_IDTR_BASE:
+               current_evmcs->guest_idtr_base = value;
+               break;
+       case TSC_OFFSET:
+               current_evmcs->tsc_offset = value;
+               break;
+       case VIRTUAL_APIC_PAGE_ADDR:
+               current_evmcs->virtual_apic_page_addr = value;
+               break;
+       case VMCS_LINK_POINTER:
+               current_evmcs->vmcs_link_pointer = value;
+               break;
+       case GUEST_IA32_DEBUGCTL:
+               current_evmcs->guest_ia32_debugctl = value;
+               break;
+       case GUEST_IA32_PAT:
+               current_evmcs->guest_ia32_pat = value;
+               break;
+       case GUEST_IA32_EFER:
+               current_evmcs->guest_ia32_efer = value;
+               break;
+       case GUEST_PDPTR0:
+               current_evmcs->guest_pdptr0 = value;
+               break;
+       case GUEST_PDPTR1:
+               current_evmcs->guest_pdptr1 = value;
+               break;
+       case GUEST_PDPTR2:
+               current_evmcs->guest_pdptr2 = value;
+               break;
+       case GUEST_PDPTR3:
+               current_evmcs->guest_pdptr3 = value;
+               break;
+       case GUEST_PENDING_DBG_EXCEPTIONS:
+               current_evmcs->guest_pending_dbg_exceptions = value;
+               break;
+       case GUEST_SYSENTER_ESP:
+               current_evmcs->guest_sysenter_esp = value;
+               break;
+       case GUEST_SYSENTER_EIP:
+               current_evmcs->guest_sysenter_eip = value;
+               break;
+       case CR0_GUEST_HOST_MASK:
+               current_evmcs->cr0_guest_host_mask = value;
+               break;
+       case CR4_GUEST_HOST_MASK:
+               current_evmcs->cr4_guest_host_mask = value;
+               break;
+       case CR0_READ_SHADOW:
+               current_evmcs->cr0_read_shadow = value;
+               break;
+       case CR4_READ_SHADOW:
+               current_evmcs->cr4_read_shadow = value;
+               break;
+       case GUEST_CR0:
+               current_evmcs->guest_cr0 = value;
+               break;
+       case GUEST_CR3:
+               current_evmcs->guest_cr3 = value;
+               break;
+       case GUEST_CR4:
+               current_evmcs->guest_cr4 = value;
+               break;
+       case GUEST_DR7:
+               current_evmcs->guest_dr7 = value;
+               break;
+       case HOST_FS_BASE:
+               current_evmcs->host_fs_base = value;
+               break;
+       case HOST_GS_BASE:
+               current_evmcs->host_gs_base = value;
+               break;
+       case HOST_TR_BASE:
+               current_evmcs->host_tr_base = value;
+               break;
+       case HOST_GDTR_BASE:
+               current_evmcs->host_gdtr_base = value;
+               break;
+       case HOST_IDTR_BASE:
+               current_evmcs->host_idtr_base = value;
+               break;
+       case HOST_RSP:
+               current_evmcs->host_rsp = value;
+               break;
+       case EPT_POINTER:
+               current_evmcs->ept_pointer = value;
+               break;
+       case GUEST_BNDCFGS:
+               current_evmcs->guest_bndcfgs = value;
+               break;
+       case XSS_EXIT_BITMAP:
+               current_evmcs->xss_exit_bitmap = value;
+               break;
+       case GUEST_PHYSICAL_ADDRESS:
+               current_evmcs->guest_physical_address = value;
+               break;
+       case EXIT_QUALIFICATION:
+               current_evmcs->exit_qualification = value;
+               break;
+       case GUEST_LINEAR_ADDRESS:
+               current_evmcs->guest_linear_address = value;
+               break;
+       case VM_EXIT_MSR_STORE_ADDR:
+               current_evmcs->vm_exit_msr_store_addr = value;
+               break;
+       case VM_EXIT_MSR_LOAD_ADDR:
+               current_evmcs->vm_exit_msr_load_addr = value;
+               break;
+       case VM_ENTRY_MSR_LOAD_ADDR:
+               current_evmcs->vm_entry_msr_load_addr = value;
+               break;
+       case CR3_TARGET_VALUE0:
+               current_evmcs->cr3_target_value0 = value;
+               break;
+       case CR3_TARGET_VALUE1:
+               current_evmcs->cr3_target_value1 = value;
+               break;
+       case CR3_TARGET_VALUE2:
+               current_evmcs->cr3_target_value2 = value;
+               break;
+       case CR3_TARGET_VALUE3:
+               current_evmcs->cr3_target_value3 = value;
+               break;
+       case TPR_THRESHOLD:
+               current_evmcs->tpr_threshold = value;
+               break;
+       case GUEST_INTERRUPTIBILITY_INFO:
+               current_evmcs->guest_interruptibility_info = value;
+               break;
+       case CPU_BASED_VM_EXEC_CONTROL:
+               current_evmcs->cpu_based_vm_exec_control = value;
+               break;
+       case EXCEPTION_BITMAP:
+               current_evmcs->exception_bitmap = value;
+               break;
+       case VM_ENTRY_CONTROLS:
+               current_evmcs->vm_entry_controls = value;
+               break;
+       case VM_ENTRY_INTR_INFO_FIELD:
+               current_evmcs->vm_entry_intr_info_field = value;
+               break;
+       case VM_ENTRY_EXCEPTION_ERROR_CODE:
+               current_evmcs->vm_entry_exception_error_code = value;
+               break;
+       case VM_ENTRY_INSTRUCTION_LEN:
+               current_evmcs->vm_entry_instruction_len = value;
+               break;
+       case HOST_IA32_SYSENTER_CS:
+               current_evmcs->host_ia32_sysenter_cs = value;
+               break;
+       case PIN_BASED_VM_EXEC_CONTROL:
+               current_evmcs->pin_based_vm_exec_control = value;
+               break;
+       case VM_EXIT_CONTROLS:
+               current_evmcs->vm_exit_controls = value;
+               break;
+       case SECONDARY_VM_EXEC_CONTROL:
+               current_evmcs->secondary_vm_exec_control = value;
+               break;
+       case GUEST_ES_LIMIT:
+               current_evmcs->guest_es_limit = value;
+               break;
+       case GUEST_CS_LIMIT:
+               current_evmcs->guest_cs_limit = value;
+               break;
+       case GUEST_SS_LIMIT:
+               current_evmcs->guest_ss_limit = value;
+               break;
+       case GUEST_DS_LIMIT:
+               current_evmcs->guest_ds_limit = value;
+               break;
+       case GUEST_FS_LIMIT:
+               current_evmcs->guest_fs_limit = value;
+               break;
+       case GUEST_GS_LIMIT:
+               current_evmcs->guest_gs_limit = value;
+               break;
+       case GUEST_LDTR_LIMIT:
+               current_evmcs->guest_ldtr_limit = value;
+               break;
+       case GUEST_TR_LIMIT:
+               current_evmcs->guest_tr_limit = value;
+               break;
+       case GUEST_GDTR_LIMIT:
+               current_evmcs->guest_gdtr_limit = value;
+               break;
+       case GUEST_IDTR_LIMIT:
+               current_evmcs->guest_idtr_limit = value;
+               break;
+       case GUEST_ES_AR_BYTES:
+               current_evmcs->guest_es_ar_bytes = value;
+               break;
+       case GUEST_CS_AR_BYTES:
+               current_evmcs->guest_cs_ar_bytes = value;
+               break;
+       case GUEST_SS_AR_BYTES:
+               current_evmcs->guest_ss_ar_bytes = value;
+               break;
+       case GUEST_DS_AR_BYTES:
+               current_evmcs->guest_ds_ar_bytes = value;
+               break;
+       case GUEST_FS_AR_BYTES:
+               current_evmcs->guest_fs_ar_bytes = value;
+               break;
+       case GUEST_GS_AR_BYTES:
+               current_evmcs->guest_gs_ar_bytes = value;
+               break;
+       case GUEST_LDTR_AR_BYTES:
+               current_evmcs->guest_ldtr_ar_bytes = value;
+               break;
+       case GUEST_TR_AR_BYTES:
+               current_evmcs->guest_tr_ar_bytes = value;
+               break;
+       case GUEST_ACTIVITY_STATE:
+               current_evmcs->guest_activity_state = value;
+               break;
+       case GUEST_SYSENTER_CS:
+               current_evmcs->guest_sysenter_cs = value;
+               break;
+       case VM_INSTRUCTION_ERROR:
+               current_evmcs->vm_instruction_error = value;
+               break;
+       case VM_EXIT_REASON:
+               current_evmcs->vm_exit_reason = value;
+               break;
+       case VM_EXIT_INTR_INFO:
+               current_evmcs->vm_exit_intr_info = value;
+               break;
+       case VM_EXIT_INTR_ERROR_CODE:
+               current_evmcs->vm_exit_intr_error_code = value;
+               break;
+       case IDT_VECTORING_INFO_FIELD:
+               current_evmcs->idt_vectoring_info_field = value;
+               break;
+       case IDT_VECTORING_ERROR_CODE:
+               current_evmcs->idt_vectoring_error_code = value;
+               break;
+       case VM_EXIT_INSTRUCTION_LEN:
+               current_evmcs->vm_exit_instruction_len = value;
+               break;
+       case VMX_INSTRUCTION_INFO:
+               current_evmcs->vmx_instruction_info = value;
+               break;
+       case PAGE_FAULT_ERROR_CODE_MASK:
+               current_evmcs->page_fault_error_code_mask = value;
+               break;
+       case PAGE_FAULT_ERROR_CODE_MATCH:
+               current_evmcs->page_fault_error_code_match = value;
+               break;
+       case CR3_TARGET_COUNT:
+               current_evmcs->cr3_target_count = value;
+               break;
+       case VM_EXIT_MSR_STORE_COUNT:
+               current_evmcs->vm_exit_msr_store_count = value;
+               break;
+       case VM_EXIT_MSR_LOAD_COUNT:
+               current_evmcs->vm_exit_msr_load_count = value;
+               break;
+       case VM_ENTRY_MSR_LOAD_COUNT:
+               current_evmcs->vm_entry_msr_load_count = value;
+               break;
+       case HOST_ES_SELECTOR:
+               current_evmcs->host_es_selector = value;
+               break;
+       case HOST_CS_SELECTOR:
+               current_evmcs->host_cs_selector = value;
+               break;
+       case HOST_SS_SELECTOR:
+               current_evmcs->host_ss_selector = value;
+               break;
+       case HOST_DS_SELECTOR:
+               current_evmcs->host_ds_selector = value;
+               break;
+       case HOST_FS_SELECTOR:
+               current_evmcs->host_fs_selector = value;
+               break;
+       case HOST_GS_SELECTOR:
+               current_evmcs->host_gs_selector = value;
+               break;
+       case HOST_TR_SELECTOR:
+               current_evmcs->host_tr_selector = value;
+               break;
+       case GUEST_ES_SELECTOR:
+               current_evmcs->guest_es_selector = value;
+               break;
+       case GUEST_CS_SELECTOR:
+               current_evmcs->guest_cs_selector = value;
+               break;
+       case GUEST_SS_SELECTOR:
+               current_evmcs->guest_ss_selector = value;
+               break;
+       case GUEST_DS_SELECTOR:
+               current_evmcs->guest_ds_selector = value;
+               break;
+       case GUEST_FS_SELECTOR:
+               current_evmcs->guest_fs_selector = value;
+               break;
+       case GUEST_GS_SELECTOR:
+               current_evmcs->guest_gs_selector = value;
+               break;
+       case GUEST_LDTR_SELECTOR:
+               current_evmcs->guest_ldtr_selector = value;
+               break;
+       case GUEST_TR_SELECTOR:
+               current_evmcs->guest_tr_selector = value;
+               break;
+       case VIRTUAL_PROCESSOR_ID:
+               current_evmcs->virtual_processor_id = value;
+               break;
+       default: return 1;
+       }
+
+       return 0;
+}
+
+static inline int evmcs_vmlaunch(void)
+{
+       int ret;
+
+       current_evmcs->hv_clean_fields = 0;
+
+       __asm__ __volatile__("push %%rbp;"
+                            "push %%rcx;"
+                            "push %%rdx;"
+                            "push %%rsi;"
+                            "push %%rdi;"
+                            "push $0;"
+                            "mov %%rsp, (%[host_rsp]);"
+                            "lea 1f(%%rip), %%rax;"
+                            "mov %%rax, (%[host_rip]);"
+                            "vmlaunch;"
+                            "incq (%%rsp);"
+                            "1: pop %%rax;"
+                            "pop %%rdi;"
+                            "pop %%rsi;"
+                            "pop %%rdx;"
+                            "pop %%rcx;"
+                            "pop %%rbp;"
+                            : [ret]"=&a"(ret)
+                            : [host_rsp]"r"
+                              ((uint64_t)&current_evmcs->host_rsp),
+                              [host_rip]"r"
+                              ((uint64_t)&current_evmcs->host_rip)
+                            : "memory", "cc", "rbx", "r8", "r9", "r10",
+                              "r11", "r12", "r13", "r14", "r15");
+       return ret;
+}
+
+/*
+ * No guest state (e.g. GPRs) is established by this vmresume.
+ */
+static inline int evmcs_vmresume(void)
+{
+       int ret;
+
+       current_evmcs->hv_clean_fields = 0;
+
+       __asm__ __volatile__("push %%rbp;"
+                            "push %%rcx;"
+                            "push %%rdx;"
+                            "push %%rsi;"
+                            "push %%rdi;"
+                            "push $0;"
+                            "mov %%rsp, (%[host_rsp]);"
+                            "lea 1f(%%rip), %%rax;"
+                            "mov %%rax, (%[host_rip]);"
+                            "vmresume;"
+                            "incq (%%rsp);"
+                            "1: pop %%rax;"
+                            "pop %%rdi;"
+                            "pop %%rsi;"
+                            "pop %%rdx;"
+                            "pop %%rcx;"
+                            "pop %%rbp;"
+                            : [ret]"=&a"(ret)
+                            : [host_rsp]"r"
+                              ((uint64_t)&current_evmcs->host_rsp),
+                              [host_rip]"r"
+                              ((uint64_t)&current_evmcs->host_rip)
+                            : "memory", "cc", "rbx", "r8", "r9", "r10",
+                              "r11", "r12", "r13", "r14", "r15");
+       return ret;
+}
+
+#endif /* !SELFTEST_KVM_EVMCS_H */
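The evmcs_* helpers above shadow the VMX instruction wrappers: writes land in the in-memory enlightened VMCS, and evmcs_vmlaunch()/evmcs_vmresume() clear hv_clean_fields before entering the guest, returning non-zero when VM-entry fails (the "incq (%rsp)" path). A minimal sketch of how a test could drive them, assuming enable_evmcs is set and current_evmcs already points at the mapped enlightened VMCS page; example_evmcs_entry() is a hypothetical helper, not part of the patch:

    #include "vmx.h"

    /* Illustration only: assumes the caller has already set up current_evmcs
     * (e.g. via evmcs_vmptrld()) and the VP assist page. */
    static inline int example_evmcs_entry(uint64_t guest_rip, uint64_t guest_rsp)
    {
            /* These stores go to the enlightened VMCS, not through VMWRITE. */
            evmcs_vmwrite(GUEST_RIP, guest_rip);
            evmcs_vmwrite(GUEST_RSP, guest_rsp);

            /* Non-zero return means VM-entry failed. */
            return evmcs_vmlaunch();
    }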
index 3acf9a9..a4e59e3 100644 (file)
@@ -7,7 +7,7 @@
  *
  */
 #ifndef SELFTEST_KVM_UTIL_H
-#define SELFTEST_KVM_UTIL_H 1
+#define SELFTEST_KVM_UTIL_H
 
 #include "test_util.h"
 
 
 #include "sparsebit.h"
 
-/*
- * Memslots can't cover the gfn starting at this gpa otherwise vCPUs can't be
- * created. Only applies to VMs using EPT.
- */
-#define KVM_DEFAULT_IDENTITY_MAP_ADDRESS 0xfffbc000ul
-
 
 /* Callers of kvm_util only have an incomplete/opaque description of the
  * structure kvm_util is using to maintain the state of a VM.
@@ -33,16 +27,23 @@ typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */
 typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
 
 /* Minimum allocated guest virtual and physical addresses */
-#define KVM_UTIL_MIN_VADDR 0x2000
+#define KVM_UTIL_MIN_VADDR             0x2000
 
 #define DEFAULT_GUEST_PHY_PAGES                512
 #define DEFAULT_GUEST_STACK_VADDR_MIN  0xab6000
-#define DEFAULT_STACK_PGS               5
+#define DEFAULT_STACK_PGS              5
 
 enum vm_guest_mode {
-       VM_MODE_FLAT48PG,
+       VM_MODE_P52V48_4K,
+       VM_MODE_P52V48_64K,
+       VM_MODE_P40V48_4K,
+       VM_MODE_P40V48_64K,
+       NUM_VM_MODES,
 };
 
+#define vm_guest_mode_string(m) vm_guest_mode_string[m]
+extern const char * const vm_guest_mode_string[];
+
 enum vm_mem_backing_src_type {
        VM_MEM_SRC_ANONYMOUS,
        VM_MEM_SRC_ANONYMOUS_THP,
@@ -58,15 +59,15 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm);
 void kvm_vm_release(struct kvm_vm *vmp);
 void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log);
 
-int kvm_memcmp_hva_gva(void *hva,
-       struct kvm_vm *vm, const vm_vaddr_t gva, size_t len);
+int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva,
+                      size_t len);
 
 void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename,
-       uint32_t data_memslot, uint32_t pgd_memslot);
+                    uint32_t data_memslot, uint32_t pgd_memslot);
 
 void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
-void vcpu_dump(FILE *stream, struct kvm_vm *vm,
-       uint32_t vcpuid, uint8_t indent);
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid,
+              uint8_t indent);
 
 void vm_create_irqchip(struct kvm_vm *vm);
 
@@ -75,13 +76,14 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
        uint64_t guest_paddr, uint32_t slot, uint64_t npages,
        uint32_t flags);
 
-void vcpu_ioctl(struct kvm_vm *vm,
-       uint32_t vcpuid, unsigned long ioctl, void *arg);
+void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
+               void *arg);
 void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
 void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
-void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_memslot);
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot,
+                int gdt_memslot);
 vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
-       uint32_t data_memslot, uint32_t pgd_memslot);
+                         uint32_t data_memslot, uint32_t pgd_memslot);
 void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
              size_t size, uint32_t pgd_memslot);
 void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa);
@@ -93,56 +95,35 @@ struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
 int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
-       struct kvm_mp_state *mp_state);
-void vcpu_regs_get(struct kvm_vm *vm,
-       uint32_t vcpuid, struct kvm_regs *regs);
-void vcpu_regs_set(struct kvm_vm *vm,
-       uint32_t vcpuid, struct kvm_regs *regs);
+                      struct kvm_mp_state *mp_state);
+void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
+void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
 void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...);
-void vcpu_sregs_get(struct kvm_vm *vm,
-       uint32_t vcpuid, struct kvm_sregs *sregs);
-void vcpu_sregs_set(struct kvm_vm *vm,
-       uint32_t vcpuid, struct kvm_sregs *sregs);
-int _vcpu_sregs_set(struct kvm_vm *vm,
-       uint32_t vcpuid, struct kvm_sregs *sregs);
+void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid,
+                   struct kvm_sregs *sregs);
+void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid,
+                   struct kvm_sregs *sregs);
+int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid,
+                   struct kvm_sregs *sregs);
 void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
-                         struct kvm_vcpu_events *events);
+                    struct kvm_vcpu_events *events);
 void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
-                         struct kvm_vcpu_events *events);
-uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index);
-void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
-       uint64_t msr_value);
+                    struct kvm_vcpu_events *events);
 
 const char *exit_reason_str(unsigned int exit_reason);
 
 void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot);
 void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
-       uint32_t pgd_memslot);
-vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm,
-       vm_paddr_t paddr_min, uint32_t memslot);
-
-struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
-void vcpu_set_cpuid(
-       struct kvm_vm *vm, uint32_t vcpuid, struct kvm_cpuid2 *cpuid);
-
-struct kvm_cpuid_entry2 *
-kvm_get_supported_cpuid_index(uint32_t function, uint32_t index);
-
-static inline struct kvm_cpuid_entry2 *
-kvm_get_supported_cpuid_entry(uint32_t function)
-{
-       return kvm_get_supported_cpuid_index(function, 0);
-}
+                uint32_t pgd_memslot);
+vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
+                            uint32_t memslot);
+vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
+                             vm_paddr_t paddr_min, uint32_t memslot);
 
 struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_size,
                                 void *guest_code);
 void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
 
-typedef void (*vmx_guest_code_t)(vm_vaddr_t vmxon_vaddr,
-                                vm_paddr_t vmxon_paddr,
-                                vm_vaddr_t vmcs_vaddr,
-                                vm_paddr_t vmcs_paddr);
-
 struct kvm_userspace_memory_region *
 kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
                                 uint64_t end);
@@ -152,43 +133,49 @@ allocate_kvm_dirty_log(struct kvm_userspace_memory_region *region);
 
 int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd);
 
-#define GUEST_PORT_SYNC         0x1000
-#define GUEST_PORT_ABORT        0x1001
-#define GUEST_PORT_DONE         0x1002
-
-static inline void __exit_to_l0(uint16_t port, uint64_t arg0, uint64_t arg1)
-{
-       __asm__ __volatile__("in %[port], %%al"
-                            :
-                            : [port]"d"(port), "D"(arg0), "S"(arg1)
-                            : "rax");
-}
-
-/*
- * Allows to pass three arguments to the host: port is 16bit wide,
- * arg0 & arg1 are 64bit wide
- */
-#define GUEST_SYNC_ARGS(_port, _arg0, _arg1) \
-       __exit_to_l0(_port, (uint64_t) (_arg0), (uint64_t) (_arg1))
-
-#define GUEST_ASSERT(_condition) do {                          \
-               if (!(_condition))                              \
-                       GUEST_SYNC_ARGS(GUEST_PORT_ABORT,       \
-                                       "Failed guest assert: " \
-                                       #_condition, __LINE__); \
-       } while (0)
-
-#define GUEST_SYNC(stage)  GUEST_SYNC_ARGS(GUEST_PORT_SYNC, "hello", stage)
+#define sync_global_to_guest(vm, g) ({                         \
+       typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g));     \
+       memcpy(_p, &(g), sizeof(g));                            \
+})
+
+#define sync_global_from_guest(vm, g) ({                       \
+       typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g));     \
+       memcpy(&(g), _p, sizeof(g));                            \
+})
+
+/* ucall implementation types */
+typedef enum {
+       UCALL_PIO,
+       UCALL_MMIO,
+} ucall_type_t;
+
+/* Common ucalls */
+enum {
+       UCALL_NONE,
+       UCALL_SYNC,
+       UCALL_ABORT,
+       UCALL_DONE,
+};
 
-#define GUEST_DONE()  GUEST_SYNC_ARGS(GUEST_PORT_DONE, 0, 0)
+#define UCALL_MAX_ARGS 6
 
-struct guest_args {
-       uint64_t arg0;
-       uint64_t arg1;
-       uint16_t port;
-} __attribute__ ((packed));
+struct ucall {
+       uint64_t cmd;
+       uint64_t args[UCALL_MAX_ARGS];
+};
 
-void guest_args_read(struct kvm_vm *vm, uint32_t vcpu_id,
-                    struct guest_args *args);
+void ucall_init(struct kvm_vm *vm, ucall_type_t type, void *arg);
+void ucall_uninit(struct kvm_vm *vm);
+void ucall(uint64_t cmd, int nargs, ...);
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);
+
+#define GUEST_SYNC(stage)      ucall(UCALL_SYNC, 2, "hello", stage)
+#define GUEST_DONE()           ucall(UCALL_DONE, 0)
+#define GUEST_ASSERT(_condition) do {                  \
+       if (!(_condition))                              \
+               ucall(UCALL_ABORT, 2,                   \
+                       "Failed guest assert: "         \
+                       #_condition, __LINE__);         \
+} while (0)
 
 #endif /* SELFTEST_KVM_UTIL_H */
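The GUEST_PORT_* plumbing is replaced by the architecture-neutral ucall interface: guest code issues ucall(UCALL_SYNC/UCALL_ABORT/UCALL_DONE, ...) over PIO or MMIO, and the host decodes it with get_ucall(). A hedged sketch of the host-side loop these selftests typically pair with GUEST_SYNC()/GUEST_DONE() in guest code; example_run_guest() is a hypothetical helper:

    #include "test_util.h"
    #include "kvm_util.h"

    static void example_run_guest(struct kvm_vm *vm, uint32_t vcpuid)
    {
            struct ucall uc;

            for (;;) {
                    vcpu_run(vm, vcpuid);

                    switch (get_ucall(vm, vcpuid, &uc)) {
                    case UCALL_SYNC:
                            /* uc.args[1] is the stage passed to GUEST_SYNC() */
                            continue;
                    case UCALL_ABORT:
                            TEST_ASSERT(false, "%s at line %lu",
                                        (const char *)uc.args[0], uc.args[1]);
                            /* not reached */
                    case UCALL_DONE:
                            return;
                    default:
                            TEST_ASSERT(false, "Unexpected ucall %lu", uc.cmd);
                    }
            }
    }

sync_global_to_guest()/sync_global_from_guest() complement this by copying a named global between host and guest address spaces around such runs.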
index 54cfeb6..31e0309 100644 (file)
@@ -15,8 +15,8 @@
  * even in the case where most bits are set.
  */
 
-#ifndef _TEST_SPARSEBIT_H_
-#define _TEST_SPARSEBIT_H_
+#ifndef SELFTEST_KVM_SPARSEBIT_H
+#define SELFTEST_KVM_SPARSEBIT_H
 
 #include <stdbool.h>
 #include <stdint.h>
@@ -72,4 +72,4 @@ void sparsebit_validate_internal(struct sparsebit *sbit);
 }
 #endif
 
-#endif /* _TEST_SPARSEBIT_H_ */
+#endif /* SELFTEST_KVM_SPARSEBIT_H */
index 73c3933..c7dafe8 100644 (file)
@@ -7,8 +7,8 @@
  *
  */
 
-#ifndef TEST_UTIL_H
-#define TEST_UTIL_H 1
+#ifndef SELFTEST_KVM_TEST_UTIL_H
+#define SELFTEST_KVM_TEST_UTIL_H
 
 #include <stdlib.h>
 #include <stdarg.h>
@@ -41,4 +41,4 @@ void test_assert(bool exp, const char *exp_str,
                    #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); \
 } while (0)
 
-#endif /* TEST_UTIL_H */
+#endif /* SELFTEST_KVM_TEST_UTIL_H */
@@ -1,5 +1,5 @@
 /*
- * tools/testing/selftests/kvm/include/x86.h
+ * tools/testing/selftests/kvm/include/x86_64/processor.h
  *
  * Copyright (C) 2018, Google LLC.
  *
@@ -7,8 +7,8 @@
  *
  */
 
-#ifndef SELFTEST_KVM_X86_H
-#define SELFTEST_KVM_X86_H
+#ifndef SELFTEST_KVM_PROCESSOR_H
+#define SELFTEST_KVM_PROCESSOR_H
 
 #include <assert.h>
 #include <stdint.h>
@@ -305,7 +305,25 @@ static inline unsigned long get_xmm(int n)
 
 struct kvm_x86_state;
 struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
-void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state);
+void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
+                    struct kvm_x86_state *state);
+
+struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
+void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
+                   struct kvm_cpuid2 *cpuid);
+
+struct kvm_cpuid_entry2 *
+kvm_get_supported_cpuid_index(uint32_t function, uint32_t index);
+
+static inline struct kvm_cpuid_entry2 *
+kvm_get_supported_cpuid_entry(uint32_t function)
+{
+       return kvm_get_supported_cpuid_index(function, 0);
+}
+
+uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index);
+void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+                 uint64_t msr_value);
 
 /*
  * Basic CPU control in CR0
@@ -1044,4 +1062,4 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
 #define MSR_VM_IGNNE                    0xc0010115
 #define MSR_VM_HSAVE_PA                 0xc0010117
 
-#endif /* !SELFTEST_KVM_X86_H */
+#endif /* SELFTEST_KVM_PROCESSOR_H */
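The CPUID and MSR accessors move into this header because they are x86-specific. A hedged usage sketch; example_setup_vcpu() is hypothetical, and MSR_EFER is assumed to be among the MSR definitions carried in this header:

    #include "kvm_util.h"
    #include "processor.h"

    static void example_setup_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
    {
            uint64_t efer;

            /* Expose everything KVM supports to the guest. */
            vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid());

            /* Read-modify-write of an MSR through the relocated helpers. */
            efer = vcpu_get_msr(vm, vcpuid, MSR_EFER);
            vcpu_set_msr(vm, vcpuid, MSR_EFER, efer);
    }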
similarity index 96%
rename from tools/testing/selftests/kvm/include/vmx.h
rename to tools/testing/selftests/kvm/include/x86_64/vmx.h
index b9ffe10..c9bd935 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * tools/testing/selftests/kvm/include/vmx.h
+ * tools/testing/selftests/kvm/include/x86_64/vmx.h
  *
  * Copyright (C) 2018, Google LLC.
  *
@@ -11,7 +11,7 @@
 #define SELFTEST_KVM_VMX_H
 
 #include <stdint.h>
-#include "x86.h"
+#include "processor.h"
 
 #define CPUID_VMX_BIT                          5
 
@@ -339,6 +339,8 @@ struct vmx_msr_entry {
        uint64_t value;
 } __attribute__ ((aligned(16)));
 
+#include "evmcs.h"
+
 static inline int vmxon(uint64_t phys)
 {
        uint8_t ret;
@@ -372,6 +374,9 @@ static inline int vmptrld(uint64_t vmcs_pa)
 {
        uint8_t ret;
 
+       if (enable_evmcs)
+               return -1;
+
        __asm__ __volatile__ ("vmptrld %[pa]; setna %[ret]"
                : [ret]"=rm"(ret)
                : [pa]"m"(vmcs_pa)
@@ -385,6 +390,9 @@ static inline int vmptrst(uint64_t *value)
        uint64_t tmp;
        uint8_t ret;
 
+       if (enable_evmcs)
+               return evmcs_vmptrst(value);
+
        __asm__ __volatile__("vmptrst %[value]; setna %[ret]"
                : [value]"=m"(tmp), [ret]"=rm"(ret)
                : : "cc", "memory");
@@ -411,6 +419,9 @@ static inline int vmlaunch(void)
 {
        int ret;
 
+       if (enable_evmcs)
+               return evmcs_vmlaunch();
+
        __asm__ __volatile__("push %%rbp;"
                             "push %%rcx;"
                             "push %%rdx;"
@@ -443,6 +454,9 @@ static inline int vmresume(void)
 {
        int ret;
 
+       if (enable_evmcs)
+               return evmcs_vmresume();
+
        __asm__ __volatile__("push %%rbp;"
                             "push %%rcx;"
                             "push %%rdx;"
@@ -482,6 +496,9 @@ static inline int vmread(uint64_t encoding, uint64_t *value)
        uint64_t tmp;
        uint8_t ret;
 
+       if (enable_evmcs)
+               return evmcs_vmread(encoding, value);
+
        __asm__ __volatile__("vmread %[encoding], %[value]; setna %[ret]"
                : [value]"=rm"(tmp), [ret]"=rm"(ret)
                : [encoding]"r"(encoding)
@@ -506,6 +523,9 @@ static inline int vmwrite(uint64_t encoding, uint64_t value)
 {
        uint8_t ret;
 
+       if (enable_evmcs)
+               return evmcs_vmwrite(encoding, value);
+
        __asm__ __volatile__ ("vmwrite %[value], %[encoding]; setna %[ret]"
                : [ret]"=rm"(ret)
                : [value]"rm"(value), [encoding]"r"(encoding)
@@ -543,10 +563,19 @@ struct vmx_pages {
        void *vmwrite_hva;
        uint64_t vmwrite_gpa;
        void *vmwrite;
+
+       void *vp_assist_hva;
+       uint64_t vp_assist_gpa;
+       void *vp_assist;
+
+       void *enlightened_vmcs_hva;
+       uint64_t enlightened_vmcs_gpa;
+       void *enlightened_vmcs;
 };
 
 struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva);
 bool prepare_for_vmx_operation(struct vmx_pages *vmx);
 void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp);
+bool load_vmcs(struct vmx_pages *vmx);
 
-#endif /* !SELFTEST_KVM_VMX_H */
+#endif /* SELFTEST_KVM_VMX_H */
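With the enable_evmcs gates above, vmptrst()/vmlaunch()/vmresume()/vmread()/vmwrite() transparently route to their evmcs_* counterparts, and the new load_vmcs() hides the choice between a plain VMPTRLD and an enlightened-VMCS load. A hedged sketch of the L1 guest-code ordering a test built on these helpers would follow; the example_* names are hypothetical, and handling of the resulting L2 exit is omitted:

    #include "kvm_util.h"
    #include "vmx.h"

    static void example_l2_guest_code(void)
    {
            /* Placeholder L2 body; a real test would exit back to L1. */
            for (;;)
                    ;
    }

    static void example_l1_guest_code(struct vmx_pages *vmx_pages)
    {
            unsigned long l2_stack[128];

            GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
            /* VMPTRLD the current VMCS, or load the enlightened VMCS instead. */
            GUEST_ASSERT(load_vmcs(vmx_pages));
            prepare_vmcs(vmx_pages, example_l2_guest_code, &l2_stack[128]);
            /* Routed to evmcs_vmlaunch() when enable_evmcs is set. */
            GUEST_ASSERT(!vmlaunch());
    }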
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
new file mode 100644 (file)
index 0000000..b6022e2
--- /dev/null
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AArch64 code
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+
+#define KVM_GUEST_PAGE_TABLE_MIN_PADDR         0x180000
+#define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN    0xac0000
+
+static uint64_t page_align(struct kvm_vm *vm, uint64_t v)
+{
+       return (v + vm->page_size) & ~(vm->page_size - 1);
+}
+
+static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+       unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift;
+       uint64_t mask = (1UL << (vm->va_bits - shift)) - 1;
+
+       return (gva >> shift) & mask;
+}
+
+static uint64_t pud_index(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+       unsigned int shift = 2 * (vm->page_shift - 3) + vm->page_shift;
+       uint64_t mask = (1UL << (vm->page_shift - 3)) - 1;
+
+       TEST_ASSERT(vm->pgtable_levels == 4,
+               "Mode %d does not have 4 page table levels", vm->mode);
+
+       return (gva >> shift) & mask;
+}
+
+static uint64_t pmd_index(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+       unsigned int shift = (vm->page_shift - 3) + vm->page_shift;
+       uint64_t mask = (1UL << (vm->page_shift - 3)) - 1;
+
+       TEST_ASSERT(vm->pgtable_levels >= 3,
+               "Mode %d does not have >= 3 page table levels", vm->mode);
+
+       return (gva >> shift) & mask;
+}
+
+static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+       uint64_t mask = (1UL << (vm->page_shift - 3)) - 1;
+       return (gva >> vm->page_shift) & mask;
+}
+
+static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry)
+{
+       uint64_t mask = ((1UL << (vm->va_bits - vm->page_shift)) - 1) << vm->page_shift;
+       return entry & mask;
+}
+
+static uint64_t ptrs_per_pgd(struct kvm_vm *vm)
+{
+       unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift;
+       return 1 << (vm->va_bits - shift);
+}
+
+static uint64_t ptrs_per_pte(struct kvm_vm *vm)
+{
+       return 1 << (vm->page_shift - 3);
+}
+
+void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
+{
+       int rc;
+
+       if (!vm->pgd_created) {
+               vm_paddr_t paddr = vm_phy_pages_alloc(vm,
+                       page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size,
+                       KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+               vm->pgd = paddr;
+               vm->pgd_created = true;
+       }
+}
+
+void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+                 uint32_t pgd_memslot, uint64_t flags)
+{
+       uint8_t attr_idx = flags & 7;
+       uint64_t *ptep;
+
+       TEST_ASSERT((vaddr % vm->page_size) == 0,
+               "Virtual address not on page boundary,\n"
+               "  vaddr: 0x%lx vm->page_size: 0x%x", vaddr, vm->page_size);
+       TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
+               (vaddr >> vm->page_shift)),
+               "Invalid virtual address, vaddr: 0x%lx", vaddr);
+       TEST_ASSERT((paddr % vm->page_size) == 0,
+               "Physical address not on page boundary,\n"
+               "  paddr: 0x%lx vm->page_size: 0x%x", paddr, vm->page_size);
+       TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
+               "Physical address beyond maximum supported,\n"

+               "  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+               paddr, vm->max_gfn, vm->page_size);
+
+       ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8;
+       if (!*ptep) {
+               *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+               *ptep |= 3;
+       }
+
+       switch (vm->pgtable_levels) {
+       case 4:
+               ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8;
+               if (!*ptep) {
+                       *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+                       *ptep |= 3;
+               }
+               /* fall through */
+       case 3:
+               ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, vaddr) * 8;
+               if (!*ptep) {
+                       *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+                       *ptep |= 3;
+               }
+               /* fall through */
+       case 2:
+               ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8;
+               break;
+       default:
+               TEST_ASSERT(false, "Page table levels must be 2, 3, or 4");
+       }
+
+       *ptep = paddr | 3;
+       *ptep |= (attr_idx << 2) | (1 << 10) /* Access Flag */;
+}
+
+void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+                uint32_t pgd_memslot)
+{
+       uint64_t attr_idx = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */
+
+       _virt_pg_map(vm, vaddr, paddr, pgd_memslot, attr_idx);
+}
+
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+       uint64_t *ptep;
+
+       if (!vm->pgd_created)
+               goto unmapped_gva;
+
+       ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8;
+       if (!ptep)
+               goto unmapped_gva;
+
+       switch (vm->pgtable_levels) {
+       case 4:
+               ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8;
+               if (!ptep)
+                       goto unmapped_gva;
+               /* fall through */
+       case 3:
+               ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, gva) * 8;
+               if (!ptep)
+                       goto unmapped_gva;
+               /* fall through */
+       case 2:
+               ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, gva) * 8;
+               if (!ptep)
+                       goto unmapped_gva;
+               break;
+       default:
+               TEST_ASSERT(false, "Page table levels must be 2, 3, or 4");
+       }
+
+       return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1));
+
+unmapped_gva:
+       TEST_ASSERT(false, "No mapping for vm virtual address, "
+                   "gva: 0x%lx", gva);
+}
+
+static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level)
+{
+#ifdef DEBUG_VM
+       static const char * const type[] = { "", "pud", "pmd", "pte" };
+       uint64_t pte, *ptep;
+
+       if (level == 4)
+               return;
+
+       for (pte = page; pte < page + ptrs_per_pte(vm) * 8; pte += 8) {
+               ptep = addr_gpa2hva(vm, pte);
+               if (!*ptep)
+                       continue;
+               printf("%*s%s: %lx: %lx at %p\n", indent, "", type[level], pte, *ptep, ptep);
+               pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level + 1);
+       }
+#endif
+}
+
+void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+{
+       int level = 4 - (vm->pgtable_levels - 1);
+       uint64_t pgd, *ptep;
+
+       if (!vm->pgd_created)
+               return;
+
+       for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pgd(vm) * 8; pgd += 8) {
+               ptep = addr_gpa2hva(vm, pgd);
+               if (!*ptep)
+                       continue;
+               printf("%*spgd: %lx: %lx at %p\n", indent, "", pgd, *ptep, ptep);
+               pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level);
+       }
+}
+
+struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
+                                void *guest_code)
+{
+       uint64_t ptrs_per_4k_pte = 512;
+       uint64_t extra_pg_pages = (extra_mem_pages / ptrs_per_4k_pte) * 2;
+       struct kvm_vm *vm;
+
+       vm = vm_create(VM_MODE_P52V48_4K, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
+
+       kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+       vm_vcpu_add_default(vm, vcpuid, guest_code);
+
+       return vm;
+}
+
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
+{
+       size_t stack_size = vm->page_size == 4096 ?
+                                       DEFAULT_STACK_PGS * vm->page_size :
+                                       vm->page_size;
+       uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size,
+                                       DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, 0, 0);
+
+       vm_vcpu_add(vm, vcpuid, 0, 0);
+
+       set_reg(vm, vcpuid, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size);
+       set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code);
+}
+
+void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot)
+{
+       struct kvm_vcpu_init init;
+       uint64_t sctlr_el1, tcr_el1;
+
+       memset(&init, 0, sizeof(init));
+       init.target = KVM_ARM_TARGET_GENERIC_V8;
+       vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_INIT, &init);
+
+       /*
+        * Enable FP/ASIMD to avoid trapping when accessing Q0-Q15
+        * registers, which the variable argument list macros do.
+        */
+       set_reg(vm, vcpuid, ARM64_SYS_REG(CPACR_EL1), 3 << 20);
+
+       get_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), &sctlr_el1);
+       get_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), &tcr_el1);
+
+       switch (vm->mode) {
+       case VM_MODE_P52V48_4K:
+               tcr_el1 |= 0ul << 14; /* TG0 = 4KB */
+               tcr_el1 |= 6ul << 32; /* IPS = 52 bits */
+               break;
+       case VM_MODE_P52V48_64K:
+               tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
+               tcr_el1 |= 6ul << 32; /* IPS = 52 bits */
+               break;
+       case VM_MODE_P40V48_4K:
+               tcr_el1 |= 0ul << 14; /* TG0 = 4KB */
+               tcr_el1 |= 2ul << 32; /* IPS = 40 bits */
+               break;
+       case VM_MODE_P40V48_64K:
+               tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
+               tcr_el1 |= 2ul << 32; /* IPS = 40 bits */
+               break;
+       default:
+               TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode);
+       }
+
+       sctlr_el1 |= (1 << 0) | (1 << 2) | (1 << 12) /* M | C | I */;
+       /* TCR_EL1 |= IRGN0:WBWA | ORGN0:WBWA | SH0:Inner-Shareable */;
+       tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12);
+       tcr_el1 |= (64 - vm->va_bits) /* T0SZ */;
+
+       set_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), sctlr_el1);
+       set_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), tcr_el1);
+       set_reg(vm, vcpuid, ARM64_SYS_REG(MAIR_EL1), DEFAULT_MAIR_EL1);
+       set_reg(vm, vcpuid, ARM64_SYS_REG(TTBR0_EL1), vm->pgd);
+}
+
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
+{
+       uint64_t pstate, pc;
+
+       get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate);
+       get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc);
+
+        fprintf(stream, "%*spstate: 0x%.16llx pc: 0x%.16llx\n",
+                indent, "", pstate, pc);
+
+}
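For reference, the index helpers in this file reduce to the familiar AArch64 translation layout. A standalone worked example for VM_MODE_P52V48_4K (page_shift = 12, pgtable_levels = 4, va_bits = 48), illustration only:

    #include <stdio.h>

    int main(void)
    {
            unsigned int page_shift = 12, levels = 4, va_bits = 48;
            unsigned int pgd_shift = (levels - 1) * (page_shift - 3) + page_shift; /* 39 */
            unsigned int pud_shift = 2 * (page_shift - 3) + page_shift;            /* 30 */
            unsigned int pmd_shift = (page_shift - 3) + page_shift;                /* 21 */

            /* 512 entries per lower level, 512 PGD entries for a 48-bit VA space */
            printf("pgd=%u pud=%u pmd=%u pte=%u pgd_entries=%lu pte_entries=%lu\n",
                   pgd_shift, pud_shift, pmd_shift, page_shift,
                   1UL << (va_bits - pgd_shift), 1UL << (page_shift - 3));
            return 0;
    }

With 64K pages (page_shift = 16) and three levels the same formulas give shifts of 42/29/16, matching the P*V48_64K modes handled in vcpu_setup().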
index cd01144..6398efe 100644 (file)
@@ -13,7 +13,7 @@
 #include <execinfo.h>
 #include <sys/syscall.h>
 
-#include "../../kselftest.h"
+#include "kselftest.h"
 
 /* Dumps the current stack trace to stderr. */
 static void __attribute__((noinline)) test_dump_stack(void);
index 6fd8c08..8c06da4 100644 (file)
 #include <sys/stat.h>
 #include <linux/kernel.h>
 
-#define KVM_DEV_PATH "/dev/kvm"
-
 #define KVM_UTIL_PGS_PER_HUGEPG 512
-#define KVM_UTIL_MIN_PADDR      0x2000
+#define KVM_UTIL_MIN_PFN       2
 
 /* Aligns x up to the next multiple of size. Size must be a power of 2. */
 static void *align(void *x, size_t size)
@@ -30,7 +28,8 @@ static void *align(void *x, size_t size)
        return (void *) (((size_t) x + mask) & ~mask);
 }
 
-/* Capability
+/*
+ * Capability
  *
  * Input Args:
  *   cap - Capability
@@ -92,16 +91,23 @@ static void vm_open(struct kvm_vm *vm, int perm)
        if (vm->kvm_fd < 0)
                exit(KSFT_SKIP);
 
-       /* Create VM. */
        vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, NULL);
        TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, "
                "rc: %i errno: %i", vm->fd, errno);
 }
 
-/* VM Create
+const char * const vm_guest_mode_string[] = {
+       "PA-bits:52, VA-bits:48, 4K pages",
+       "PA-bits:52, VA-bits:48, 64K pages",
+       "PA-bits:40, VA-bits:48, 4K pages",
+       "PA-bits:40, VA-bits:48, 64K pages",
+};
+
+/*
+ * VM Create
  *
  * Input Args:
- *   mode - VM Mode (e.g. VM_MODE_FLAT48PG)
+ *   mode - VM Mode (e.g. VM_MODE_P52V48_4K)
  *   phy_pages - Physical memory pages
  *   perm - permission
  *
@@ -110,7 +116,7 @@ static void vm_open(struct kvm_vm *vm, int perm)
  * Return:
  *   Pointer to opaque structure that describes the created VM.
  *
- * Creates a VM with the mode specified by mode (e.g. VM_MODE_FLAT48PG).
+ * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K).
  * When phy_pages is non-zero, a memory region of phy_pages physical pages
  * is created and mapped starting at guest physical address 0.  The file
  * descriptor to control the created VM is created with the permissions
@@ -121,7 +127,6 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
        struct kvm_vm *vm;
        int kvm_fd;
 
-       /* Allocate memory. */
        vm = calloc(1, sizeof(*vm));
        TEST_ASSERT(vm != NULL, "Insufficent Memory");
 
@@ -130,26 +135,48 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 
        /* Setup mode specific traits. */
        switch (vm->mode) {
-       case VM_MODE_FLAT48PG:
+       case VM_MODE_P52V48_4K:
+               vm->pgtable_levels = 4;
                vm->page_size = 0x1000;
                vm->page_shift = 12;
-
-               /* Limit to 48-bit canonical virtual addresses. */
-               vm->vpages_valid = sparsebit_alloc();
-               sparsebit_set_num(vm->vpages_valid,
-                       0, (1ULL << (48 - 1)) >> vm->page_shift);
-               sparsebit_set_num(vm->vpages_valid,
-                       (~((1ULL << (48 - 1)) - 1)) >> vm->page_shift,
-                       (1ULL << (48 - 1)) >> vm->page_shift);
-
-               /* Limit physical addresses to 52-bits. */
-               vm->max_gfn = ((1ULL << 52) >> vm->page_shift) - 1;
+               vm->va_bits = 48;
+               break;
+       case VM_MODE_P52V48_64K:
+               vm->pgtable_levels = 3;
+               vm->pa_bits = 52;
+               vm->page_size = 0x10000;
+               vm->page_shift = 16;
+               vm->va_bits = 48;
+               break;
+       case VM_MODE_P40V48_4K:
+               vm->pgtable_levels = 4;
+               vm->pa_bits = 40;
+               vm->va_bits = 48;
+               vm->page_size = 0x1000;
+               vm->page_shift = 12;
+               break;
+       case VM_MODE_P40V48_64K:
+               vm->pgtable_levels = 3;
+               vm->pa_bits = 40;
+               vm->va_bits = 48;
+               vm->page_size = 0x10000;
+               vm->page_shift = 16;
                break;
-
        default:
                TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode);
        }
 
+       /* Limit to VA-bit canonical virtual addresses. */
+       vm->vpages_valid = sparsebit_alloc();
+       sparsebit_set_num(vm->vpages_valid,
+               0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift);
+       sparsebit_set_num(vm->vpages_valid,
+               (~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift,
+               (1ULL << (vm->va_bits - 1)) >> vm->page_shift);
+
+       /* Limit physical addresses to PA-bits. */
+       vm->max_gfn = ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
+
        /* Allocate and setup memory for guest. */
        vm->vpages_mapped = sparsebit_alloc();
        if (phy_pages != 0)
@@ -159,7 +186,8 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
        return vm;
 }
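A hedged usage sketch of the new mode enumeration; example_create() and the mode choice are hypothetical, and the 64K modes are primarily intended for aarch64 tests:

    #include <stdio.h>
    #include <fcntl.h>
    #include "kvm_util.h"

    static struct kvm_vm *example_create(void)
    {
            enum vm_guest_mode mode = VM_MODE_P40V48_64K;  /* e.g. for an aarch64 test */
            struct kvm_vm *vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES, O_RDWR);

            printf("Guest mode: %s\n", vm_guest_mode_string(mode));
            return vm;
    }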
 
-/* VM Restart
+/*
+ * VM Restart
  *
  * Input Args:
  *   vm - VM that has been released before
@@ -186,7 +214,8 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm)
                            "  rc: %i errno: %i\n"
                            "  slot: %u flags: 0x%x\n"
                            "  guest_phys_addr: 0x%lx size: 0x%lx",
-                           ret, errno, region->region.slot, region->region.flags,
+                           ret, errno, region->region.slot,
+                           region->region.flags,
                            region->region.guest_phys_addr,
                            region->region.memory_size);
        }
@@ -202,7 +231,8 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
                    strerror(-ret));
 }
 
-/* Userspace Memory Region Find
+/*
+ * Userspace Memory Region Find
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -220,8 +250,8 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
  * of the regions is returned.  Null is returned only when no overlapping
  * region exists.
  */
-static struct userspace_mem_region *userspace_mem_region_find(
-       struct kvm_vm *vm, uint64_t start, uint64_t end)
+static struct userspace_mem_region *
+userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
 {
        struct userspace_mem_region *region;
 
@@ -237,7 +267,8 @@ static struct userspace_mem_region *userspace_mem_region_find(
        return NULL;
 }
 
-/* KVM Userspace Memory Region Find
+/*
+ * KVM Userspace Memory Region Find
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -265,7 +296,8 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
        return &region->region;
 }
 
-/* VCPU Find
+/*
+ * VCPU Find
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -280,8 +312,7 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
  * returns a pointer to it.  Returns NULL if the VM doesn't contain a VCPU
  * for the specified vcpuid.
  */
-struct vcpu *vcpu_find(struct kvm_vm *vm,
-       uint32_t vcpuid)
+struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid)
 {
        struct vcpu *vcpup;
 
@@ -293,7 +324,8 @@ struct vcpu *vcpu_find(struct kvm_vm *vm,
        return NULL;
 }
 
-/* VM VCPU Remove
+/*
+ * VM VCPU Remove
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -330,11 +362,9 @@ void kvm_vm_release(struct kvm_vm *vmp)
 {
        int ret;
 
-       /* Free VCPUs. */
        while (vmp->vcpu_head)
                vm_vcpu_rm(vmp, vmp->vcpu_head->id);
 
-       /* Close file descriptor for the VM. */
        ret = close(vmp->fd);
        TEST_ASSERT(ret == 0, "Close of vm fd failed,\n"
                "  vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno);
@@ -344,7 +374,8 @@ void kvm_vm_release(struct kvm_vm *vmp)
                "  vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno);
 }
 
-/* Destroys and frees the VM pointed to by vmp.
+/*
+ * Destroys and frees the VM pointed to by vmp.
  */
 void kvm_vm_free(struct kvm_vm *vmp)
 {
@@ -383,7 +414,8 @@ void kvm_vm_free(struct kvm_vm *vmp)
        free(vmp);
 }
 
-/* Memory Compare, host virtual to guest virtual
+/*
+ * Memory Compare, host virtual to guest virtual
  *
  * Input Args:
  *   hva - Starting host virtual address
@@ -405,23 +437,25 @@ void kvm_vm_free(struct kvm_vm *vmp)
  * a length of len, to the guest bytes starting at the guest virtual
  * address given by gva.
  */
-int kvm_memcmp_hva_gva(void *hva,
-       struct kvm_vm *vm, vm_vaddr_t gva, size_t len)
+int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len)
 {
        size_t amt;
 
-       /* Compare a batch of bytes until either a match is found
+       /*
+        * Compare a batch of bytes until either a match is found
         * or all the bytes have been compared.
         */
        for (uintptr_t offset = 0; offset < len; offset += amt) {
                uintptr_t ptr1 = (uintptr_t)hva + offset;
 
-               /* Determine host address for guest virtual address
+               /*
+                * Determine host address for guest virtual address
                 * at offset.
                 */
                uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset);
 
-               /* Determine amount to compare on this pass.
+               /*
+                * Determine amount to compare on this pass.
                 * Don't allow the comparison to cross a page boundary.
                 */
                amt = len - offset;
@@ -433,7 +467,8 @@ int kvm_memcmp_hva_gva(void *hva,
                assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift));
                assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift));
 
-               /* Perform the comparison.  If there is a difference
+               /*
+                * Perform the comparison.  If there is a difference
                 * return that result to the caller, otherwise need
                 * to continue on looking for a mismatch.
                 */
@@ -442,109 +477,15 @@ int kvm_memcmp_hva_gva(void *hva,
                        return ret;
        }
 
-       /* No mismatch found.  Let the caller know the two memory
+       /*
+        * No mismatch found.  Let the caller know the two memory
         * areas are equal.
         */
        return 0;
 }
 
-/* Allocate an instance of struct kvm_cpuid2
- *
- * Input Args: None
- *
- * Output Args: None
- *
- * Return: A pointer to the allocated struct. The caller is responsible
- * for freeing this struct.
- *
- * Since kvm_cpuid2 uses a 0-length array to allow a the size of the
- * array to be decided at allocation time, allocation is slightly
- * complicated. This function uses a reasonable default length for
- * the array and performs the appropriate allocation.
- */
-static struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
-{
-       struct kvm_cpuid2 *cpuid;
-       int nent = 100;
-       size_t size;
-
-       size = sizeof(*cpuid);
-       size += nent * sizeof(struct kvm_cpuid_entry2);
-       cpuid = malloc(size);
-       if (!cpuid) {
-               perror("malloc");
-               abort();
-       }
-
-       cpuid->nent = nent;
-
-       return cpuid;
-}
-
-/* KVM Supported CPUID Get
- *
- * Input Args: None
- *
- * Output Args:
- *
- * Return: The supported KVM CPUID
- *
- * Get the guest CPUID supported by KVM.
- */
-struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
-{
-       static struct kvm_cpuid2 *cpuid;
-       int ret;
-       int kvm_fd;
-
-       if (cpuid)
-               return cpuid;
-
-       cpuid = allocate_kvm_cpuid2();
-       kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
-       if (kvm_fd < 0)
-               exit(KSFT_SKIP);
-
-       ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
-       TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n",
-                   ret, errno);
-
-       close(kvm_fd);
-       return cpuid;
-}
-
-/* Locate a cpuid entry.
- *
- * Input Args:
- *   cpuid: The cpuid.
- *   function: The function of the cpuid entry to find.
- *
- * Output Args: None
- *
- * Return: A pointer to the cpuid entry. Never returns NULL.
- */
-struct kvm_cpuid_entry2 *
-kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
-{
-       struct kvm_cpuid2 *cpuid;
-       struct kvm_cpuid_entry2 *entry = NULL;
-       int i;
-
-       cpuid = kvm_get_supported_cpuid();
-       for (i = 0; i < cpuid->nent; i++) {
-               if (cpuid->entries[i].function == function &&
-                   cpuid->entries[i].index == index) {
-                       entry = &cpuid->entries[i];
-                       break;
-               }
-       }
-
-       TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).",
-                   function, index);
-       return entry;
-}
-
-/* VM Userspace Memory Region Add
+/*
+ * VM Userspace Memory Region Add
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -586,7 +527,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
                "  vm->max_gfn: 0x%lx vm->page_size: 0x%x",
                guest_paddr, npages, vm->max_gfn, vm->page_size);
 
-       /* Confirm a mem region with an overlapping address doesn't
+       /*
+        * Confirm a mem region with an overlapping address doesn't
         * already exist.
         */
        region = (struct userspace_mem_region *) userspace_mem_region_find(
@@ -677,7 +619,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
        vm->userspace_mem_region_head = region;
 }
 
-/* Memslot to region
+/*
+ * Memslot to region
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -691,8 +634,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
  *   on error (e.g. currently no memory region using memslot as a KVM
  *   memory slot ID).
  */
-static struct userspace_mem_region *memslot2region(struct kvm_vm *vm,
-       uint32_t memslot)
+static struct userspace_mem_region *
+memslot2region(struct kvm_vm *vm, uint32_t memslot)
 {
        struct userspace_mem_region *region;
 
@@ -712,7 +655,8 @@ static struct userspace_mem_region *memslot2region(struct kvm_vm *vm,
        return region;
 }
 
-/* VM Memory Region Flags Set
+/*
+ * VM Memory Region Flags Set
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -730,7 +674,6 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
        int ret;
        struct userspace_mem_region *region;
 
-       /* Locate memory region. */
        region = memslot2region(vm, slot);
 
        region->region.flags = flags;
@@ -742,7 +685,8 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
                ret, errno, slot, flags);
 }
 
-/* VCPU mmap Size
+/*
+ * VCPU mmap Size
  *
  * Input Args: None
  *
@@ -772,7 +716,8 @@ static int vcpu_mmap_sz(void)
        return ret;
 }
 
-/* VM VCPU Add
+/*
+ * VM VCPU Add
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -785,7 +730,8 @@ static int vcpu_mmap_sz(void)
  * Creates and adds to the VM specified by vm and virtual CPU with
  * the ID given by vcpuid.
  */
-void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_memslot)
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot,
+                int gdt_memslot)
 {
        struct vcpu *vcpu;
 
@@ -823,7 +769,8 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_me
        vcpu_setup(vm, vcpuid, pgd_memslot, gdt_memslot);
 }
 
-/* VM Virtual Address Unused Gap
+/*
+ * VM Virtual Address Unused Gap
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -843,14 +790,14 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_me
  * sz unallocated bytes >= vaddr_min is available.
  */
 static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
-       vm_vaddr_t vaddr_min)
+                                     vm_vaddr_t vaddr_min)
 {
        uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift;
 
        /* Determine lowest permitted virtual page index. */
        uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift;
        if ((pgidx_start * vm->page_size) < vaddr_min)
-                       goto no_va_found;
+               goto no_va_found;
 
        /* Loop over section with enough valid virtual page indexes. */
        if (!sparsebit_is_set_num(vm->vpages_valid,
@@ -909,7 +856,8 @@ va_found:
        return pgidx_start * vm->page_size;
 }
 
-/* VM Virtual Address Allocate
+/*
+ * VM Virtual Address Allocate
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -930,13 +878,14 @@ va_found:
  * a page.
  */
 vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
-       uint32_t data_memslot, uint32_t pgd_memslot)
+                         uint32_t data_memslot, uint32_t pgd_memslot)
 {
        uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
 
        virt_pgd_alloc(vm, pgd_memslot);
 
-       /* Find an unused range of virtual page addresses of at least
+       /*
+        * Find an unused range of virtual page addresses of at least
         * pages in length.
         */
        vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min);
@@ -946,7 +895,8 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
                pages--, vaddr += vm->page_size) {
                vm_paddr_t paddr;
 
-               paddr = vm_phy_page_alloc(vm, KVM_UTIL_MIN_PADDR, data_memslot);
+               paddr = vm_phy_page_alloc(vm,
+                               KVM_UTIL_MIN_PFN * vm->page_size, data_memslot);
 
                virt_pg_map(vm, vaddr, paddr, pgd_memslot);
 
@@ -990,7 +940,8 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
        }
 }
 
-/* Address VM Physical to Host Virtual
+/*
+ * Address VM Physical to Host Virtual
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1022,7 +973,8 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
        return NULL;
 }
 
-/* Address Host Virtual to VM Physical
+/*
+ * Address Host Virtual to VM Physical
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1056,7 +1008,8 @@ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
        return -1;
 }
 
-/* VM Create IRQ Chip
+/*
+ * VM Create IRQ Chip
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1078,7 +1031,8 @@ void vm_create_irqchip(struct kvm_vm *vm)
        vm->has_irqchip = true;
 }
 
-/* VM VCPU State
+/*
+ * VM VCPU State
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1100,7 +1054,8 @@ struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid)
        return vcpu->state;
 }
 
-/* VM VCPU Run
+/*
+ * VM VCPU Run
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1126,13 +1081,14 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
        int rc;
 
        TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
-        do {
+       do {
                rc = ioctl(vcpu->fd, KVM_RUN, NULL);
        } while (rc == -1 && errno == EINTR);
        return rc;
 }
 
-/* VM VCPU Set MP State
+/*
+ * VM VCPU Set MP State
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1147,7 +1103,7 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
  * by mp_state.
  */
 void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
-       struct kvm_mp_state *mp_state)
+                      struct kvm_mp_state *mp_state)
 {
        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
        int ret;
@@ -1159,7 +1115,8 @@ void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
                "rc: %i errno: %i", ret, errno);
 }
 
-/* VM VCPU Regs Get
+/*
+ * VM VCPU Regs Get
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1173,21 +1130,20 @@ void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
  * Obtains the current register state for the VCPU specified by vcpuid
  * and stores it at the location given by regs.
  */
-void vcpu_regs_get(struct kvm_vm *vm,
-       uint32_t vcpuid, struct kvm_regs *regs)
+void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs)
 {
        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
        int ret;
 
        TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
 
-       /* Get the regs. */
        ret = ioctl(vcpu->fd, KVM_GET_REGS, regs);
        TEST_ASSERT(ret == 0, "KVM_GET_REGS failed, rc: %i errno: %i",
                ret, errno);
 }
 
-/* VM VCPU Regs Set
+/*
+ * VM VCPU Regs Set
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1201,165 +1157,46 @@ void vcpu_regs_get(struct kvm_vm *vm,
  * Sets the regs of the VCPU specified by vcpuid to the values
  * given by regs.
  */
-void vcpu_regs_set(struct kvm_vm *vm,
-       uint32_t vcpuid, struct kvm_regs *regs)
+void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs)
 {
        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
        int ret;
 
        TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
 
-       /* Set the regs. */
        ret = ioctl(vcpu->fd, KVM_SET_REGS, regs);
        TEST_ASSERT(ret == 0, "KVM_SET_REGS failed, rc: %i errno: %i",
                ret, errno);
 }
 
 void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
-                         struct kvm_vcpu_events *events)
+                    struct kvm_vcpu_events *events)
 {
        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
        int ret;
 
        TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
 
-       /* Get the regs. */
        ret = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, events);
        TEST_ASSERT(ret == 0, "KVM_GET_VCPU_EVENTS, failed, rc: %i errno: %i",
                ret, errno);
 }
 
 void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
-                         struct kvm_vcpu_events *events)
+                    struct kvm_vcpu_events *events)
 {
        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
        int ret;
 
        TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
 
-       /* Set the regs. */
        ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events);
        TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i",
                ret, errno);
 }
 
-/* VCPU Get MSR
- *
- * Input Args:
- *   vm - Virtual Machine
- *   vcpuid - VCPU ID
- *   msr_index - Index of MSR
- *
- * Output Args: None
- *
- * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced.
- *
- * Get value of MSR for VCPU.
- */
-uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index)
-{
-       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
-       struct {
-               struct kvm_msrs header;
-               struct kvm_msr_entry entry;
-       } buffer = {};
-       int r;
-
-       TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
-       buffer.header.nmsrs = 1;
-       buffer.entry.index = msr_index;
-       r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header);
-       TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n"
-               "  rc: %i errno: %i", r, errno);
-
-       return buffer.entry.data;
-}
-
-/* VCPU Set MSR
- *
- * Input Args:
- *   vm - Virtual Machine
- *   vcpuid - VCPU ID
- *   msr_index - Index of MSR
- *   msr_value - New value of MSR
- *
- * Output Args: None
- *
- * Return: On success, nothing. On failure a TEST_ASSERT is produced.
- *
- * Set value of MSR for VCPU.
- */
-void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
-       uint64_t msr_value)
-{
-       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
-       struct {
-               struct kvm_msrs header;
-               struct kvm_msr_entry entry;
-       } buffer = {};
-       int r;
-
-       TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
-       memset(&buffer, 0, sizeof(buffer));
-       buffer.header.nmsrs = 1;
-       buffer.entry.index = msr_index;
-       buffer.entry.data = msr_value;
-       r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header);
-       TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n"
-               "  rc: %i errno: %i", r, errno);
-}
-
-/* VM VCPU Args Set
- *
- * Input Args:
- *   vm - Virtual Machine
- *   vcpuid - VCPU ID
- *   num - number of arguments
- *   ... - arguments, each of type uint64_t
- *
- * Output Args: None
- *
- * Return: None
- *
- * Sets the first num function input arguments to the values
- * given as variable args.  Each of the variable args is expected to
- * be of type uint64_t.
- */
-void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
-{
-       va_list ap;
-       struct kvm_regs regs;
-
-       TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"
-                   "  num: %u\n",
-                   num);
-
-       va_start(ap, num);
-       vcpu_regs_get(vm, vcpuid, &regs);
-
-       if (num >= 1)
-               regs.rdi = va_arg(ap, uint64_t);
-
-       if (num >= 2)
-               regs.rsi = va_arg(ap, uint64_t);
-
-       if (num >= 3)
-               regs.rdx = va_arg(ap, uint64_t);
-
-       if (num >= 4)
-               regs.rcx = va_arg(ap, uint64_t);
-
-       if (num >= 5)
-               regs.r8 = va_arg(ap, uint64_t);
-
-       if (num >= 6)
-               regs.r9 = va_arg(ap, uint64_t);
-
-       vcpu_regs_set(vm, vcpuid, &regs);
-       va_end(ap);
-}
-
-/* VM VCPU System Regs Get
+/*
+ * VM VCPU System Regs Get
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1373,22 +1210,20 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
  * Obtains the current system register state for the VCPU specified by
  * vcpuid and stores it at the location given by sregs.
  */
-void vcpu_sregs_get(struct kvm_vm *vm,
-       uint32_t vcpuid, struct kvm_sregs *sregs)
+void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs)
 {
        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
        int ret;
 
        TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
 
-       /* Get the regs. */
-       /* Get the regs. */
        ret = ioctl(vcpu->fd, KVM_GET_SREGS, sregs);
        TEST_ASSERT(ret == 0, "KVM_GET_SREGS failed, rc: %i errno: %i",
                ret, errno);
 }
 
-/* VM VCPU System Regs Set
+/*
+ * VM VCPU System Regs Set
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1402,27 +1237,25 @@ void vcpu_sregs_get(struct kvm_vm *vm,
  * Sets the system regs of the VCPU specified by vcpuid to the values
  * given by sregs.
  */
-void vcpu_sregs_set(struct kvm_vm *vm,
-       uint32_t vcpuid, struct kvm_sregs *sregs)
+void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs)
 {
        int ret = _vcpu_sregs_set(vm, vcpuid, sregs);
        TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, "
                "rc: %i errno: %i", ret, errno);
 }
 
-int _vcpu_sregs_set(struct kvm_vm *vm,
-       uint32_t vcpuid, struct kvm_sregs *sregs)
+int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs)
 {
        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
        int ret;
 
        TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
 
-       /* Get the regs. */
        return ioctl(vcpu->fd, KVM_SET_SREGS, sregs);
 }
 
-/* VCPU Ioctl
+/*
+ * VCPU Ioctl
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1434,8 +1267,8 @@ int _vcpu_sregs_set(struct kvm_vm *vm,
  *
  * Issues an arbitrary ioctl on a VCPU fd.
  */
-void vcpu_ioctl(struct kvm_vm *vm,
-       uint32_t vcpuid, unsigned long cmd, void *arg)
+void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid,
+               unsigned long cmd, void *arg)
 {
        struct vcpu *vcpu = vcpu_find(vm, vcpuid);
        int ret;
@@ -1447,7 +1280,8 @@ void vcpu_ioctl(struct kvm_vm *vm,
                cmd, ret, errno, strerror(errno));
 }
 
-/* VM Ioctl
+/*
+ * VM Ioctl
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1467,7 +1301,8 @@ void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg)
                cmd, ret, errno, strerror(errno));
 }
 
-/* VM Dump
+/*
+ * VM Dump
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1514,38 +1349,6 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
                vcpu_dump(stream, vm, vcpu->id, indent + 2);
 }
 
-/* VM VCPU Dump
- *
- * Input Args:
- *   vm - Virtual Machine
- *   vcpuid - VCPU ID
- *   indent - Left margin indent amount
- *
- * Output Args:
- *   stream - Output FILE stream
- *
- * Return: None
- *
- * Dumps the current state of the VCPU specified by vcpuid, within the VM
- * given by vm, to the FILE stream given by stream.
- */
-void vcpu_dump(FILE *stream, struct kvm_vm *vm,
-       uint32_t vcpuid, uint8_t indent)
-{
-               struct kvm_regs regs;
-               struct kvm_sregs sregs;
-
-               fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid);
-
-               fprintf(stream, "%*sregs:\n", indent + 2, "");
-               vcpu_regs_get(vm, vcpuid, &regs);
-               regs_dump(stream, &regs, indent + 4);
-
-               fprintf(stream, "%*ssregs:\n", indent + 2, "");
-               vcpu_sregs_get(vm, vcpuid, &sregs);
-               sregs_dump(stream, &sregs, indent + 4);
-}
-
 /* Known KVM exit reasons */
 static struct exit_reason {
        unsigned int reason;
@@ -1576,7 +1379,8 @@ static struct exit_reason {
 #endif
 };
 
-/* Exit Reason String
+/*
+ * Exit Reason String
  *
  * Input Args:
  *   exit_reason - Exit reason
@@ -1602,10 +1406,12 @@ const char *exit_reason_str(unsigned int exit_reason)
        return "Unknown";
 }
 
-/* Physical Page Allocate
+/*
+ * Physical Contiguous Page Allocator
  *
  * Input Args:
  *   vm - Virtual Machine
+ *   num - number of pages
  *   paddr_min - Physical address minimum
  *   memslot - Memory region to allocate page from
  *
@@ -1614,47 +1420,59 @@ const char *exit_reason_str(unsigned int exit_reason)
  * Return:
  *   Starting physical address
  *
- * Within the VM specified by vm, locates an available physical page
- * at or above paddr_min.  If found, the page is marked as in use
- * and its address is returned.  A TEST_ASSERT failure occurs if no
- * page is available at or above paddr_min.
+ * Within the VM specified by vm, locates a range of available physical
+ * pages at or above paddr_min. If found, the pages are marked as in use
+ * and their base address is returned. A TEST_ASSERT failure occurs if
+ * not enough pages are available at or above paddr_min.
  */
-vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm,
-       vm_paddr_t paddr_min, uint32_t memslot)
+vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
+                             vm_paddr_t paddr_min, uint32_t memslot)
 {
        struct userspace_mem_region *region;
-       sparsebit_idx_t pg;
+       sparsebit_idx_t pg, base;
+
+       TEST_ASSERT(num > 0, "Must allocate at least one page");
 
        TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
                "not divisible by page size.\n"
                "  paddr_min: 0x%lx page_size: 0x%x",
                paddr_min, vm->page_size);
 
-       /* Locate memory region. */
        region = memslot2region(vm, memslot);
+       base = pg = paddr_min >> vm->page_shift;
 
-       /* Locate next available physical page at or above paddr_min. */
-       pg = paddr_min >> vm->page_shift;
-
-       if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
-               pg = sparsebit_next_set(region->unused_phy_pages, pg);
-               if (pg == 0) {
-                       fprintf(stderr, "No guest physical page available, "
-                               "paddr_min: 0x%lx page_size: 0x%x memslot: %u",
-                               paddr_min, vm->page_size, memslot);
-                       fputs("---- vm dump ----\n", stderr);
-                       vm_dump(stderr, vm, 2);
-                       abort();
+       do {
+               for (; pg < base + num; ++pg) {
+                       if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
+                               base = pg = sparsebit_next_set(region->unused_phy_pages, pg);
+                               break;
+                       }
                }
+       } while (pg && pg != base + num);
+
+       if (pg == 0) {
+               fprintf(stderr, "No guest physical page available, "
+                       "paddr_min: 0x%lx page_size: 0x%x memslot: %u\n",
+                       paddr_min, vm->page_size, memslot);
+               fputs("---- vm dump ----\n", stderr);
+               vm_dump(stderr, vm, 2);
+               abort();
        }
 
-       /* Specify page as in use and return its address. */
-       sparsebit_clear(region->unused_phy_pages, pg);
+       for (pg = base; pg < base + num; ++pg)
+               sparsebit_clear(region->unused_phy_pages, pg);
+
+       return base * vm->page_size;
+}
 
-       return pg * vm->page_size;
+vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
+                            uint32_t memslot)
+{
+       return vm_phy_pages_alloc(vm, 1, paddr_min, memslot);
 }
 
-/* Address Guest Virtual to Host Virtual
+/*
+ * Address Guest Virtual to Host Virtual
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -1669,17 +1487,3 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva)
 {
        return addr_gpa2hva(vm, addr_gva2gpa(vm, gva));
 }
-
-void guest_args_read(struct kvm_vm *vm, uint32_t vcpu_id,
-                    struct guest_args *args)
-{
-       struct kvm_run *run = vcpu_state(vm, vcpu_id);
-       struct kvm_regs regs;
-
-       memset(&regs, 0, sizeof(regs));
-       vcpu_regs_get(vm, vcpu_id, &regs);
-
-       args->port = run->io.port;
-       args->arg0 = regs.rdi;
-       args->arg1 = regs.rsi;
-}
index 542ed60..52701db 100644 (file)
@@ -1,28 +1,29 @@
 /*
- * tools/testing/selftests/kvm/lib/kvm_util.c
+ * tools/testing/selftests/kvm/lib/kvm_util_internal.h
  *
  * Copyright (C) 2018, Google LLC.
  *
  * This work is licensed under the terms of the GNU GPL, version 2.
  */
 
-#ifndef KVM_UTIL_INTERNAL_H
-#define KVM_UTIL_INTERNAL_H 1
+#ifndef SELFTEST_KVM_UTIL_INTERNAL_H
+#define SELFTEST_KVM_UTIL_INTERNAL_H
 
 #include "sparsebit.h"
 
+#define KVM_DEV_PATH           "/dev/kvm"
+
 #ifndef BITS_PER_BYTE
-#define BITS_PER_BYTE           8
+#define BITS_PER_BYTE          8
 #endif
 
 #ifndef BITS_PER_LONG
-#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long))
+#define BITS_PER_LONG          (BITS_PER_BYTE * sizeof(long))
 #endif
 
 #define DIV_ROUND_UP(n, d)     (((n) + (d) - 1) / (d))
-#define BITS_TO_LONGS(nr)       DIV_ROUND_UP(nr, BITS_PER_LONG)
+#define BITS_TO_LONGS(nr)      DIV_ROUND_UP(nr, BITS_PER_LONG)
 
-/* Concrete definition of struct kvm_vm. */
 struct userspace_mem_region {
        struct userspace_mem_region *next, *prev;
        struct kvm_userspace_memory_region region;
@@ -45,14 +46,16 @@ struct kvm_vm {
        int mode;
        int kvm_fd;
        int fd;
+       unsigned int pgtable_levels;
        unsigned int page_size;
        unsigned int page_shift;
+       unsigned int pa_bits;
+       unsigned int va_bits;
        uint64_t max_gfn;
        struct vcpu *vcpu_head;
        struct userspace_mem_region *userspace_mem_region_head;
        struct sparsebit *vpages_valid;
        struct sparsebit *vpages_mapped;
-
        bool has_irqchip;
        bool pgd_created;
        vm_paddr_t pgd;
@@ -60,13 +63,11 @@ struct kvm_vm {
        vm_vaddr_t tss;
 };
 
-struct vcpu *vcpu_find(struct kvm_vm *vm,
-       uint32_t vcpuid);
-void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot);
+struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot,
+               int gdt_memslot);
 void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
-void regs_dump(FILE *stream, struct kvm_regs *regs,
-       uint8_t indent);
-void sregs_dump(FILE *stream, struct kvm_sregs *sregs,
-       uint8_t indent);
+void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent);
+void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent);
 
-#endif
+#endif /* SELFTEST_KVM_UTIL_INTERNAL_H */
diff --git a/tools/testing/selftests/kvm/lib/ucall.c b/tools/testing/selftests/kvm/lib/ucall.c
new file mode 100644 (file)
index 0000000..4777f9b
--- /dev/null
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ucall support. A ucall is a "hypercall to userspace".
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+#include "kvm_util.h"
+#include "kvm_util_internal.h"
+
+#define UCALL_PIO_PORT ((uint16_t)0x1000)
+
+static ucall_type_t ucall_type;
+static vm_vaddr_t *ucall_exit_mmio_addr;
+
+static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa)
+{
+       if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1))
+               return false;
+
+       virt_pg_map(vm, gpa, gpa, 0);
+
+       ucall_exit_mmio_addr = (vm_vaddr_t *)gpa;
+       sync_global_to_guest(vm, ucall_exit_mmio_addr);
+
+       return true;
+}
+
+void ucall_init(struct kvm_vm *vm, ucall_type_t type, void *arg)
+{
+       ucall_type = type;
+       sync_global_to_guest(vm, ucall_type);
+
+       if (type == UCALL_PIO)
+               return;
+
+       if (type == UCALL_MMIO) {
+               vm_paddr_t gpa, start, end, step;
+               bool ret;
+
+               if (arg) {
+                       gpa = (vm_paddr_t)arg;
+                       ret = ucall_mmio_init(vm, gpa);
+                       TEST_ASSERT(ret, "Can't set ucall mmio address to %lx", gpa);
+                       return;
+               }
+
+               /*
+                * Find an address within the allowed virtual address space,
+                * that does _not_ have a KVM memory region associated with it.
+                * Identity mapping an address like this allows the guest to
+                * access it, but as KVM doesn't know what to do with it, it
+                * will assume it's something userspace handles and exit with
+                * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64.
+                * Here we start with a guess that the addresses around two
+                * thirds of the VA space are unmapped and then work both down
+                * and up from there in 1/6 VA space sized steps.
+                */
+               start = 1ul << (vm->va_bits * 2 / 3);
+               end = 1ul << vm->va_bits;
+               step = 1ul << (vm->va_bits / 6);
+               for (gpa = start; gpa >= 0; gpa -= step) {
+                       if (ucall_mmio_init(vm, gpa & ~(vm->page_size - 1)))
+                               return;
+               }
+               for (gpa = start + step; gpa < end; gpa += step) {
+                       if (ucall_mmio_init(vm, gpa & ~(vm->page_size - 1)))
+                               return;
+               }
+               TEST_ASSERT(false, "Can't find a ucall mmio address");
+       }
+}
+
+void ucall_uninit(struct kvm_vm *vm)
+{
+       ucall_type = 0;
+       sync_global_to_guest(vm, ucall_type);
+       ucall_exit_mmio_addr = 0;
+       sync_global_to_guest(vm, ucall_exit_mmio_addr);
+}
+
+static void ucall_pio_exit(struct ucall *uc)
+{
+#ifdef __x86_64__
+       asm volatile("in %[port], %%al"
+               : : [port] "d" (UCALL_PIO_PORT), "D" (uc) : "rax");
+#endif
+}
+
+static void ucall_mmio_exit(struct ucall *uc)
+{
+       *ucall_exit_mmio_addr = (vm_vaddr_t)uc;
+}
+
+void ucall(uint64_t cmd, int nargs, ...)
+{
+       struct ucall uc = {
+               .cmd = cmd,
+       };
+       va_list va;
+       int i;
+
+       nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS;
+
+       va_start(va, nargs);
+       for (i = 0; i < nargs; ++i)
+               uc.args[i] = va_arg(va, uint64_t);
+       va_end(va);
+
+       switch (ucall_type) {
+       case UCALL_PIO:
+               ucall_pio_exit(&uc);
+               break;
+       case UCALL_MMIO:
+               ucall_mmio_exit(&uc);
+               break;
+       }
+}
+
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
+{
+       struct kvm_run *run = vcpu_state(vm, vcpu_id);
+
+       memset(uc, 0, sizeof(*uc));
+
+#ifdef __x86_64__
+       if (ucall_type == UCALL_PIO && run->exit_reason == KVM_EXIT_IO &&
+           run->io.port == UCALL_PIO_PORT) {
+               struct kvm_regs regs;
+               vcpu_regs_get(vm, vcpu_id, &regs);
+               memcpy(uc, addr_gva2hva(vm, (vm_vaddr_t)regs.rdi), sizeof(*uc));
+               return uc->cmd;
+       }
+#endif
+       if (ucall_type == UCALL_MMIO && run->exit_reason == KVM_EXIT_MMIO &&
+           run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) {
+               vm_vaddr_t gva;
+               TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8,
+                           "Unexpected ucall exit mmio address access");
+               gva = *(vm_vaddr_t *)run->mmio.data;
+               memcpy(uc, addr_gva2hva(vm, gva), sizeof(*uc));
+       }
+
+       return uc->cmd;
+}
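
For orientation, the intended usage pattern of the new ucall interface looks roughly like the sketch below; it mirrors the test conversions later in this series, with VCPU_ID and the "hello"/stage arguments taken from those tests as placeholders:

	/* Guest side: report progress to the host, carrying two arguments. */
	ucall(UCALL_SYNC, 2, (uint64_t)"hello", stage);

	/* Host side: run the vCPU and decode the ucall it exited with. */
	struct ucall uc;

	vcpu_run(vm, VCPU_ID);
	switch (get_ucall(vm, VCPU_ID, &uc)) {
	case UCALL_SYNC:
		/* uc.args[0] and uc.args[1] hold the guest's arguments. */
		break;
	case UCALL_DONE:
		/* Guest signalled completion. */
		break;
	case UCALL_ABORT:
		TEST_ASSERT(false, "%s", (const char *)uc.args[0]);
	}
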
similarity index 80%
rename from tools/testing/selftests/kvm/lib/x86.c
rename to tools/testing/selftests/kvm/lib/x86_64/processor.c
index a3122f1..f28127f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * tools/testing/selftests/kvm/lib/x86.c
+ * tools/testing/selftests/kvm/lib/x86_64/processor.c
  *
  * Copyright (C) 2018, Google LLC.
  *
@@ -10,8 +10,8 @@
 
 #include "test_util.h"
 #include "kvm_util.h"
-#include "kvm_util_internal.h"
-#include "x86.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
 
 /* Minimum physical address used for virtual translation tables. */
 #define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
@@ -231,7 +231,7 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
 {
        int rc;
 
-       TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use "
+       TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use "
                "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
 
        /* If needed, create page map l4 table. */
@@ -264,7 +264,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
        uint16_t index[4];
        struct pageMapL4Entry *pml4e;
 
-       TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use "
+       TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use "
                "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
 
        TEST_ASSERT((vaddr % vm->page_size) == 0,
@@ -551,7 +551,7 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
        struct pageTableEntry *pte;
        void *hva;
 
-       TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use "
+       TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use "
                "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
 
        index[0] = (gva >> 12) & 0x1ffu;
@@ -624,9 +624,9 @@ void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot)
        kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot);
 
        switch (vm->mode) {
-       case VM_MODE_FLAT48PG:
+       case VM_MODE_P52V48_4K:
                sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
-               sregs.cr4 |= X86_CR4_PAE;
+               sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
                sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
 
                kvm_seg_set_unusable(&sregs.ldt);
@@ -672,6 +672,102 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
        vcpu_set_mp_state(vm, vcpuid, &mp_state);
 }
 
+/* Allocate an instance of struct kvm_cpuid2
+ *
+ * Input Args: None
+ *
+ * Output Args: None
+ *
+ * Return: A pointer to the allocated struct. The caller is responsible
+ * for freeing this struct.
+ *
+ * Since kvm_cpuid2 uses a 0-length array to allow the size of the
+ * array to be decided at allocation time, allocation is slightly
+ * complicated. This function uses a reasonable default length for
+ * the array and performs the appropriate allocation.
+ */
+static struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
+{
+       struct kvm_cpuid2 *cpuid;
+       int nent = 100;
+       size_t size;
+
+       size = sizeof(*cpuid);
+       size += nent * sizeof(struct kvm_cpuid_entry2);
+       cpuid = malloc(size);
+       if (!cpuid) {
+               perror("malloc");
+               abort();
+       }
+
+       cpuid->nent = nent;
+
+       return cpuid;
+}
+
+/* KVM Supported CPUID Get
+ *
+ * Input Args: None
+ *
+ * Output Args:
+ *
+ * Return: The supported KVM CPUID
+ *
+ * Get the guest CPUID supported by KVM.
+ */
+struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
+{
+       static struct kvm_cpuid2 *cpuid;
+       int ret;
+       int kvm_fd;
+
+       if (cpuid)
+               return cpuid;
+
+       cpuid = allocate_kvm_cpuid2();
+       kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+       if (kvm_fd < 0)
+               exit(KSFT_SKIP);
+
+       ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
+       TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n",
+                   ret, errno);
+
+       close(kvm_fd);
+       return cpuid;
+}
+
+/* Locate a cpuid entry.
+ *
+ * Input Args:
+ *   cpuid: The cpuid.
+ *   function: The function of the cpuid entry to find.
+ *
+ * Output Args: None
+ *
+ * Return: A pointer to the cpuid entry. Never returns NULL.
+ */
+struct kvm_cpuid_entry2 *
+kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
+{
+       struct kvm_cpuid2 *cpuid;
+       struct kvm_cpuid_entry2 *entry = NULL;
+       int i;
+
+       cpuid = kvm_get_supported_cpuid();
+       for (i = 0; i < cpuid->nent; i++) {
+               if (cpuid->entries[i].function == function &&
+                   cpuid->entries[i].index == index) {
+                       entry = &cpuid->entries[i];
+                       break;
+               }
+       }
+
+       TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).",
+                   function, index);
+       return entry;
+}
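
As a usage sketch, a test can gate itself on a CPUID feature through these helpers; the leaf and bit chosen here (CPUID.01H:ECX bit 5, the x86 VMX flag) are only an illustration:

	/* Sketch: skip the test when the VMX feature bit is not exposed. */
	struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_index(1, 0);

	if (!(entry->ecx & (1u << 5)))
		exit(KSFT_SKIP);
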
+
 /* VM VCPU CPUID Set
  *
  * Input Args:
@@ -698,6 +794,7 @@ void vcpu_set_cpuid(struct kvm_vm *vm,
                    rc, errno);
 
 }
+
 /* Create a VM with reasonable defaults
  *
  * Input Args:
@@ -726,7 +823,7 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
        uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;
 
        /* Create VM */
-       vm = vm_create(VM_MODE_FLAT48PG,
+       vm = vm_create(VM_MODE_P52V48_4K,
                       DEFAULT_GUEST_PHY_PAGES + extra_pg_pages,
                       O_RDWR);
 
@@ -742,6 +839,154 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
        return vm;
 }
 
+/* VCPU Get MSR
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   msr_index - Index of MSR
+ *
+ * Output Args: None
+ *
+ * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced.
+ *
+ * Get value of MSR for VCPU.
+ */
+uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index)
+{
+       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+       struct {
+               struct kvm_msrs header;
+               struct kvm_msr_entry entry;
+       } buffer = {};
+       int r;
+
+       TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+       buffer.header.nmsrs = 1;
+       buffer.entry.index = msr_index;
+       r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header);
+       TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n"
+               "  rc: %i errno: %i", r, errno);
+
+       return buffer.entry.data;
+}
+
+/* VCPU Set MSR
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   msr_index - Index of MSR
+ *   msr_value - New value of MSR
+ *
+ * Output Args: None
+ *
+ * Return: On success, nothing. On failure a TEST_ASSERT is produced.
+ *
+ * Set value of MSR for VCPU.
+ */
+void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+       uint64_t msr_value)
+{
+       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+       struct {
+               struct kvm_msrs header;
+               struct kvm_msr_entry entry;
+       } buffer = {};
+       int r;
+
+       TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+       memset(&buffer, 0, sizeof(buffer));
+       buffer.header.nmsrs = 1;
+       buffer.entry.index = msr_index;
+       buffer.entry.data = msr_value;
+       r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header);
+       TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n"
+               "  rc: %i errno: %i", r, errno);
+}
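
A short sketch of the MSR accessors in use; MSR_IA32_TSC_ADJUST comes from the vmx_tsc_adjust_test changes further down, while VCPU_ID and the written delta are placeholders:

	/* Sketch: read an MSR on a vCPU, then write a modified value back. */
	uint64_t tsc_adjust = vcpu_get_msr(vm, VCPU_ID, MSR_IA32_TSC_ADJUST);

	vcpu_set_msr(vm, VCPU_ID, MSR_IA32_TSC_ADJUST, tsc_adjust + 0x1000);
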
+
+/* VM VCPU Args Set
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   num - number of arguments
+ *   ... - arguments, each of type uint64_t
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the first num function input arguments to the values
+ * given as variable args.  Each of the variable args is expected to
+ * be of type uint64_t.
+ */
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
+{
+       va_list ap;
+       struct kvm_regs regs;
+
+       TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"
+                   "  num: %u\n",
+                   num);
+
+       va_start(ap, num);
+       vcpu_regs_get(vm, vcpuid, &regs);
+
+       if (num >= 1)
+               regs.rdi = va_arg(ap, uint64_t);
+
+       if (num >= 2)
+               regs.rsi = va_arg(ap, uint64_t);
+
+       if (num >= 3)
+               regs.rdx = va_arg(ap, uint64_t);
+
+       if (num >= 4)
+               regs.rcx = va_arg(ap, uint64_t);
+
+       if (num >= 5)
+               regs.r8 = va_arg(ap, uint64_t);
+
+       if (num >= 6)
+               regs.r9 = va_arg(ap, uint64_t);
+
+       vcpu_regs_set(vm, vcpuid, &regs);
+       va_end(ap);
+}
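
The arguments land in the x86-64 SysV argument registers (RDI, RSI, RDX, RCX, R8, R9), so a guest entry point simply declares matching parameters. A sketch, with guest_main and the values passed being placeholders:

	/* Guest entry point: parameters arrive via RDI and RSI. */
	static void guest_main(uint64_t mem_gva, uint64_t niters)
	{
		/* ... */
	}

	/* Host: set the arguments before the first KVM_RUN of the vCPU. */
	vcpu_args_set(vm, VCPU_ID, 2, mem_gva, 100);
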
+
+/*
+ * VM VCPU Dump
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   indent - Left margin indent amount
+ *
+ * Output Args:
+ *   stream - Output FILE stream
+ *
+ * Return: None
+ *
+ * Dumps the current state of the VCPU specified by vcpuid, within the VM
+ * given by vm, to the FILE stream given by stream.
+ */
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
+{
+       struct kvm_regs regs;
+       struct kvm_sregs sregs;
+
+       fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid);
+
+       fprintf(stream, "%*sregs:\n", indent + 2, "");
+       vcpu_regs_get(vm, vcpuid, &regs);
+       regs_dump(stream, &regs, indent + 4);
+
+       fprintf(stream, "%*ssregs:\n", indent + 2, "");
+       vcpu_sregs_get(vm, vcpuid, &sregs);
+       sregs_dump(stream, &sregs, indent + 4);
+}
+
 struct kvm_x86_state {
        struct kvm_vcpu_events events;
        struct kvm_mp_state mp_state;
similarity index 88%
rename from tools/testing/selftests/kvm/lib/vmx.c
rename to tools/testing/selftests/kvm/lib/x86_64/vmx.c
index b987c3c..771ba6b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * tools/testing/selftests/kvm/lib/x86.c
+ * tools/testing/selftests/kvm/lib/x86_64/vmx.c
  *
  * Copyright (C) 2018, Google LLC.
  *
 
 #include "test_util.h"
 #include "kvm_util.h"
-#include "x86.h"
+#include "processor.h"
 #include "vmx.h"
 
+bool enable_evmcs;
+
 /* Allocate memory regions for nested VMX tests.
  *
  * Input Args:
@@ -62,6 +64,20 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva)
        vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite);
        memset(vmx->vmwrite_hva, 0, getpagesize());
 
+       /* Setup of a region of guest memory for the VP Assist page. */
+       vmx->vp_assist = (void *)vm_vaddr_alloc(vm, getpagesize(),
+                                               0x10000, 0, 0);
+       vmx->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)vmx->vp_assist);
+       vmx->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vp_assist);
+
+       /* Setup of a region of guest memory for the enlightened VMCS. */
+       vmx->enlightened_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(),
+                                                      0x10000, 0, 0);
+       vmx->enlightened_vmcs_hva =
+               addr_gva2hva(vm, (uintptr_t)vmx->enlightened_vmcs);
+       vmx->enlightened_vmcs_gpa =
+               addr_gva2gpa(vm, (uintptr_t)vmx->enlightened_vmcs);
+
        *p_vmx_gva = vmx_gva;
        return vmx;
 }
@@ -107,18 +123,31 @@ bool prepare_for_vmx_operation(struct vmx_pages *vmx)
        if (vmxon(vmx->vmxon_gpa))
                return false;
 
-       /* Load a VMCS. */
-       *(uint32_t *)(vmx->vmcs) = vmcs_revision();
-       if (vmclear(vmx->vmcs_gpa))
-               return false;
-
-       if (vmptrld(vmx->vmcs_gpa))
-               return false;
+       return true;
+}
 
-       /* Setup shadow VMCS, do not load it yet. */
-       *(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul;
-       if (vmclear(vmx->shadow_vmcs_gpa))
-               return false;
+bool load_vmcs(struct vmx_pages *vmx)
+{
+       if (!enable_evmcs) {
+               /* Load a VMCS. */
+               *(uint32_t *)(vmx->vmcs) = vmcs_revision();
+               if (vmclear(vmx->vmcs_gpa))
+                       return false;
+
+               if (vmptrld(vmx->vmcs_gpa))
+                       return false;
+
+               /* Setup shadow VMCS, do not load it yet. */
+               *(uint32_t *)(vmx->shadow_vmcs) =
+                       vmcs_revision() | 0x80000000ul;
+               if (vmclear(vmx->shadow_vmcs_gpa))
+                       return false;
+       } else {
+               if (evmcs_vmptrld(vmx->enlightened_vmcs_gpa,
+                                 vmx->enlightened_vmcs))
+                       return false;
+               current_evmcs->revision_id = vmcs_revision();
+       }
 
        return true;
 }

With load_vmcs() split out of prepare_for_vmx_operation(), an L1 guest now performs the sequence below before launching L2; this mirrors the state_test and vmx_tsc_adjust_test updates later in this series:

	/* L1 guest: enter VMX root operation, load the (e)VMCS, launch L2. */
	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
	GUEST_ASSERT(load_vmcs(vmx_pages));
	prepare_vmcs(vmx_pages, l2_guest_code,
		     &l2_guest_stack[L2_GUEST_STACK_SIZE]);
	GUEST_ASSERT(!vmlaunch());
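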
@@ -17,7 +17,7 @@
 #include "test_util.h"
 
 #include "kvm_util.h"
-#include "x86.h"
+#include "processor.h"
 
 #define X86_FEATURE_XSAVE      (1<<26)
 #define X86_FEATURE_OSXSAVE    (1<<27)
@@ -67,6 +67,7 @@ int main(int argc, char *argv[])
        struct kvm_vm *vm;
        struct kvm_sregs sregs;
        struct kvm_cpuid_entry2 *entry;
+       struct ucall uc;
        int rc;
 
        entry = kvm_get_supported_cpuid_entry(1);
@@ -87,21 +88,20 @@ int main(int argc, char *argv[])
                rc = _vcpu_run(vm, VCPU_ID);
 
                if (run->exit_reason == KVM_EXIT_IO) {
-                       switch (run->io.port) {
-                       case GUEST_PORT_SYNC:
+                       switch (get_ucall(vm, VCPU_ID, &uc)) {
+                       case UCALL_SYNC:
                                /* emulate hypervisor clearing CR4.OSXSAVE */
                                vcpu_sregs_get(vm, VCPU_ID, &sregs);
                                sregs.cr4 &= ~X86_CR4_OSXSAVE;
                                vcpu_sregs_set(vm, VCPU_ID, &sregs);
                                break;
-                       case GUEST_PORT_ABORT:
+                       case UCALL_ABORT:
                                TEST_ASSERT(false, "Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit.");
                                break;
-                       case GUEST_PORT_DONE:
+                       case UCALL_DONE:
                                goto done;
                        default:
-                               TEST_ASSERT(false, "Unknown port 0x%x.",
-                                           run->io.port);
+                               TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
                        }
                }
        }
diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
new file mode 100644 (file)
index 0000000..92c2cfd
--- /dev/null
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ * Tests for Enlightened VMCS, including nested guest state.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+
+#include "vmx.h"
+
+#define VCPU_ID                5
+
+static bool have_nested_state;
+
+void l2_guest_code(void)
+{
+       GUEST_SYNC(6);
+
+       GUEST_SYNC(7);
+
+       /* Done, exit to L1 and never come back.  */
+       vmcall();
+}
+
+void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+       unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+       enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist);
+
+       GUEST_ASSERT(vmx_pages->vmcs_gpa);
+       GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+       GUEST_SYNC(3);
+       GUEST_ASSERT(load_vmcs(vmx_pages));
+       GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+
+       GUEST_SYNC(4);
+       GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+
+       prepare_vmcs(vmx_pages, l2_guest_code,
+                    &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+       GUEST_SYNC(5);
+       GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+       GUEST_ASSERT(!vmlaunch());
+       GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+       GUEST_SYNC(8);
+       GUEST_ASSERT(!vmresume());
+       GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+       GUEST_SYNC(9);
+}
+
+void guest_code(struct vmx_pages *vmx_pages)
+{
+       GUEST_SYNC(1);
+       GUEST_SYNC(2);
+
+       if (vmx_pages)
+               l1_guest_code(vmx_pages);
+
+       GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+       struct vmx_pages *vmx_pages = NULL;
+       vm_vaddr_t vmx_pages_gva = 0;
+
+       struct kvm_regs regs1, regs2;
+       struct kvm_vm *vm;
+       struct kvm_run *run;
+       struct kvm_x86_state *state;
+       struct ucall uc;
+       int stage;
+       uint16_t evmcs_ver;
+       struct kvm_enable_cap enable_evmcs_cap = {
+               .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS,
+               .args[0] = (unsigned long)&evmcs_ver
+       };
+
+       struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);
+
+       /* Create VM */
+       vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+       vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+       if (!kvm_check_cap(KVM_CAP_NESTED_STATE) ||
+           !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) {
+               printf("capabilities not available, skipping test\n");
+               exit(KSFT_SKIP);
+       }
+
+       vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap);
+
+       run = vcpu_state(vm, VCPU_ID);
+
+       vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+       vmx_pages = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+       vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+
+       for (stage = 1;; stage++) {
+               _vcpu_run(vm, VCPU_ID);
+               TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+                           "Unexpected exit reason: %u (%s),\n",
+                           run->exit_reason,
+                           exit_reason_str(run->exit_reason));
+
+               memset(&regs1, 0, sizeof(regs1));
+               vcpu_regs_get(vm, VCPU_ID, &regs1);
+               switch (get_ucall(vm, VCPU_ID, &uc)) {
+               case UCALL_ABORT:
+                       TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0],
+                                   __FILE__, uc.args[1]);
+                       /* NOT REACHED */
+               case UCALL_SYNC:
+                       break;
+               case UCALL_DONE:
+                       goto done;
+               default:
+                       TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
+               }
+
+               /* UCALL_SYNC is handled here.  */
+               TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+                           uc.args[1] == stage, "Unexpected register values vmexit #%lx, got %lx",
+                           stage, (ulong)uc.args[1]);
+
+               state = vcpu_save_state(vm, VCPU_ID);
+               kvm_vm_release(vm);
+
+               /* Restore state in a new VM.  */
+               kvm_vm_restart(vm, O_RDWR);
+               vm_vcpu_add(vm, VCPU_ID, 0, 0);
+               vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+               vcpu_load_state(vm, VCPU_ID, state);
+               run = vcpu_state(vm, VCPU_ID);
+               free(state);
+
+               memset(&regs2, 0, sizeof(regs2));
+               vcpu_regs_get(vm, VCPU_ID, &regs2);
+               TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+                           "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+                           (ulong) regs2.rdi, (ulong) regs2.rsi);
+       }
+
+done:
+       kvm_vm_free(vm);
+}
@@ -19,7 +19,7 @@
 
 #include "test_util.h"
 #include "kvm_util.h"
-#include "x86.h"
+#include "processor.h"
 
 #define VCPU_ID 0
 #define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00
@@ -48,7 +48,7 @@ static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable)
 static void test_msr_platform_info_enabled(struct kvm_vm *vm)
 {
        struct kvm_run *run = vcpu_state(vm, VCPU_ID);
-       struct guest_args args;
+       struct ucall uc;
 
        set_msr_platform_info_enabled(vm, true);
        vcpu_run(vm, VCPU_ID);
@@ -56,11 +56,11 @@ static void test_msr_platform_info_enabled(struct kvm_vm *vm)
                        "Exit_reason other than KVM_EXIT_IO: %u (%s),\n",
                        run->exit_reason,
                        exit_reason_str(run->exit_reason));
-       guest_args_read(vm, VCPU_ID, &args);
-       TEST_ASSERT(args.port == GUEST_PORT_SYNC,
-                       "Received IO from port other than PORT_HOST_SYNC: %u\n",
-                       run->io.port);
-       TEST_ASSERT((args.arg1 & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) ==
+       get_ucall(vm, VCPU_ID, &uc);
+       TEST_ASSERT(uc.cmd == UCALL_SYNC,
+                       "Received ucall other than UCALL_SYNC: %u\n",
+                       uc.cmd);
+       TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) ==
                MSR_PLATFORM_INFO_MAX_TURBO_RATIO,
                "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.",
                MSR_PLATFORM_INFO_MAX_TURBO_RATIO);
@@ -22,7 +22,7 @@
 #include "test_util.h"
 
 #include "kvm_util.h"
-#include "x86.h"
+#include "processor.h"
 
 #define VCPU_ID                  5
 
similarity index 89%
rename from tools/testing/selftests/kvm/state_test.c
rename to tools/testing/selftests/kvm/x86_64/state_test.c
index 900e3e9..03da41f 100644 (file)
@@ -17,7 +17,7 @@
 #include "test_util.h"
 
 #include "kvm_util.h"
-#include "x86.h"
+#include "processor.h"
 #include "vmx.h"
 
 #define VCPU_ID                5
@@ -26,20 +26,20 @@ static bool have_nested_state;
 
 void l2_guest_code(void)
 {
-       GUEST_SYNC(5);
+       GUEST_SYNC(6);
 
         /* Exit to L1 */
        vmcall();
 
        /* L1 has now set up a shadow VMCS for us.  */
        GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee);
-       GUEST_SYNC(9);
+       GUEST_SYNC(10);
        GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee);
        GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0fffee));
-       GUEST_SYNC(10);
+       GUEST_SYNC(11);
        GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0fffee);
        GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0ffffee));
-       GUEST_SYNC(11);
+       GUEST_SYNC(12);
 
        /* Done, exit to L1 and never come back.  */
        vmcall();
@@ -52,15 +52,17 @@ void l1_guest_code(struct vmx_pages *vmx_pages)
 
        GUEST_ASSERT(vmx_pages->vmcs_gpa);
        GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+       GUEST_SYNC(3);
+       GUEST_ASSERT(load_vmcs(vmx_pages));
        GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
 
-       GUEST_SYNC(3);
+       GUEST_SYNC(4);
        GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
 
        prepare_vmcs(vmx_pages, l2_guest_code,
                     &l2_guest_stack[L2_GUEST_STACK_SIZE]);
 
-       GUEST_SYNC(4);
+       GUEST_SYNC(5);
        GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
        GUEST_ASSERT(!vmlaunch());
        GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
@@ -72,7 +74,7 @@ void l1_guest_code(struct vmx_pages *vmx_pages)
        GUEST_ASSERT(!vmresume());
        GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
 
-       GUEST_SYNC(6);
+       GUEST_SYNC(7);
        GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
 
        GUEST_ASSERT(!vmresume());
@@ -85,12 +87,12 @@ void l1_guest_code(struct vmx_pages *vmx_pages)
 
        GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa));
        GUEST_ASSERT(vmlaunch());
-       GUEST_SYNC(7);
+       GUEST_SYNC(8);
        GUEST_ASSERT(vmlaunch());
        GUEST_ASSERT(vmresume());
 
        vmwrite(GUEST_RIP, 0xc0ffee);
-       GUEST_SYNC(8);
+       GUEST_SYNC(9);
        GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee);
 
        GUEST_ASSERT(!vmptrld(vmx_pages->vmcs_gpa));
@@ -101,7 +103,7 @@ void l1_guest_code(struct vmx_pages *vmx_pages)
        GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee);
        GUEST_ASSERT(vmlaunch());
        GUEST_ASSERT(vmresume());
-       GUEST_SYNC(12);
+       GUEST_SYNC(13);
        GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee);
        GUEST_ASSERT(vmlaunch());
        GUEST_ASSERT(vmresume());
@@ -127,6 +129,7 @@ int main(int argc, char *argv[])
        struct kvm_vm *vm;
        struct kvm_run *run;
        struct kvm_x86_state *state;
+       struct ucall uc;
        int stage;
 
        struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);
@@ -155,23 +158,23 @@ int main(int argc, char *argv[])
 
                memset(&regs1, 0, sizeof(regs1));
                vcpu_regs_get(vm, VCPU_ID, &regs1);
-               switch (run->io.port) {
-               case GUEST_PORT_ABORT:
-                       TEST_ASSERT(false, "%s at %s:%d", (const char *) regs1.rdi,
-                                   __FILE__, regs1.rsi);
+               switch (get_ucall(vm, VCPU_ID, &uc)) {
+               case UCALL_ABORT:
+                       TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0],
+                                   __FILE__, uc.args[1]);
                        /* NOT REACHED */
-               case GUEST_PORT_SYNC:
+               case UCALL_SYNC:
                        break;
-               case GUEST_PORT_DONE:
+               case UCALL_DONE:
                        goto done;
                default:
-                       TEST_ASSERT(false, "Unknown port 0x%x.", run->io.port);
+                       TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
                }
 
-               /* PORT_SYNC is handled here.  */
-               TEST_ASSERT(!strcmp((const char *)regs1.rdi, "hello") &&
-                           regs1.rsi == stage, "Unexpected register values vmexit #%lx, got %lx",
-                           stage, (ulong) regs1.rsi);
+               /* UCALL_SYNC is handled here.  */
+               TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+                           uc.args[1] == stage, "Unexpected register values vmexit #%lx, got %lx",
+                           stage, (ulong)uc.args[1]);
 
                state = vcpu_save_state(vm, VCPU_ID);
                kvm_vm_release(vm);
@@ -19,7 +19,7 @@
 
 #include "test_util.h"
 #include "kvm_util.h"
-#include "x86.h"
+#include "processor.h"
 
 #define VCPU_ID 5
 
@@ -1,5 +1,5 @@
 /*
- * gtests/tests/vmx_tsc_adjust_test.c
+ * vmx_tsc_adjust_test
  *
  * Copyright (C) 2018, Google LLC.
  *
 
 #include "test_util.h"
 #include "kvm_util.h"
-#include "x86.h"
+#include "processor.h"
 #include "vmx.h"
 
 #include <string.h>
 #include <sys/ioctl.h>
 
-#include "../kselftest.h"
+#include "kselftest.h"
 
 #ifndef MSR_IA32_TSC_ADJUST
 #define MSR_IA32_TSC_ADJUST 0x3b
@@ -94,6 +94,7 @@ static void l1_guest_code(struct vmx_pages *vmx_pages)
        check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE);
 
        GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+       GUEST_ASSERT(load_vmcs(vmx_pages));
 
        /* Prepare the VMCS for L2 execution. */
        prepare_vmcs(vmx_pages, l2_guest_code,
@@ -146,26 +147,25 @@ int main(int argc, char *argv[])
 
        for (;;) {
                volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
-               struct guest_args args;
+               struct ucall uc;
 
                vcpu_run(vm, VCPU_ID);
-               guest_args_read(vm, VCPU_ID, &args);
                TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
                            "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
                            run->exit_reason,
                            exit_reason_str(run->exit_reason));
 
-               switch (args.port) {
-               case GUEST_PORT_ABORT:
-                       TEST_ASSERT(false, "%s", (const char *) args.arg0);
+               switch (get_ucall(vm, VCPU_ID, &uc)) {
+               case UCALL_ABORT:
+                       TEST_ASSERT(false, "%s", (const char *)uc.args[0]);
                        /* NOT REACHED */
-               case GUEST_PORT_SYNC:
-                       report(args.arg1);
+               case UCALL_SYNC:
+                       report(uc.args[1]);
                        break;
-               case GUEST_PORT_DONE:
+               case UCALL_DONE:
                        goto done;
                default:
-                       TEST_ASSERT(false, "Unknown port 0x%x.", args.port);
+                       TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
                }
        }
 
index 150c8a6..2377497 100644 (file)
@@ -120,8 +120,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
        int ret, cpu;
 
-       if (type)
-               return -EINVAL;
+       ret = kvm_arm_setup_stage2(kvm, type);
+       if (ret)
+               return ret;
 
        kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran));
        if (!kvm->arch.last_vcpu_ran)
@@ -212,6 +213,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_READONLY_MEM:
        case KVM_CAP_MP_STATE:
        case KVM_CAP_IMMEDIATE_EXIT:
+       case KVM_CAP_VCPU_EVENTS:
                r = 1;
                break;
        case KVM_CAP_ARM_SET_DEVICE_ADDR:
@@ -240,7 +242,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = 1;
                break;
        default:
-               r = kvm_arch_dev_ioctl_check_extension(kvm, ext);
+               r = kvm_arch_vm_ioctl_check_extension(kvm, ext);
                break;
        }
        return r;
@@ -544,7 +546,7 @@ static void update_vttbr(struct kvm *kvm)
 
        /* update vttbr to be used with the new vmid */
        pgd_phys = virt_to_phys(kvm->arch.pgd);
-       BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
+       BUG_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm));
        vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
        kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp;
 
@@ -1295,8 +1297,6 @@ static void cpu_init_hyp_mode(void *dummy)
 
        __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
        __cpu_init_stage2();
-
-       kvm_arm_init_debug();
 }
 
 static void cpu_hyp_reset(void)
@@ -1309,16 +1309,12 @@ static void cpu_hyp_reinit(void)
 {
        cpu_hyp_reset();
 
-       if (is_kernel_in_hyp_mode()) {
-               /*
-                * __cpu_init_stage2() is safe to call even if the PM
-                * event was cancelled before the CPU was reset.
-                */
-               __cpu_init_stage2();
+       if (is_kernel_in_hyp_mode())
                kvm_timer_init_vhe();
-       } else {
+       else
                cpu_init_hyp_mode(NULL);
-       }
+
+       kvm_arm_init_debug();
 
        if (vgic_present)
                kvm_vgic_init_cpu_hardware();
@@ -1412,6 +1408,8 @@ static int init_common_resources(void)
        kvm_vmid_bits = kvm_get_vmid_bits();
        kvm_info("%d-bit VMID\n", kvm_vmid_bits);
 
+       kvm_set_ipa_limit();
+
        return 0;
 }
 
index 1a2c3a1..5eca48b 100644 (file)
@@ -45,7 +45,6 @@ static phys_addr_t hyp_idmap_vector;
 
 static unsigned long io_map_base;
 
-#define S2_PGD_SIZE    (PTRS_PER_S2_PGD * sizeof(pgd_t))
 #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
 
 #define KVM_S2PTE_FLAG_IS_IOMAP                (1UL << 0)
@@ -150,20 +149,20 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 
 static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
 {
-       pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
-       stage2_pgd_clear(pgd);
+       pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL);
+       stage2_pgd_clear(kvm, pgd);
        kvm_tlb_flush_vmid_ipa(kvm, addr);
-       stage2_pud_free(pud_table);
+       stage2_pud_free(kvm, pud_table);
        put_page(virt_to_page(pgd));
 }
 
 static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 {
-       pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
-       VM_BUG_ON(stage2_pud_huge(*pud));
-       stage2_pud_clear(pud);
+       pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
+       VM_BUG_ON(stage2_pud_huge(kvm, *pud));
+       stage2_pud_clear(kvm, pud);
        kvm_tlb_flush_vmid_ipa(kvm, addr);
-       stage2_pmd_free(pmd_table);
+       stage2_pmd_free(kvm, pmd_table);
        put_page(virt_to_page(pud));
 }
 
@@ -252,7 +251,7 @@ static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
 
-       if (stage2_pte_table_empty(start_pte))
+       if (stage2_pte_table_empty(kvm, start_pte))
                clear_stage2_pmd_entry(kvm, pmd, start_addr);
 }
 
@@ -262,9 +261,9 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
        phys_addr_t next, start_addr = addr;
        pmd_t *pmd, *start_pmd;
 
-       start_pmd = pmd = stage2_pmd_offset(pud, addr);
+       start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
        do {
-               next = stage2_pmd_addr_end(addr, end);
+               next = stage2_pmd_addr_end(kvm, addr, end);
                if (!pmd_none(*pmd)) {
                        if (pmd_thp_or_huge(*pmd)) {
                                pmd_t old_pmd = *pmd;
@@ -281,7 +280,7 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
                }
        } while (pmd++, addr = next, addr != end);
 
-       if (stage2_pmd_table_empty(start_pmd))
+       if (stage2_pmd_table_empty(kvm, start_pmd))
                clear_stage2_pud_entry(kvm, pud, start_addr);
 }
 
@@ -291,14 +290,14 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
        phys_addr_t next, start_addr = addr;
        pud_t *pud, *start_pud;
 
-       start_pud = pud = stage2_pud_offset(pgd, addr);
+       start_pud = pud = stage2_pud_offset(kvm, pgd, addr);
        do {
-               next = stage2_pud_addr_end(addr, end);
-               if (!stage2_pud_none(*pud)) {
-                       if (stage2_pud_huge(*pud)) {
+               next = stage2_pud_addr_end(kvm, addr, end);
+               if (!stage2_pud_none(kvm, *pud)) {
+                       if (stage2_pud_huge(kvm, *pud)) {
                                pud_t old_pud = *pud;
 
-                               stage2_pud_clear(pud);
+                               stage2_pud_clear(kvm, pud);
                                kvm_tlb_flush_vmid_ipa(kvm, addr);
                                kvm_flush_dcache_pud(old_pud);
                                put_page(virt_to_page(pud));
@@ -308,7 +307,7 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
                }
        } while (pud++, addr = next, addr != end);
 
-       if (stage2_pud_table_empty(start_pud))
+       if (stage2_pud_table_empty(kvm, start_pud))
                clear_stage2_pgd_entry(kvm, pgd, start_addr);
 }
 
@@ -332,7 +331,7 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
        assert_spin_locked(&kvm->mmu_lock);
        WARN_ON(size & ~PAGE_MASK);
 
-       pgd = kvm->arch.pgd + stage2_pgd_index(addr);
+       pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
        do {
                /*
                 * Make sure the page table is still active, as another thread
@@ -341,8 +340,8 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
                 */
                if (!READ_ONCE(kvm->arch.pgd))
                        break;
-               next = stage2_pgd_addr_end(addr, end);
-               if (!stage2_pgd_none(*pgd))
+               next = stage2_pgd_addr_end(kvm, addr, end);
+               if (!stage2_pgd_none(kvm, *pgd))
                        unmap_stage2_puds(kvm, pgd, addr, next);
                /*
                 * If the range is too large, release the kvm->mmu_lock
@@ -371,9 +370,9 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
        pmd_t *pmd;
        phys_addr_t next;
 
-       pmd = stage2_pmd_offset(pud, addr);
+       pmd = stage2_pmd_offset(kvm, pud, addr);
        do {
-               next = stage2_pmd_addr_end(addr, end);
+               next = stage2_pmd_addr_end(kvm, addr, end);
                if (!pmd_none(*pmd)) {
                        if (pmd_thp_or_huge(*pmd))
                                kvm_flush_dcache_pmd(*pmd);
@@ -389,11 +388,11 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
        pud_t *pud;
        phys_addr_t next;
 
-       pud = stage2_pud_offset(pgd, addr);
+       pud = stage2_pud_offset(kvm, pgd, addr);
        do {
-               next = stage2_pud_addr_end(addr, end);
-               if (!stage2_pud_none(*pud)) {
-                       if (stage2_pud_huge(*pud))
+               next = stage2_pud_addr_end(kvm, addr, end);
+               if (!stage2_pud_none(kvm, *pud)) {
+                       if (stage2_pud_huge(kvm, *pud))
                                kvm_flush_dcache_pud(*pud);
                        else
                                stage2_flush_pmds(kvm, pud, addr, next);
@@ -409,10 +408,11 @@ static void stage2_flush_memslot(struct kvm *kvm,
        phys_addr_t next;
        pgd_t *pgd;
 
-       pgd = kvm->arch.pgd + stage2_pgd_index(addr);
+       pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
        do {
-               next = stage2_pgd_addr_end(addr, end);
-               stage2_flush_puds(kvm, pgd, addr, next);
+               next = stage2_pgd_addr_end(kvm, addr, end);
+               if (!stage2_pgd_none(kvm, *pgd))
+                       stage2_flush_puds(kvm, pgd, addr, next);
        } while (pgd++, addr = next, addr != end);
 }
 
@@ -897,7 +897,7 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
        }
 
        /* Allocate the HW PGD, making sure that each page gets its own refcount */
-       pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO);
+       pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
        if (!pgd)
                return -ENOMEM;
 
@@ -986,7 +986,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 
        spin_lock(&kvm->mmu_lock);
        if (kvm->arch.pgd) {
-               unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
+               unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
                pgd = READ_ONCE(kvm->arch.pgd);
                kvm->arch.pgd = NULL;
        }
@@ -994,7 +994,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 
        /* Free the HW pgd, one page at a time */
        if (pgd)
-               free_pages_exact(pgd, S2_PGD_SIZE);
+               free_pages_exact(pgd, stage2_pgd_size(kvm));
 }
 
 static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
@@ -1003,16 +1003,16 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
        pgd_t *pgd;
        pud_t *pud;
 
-       pgd = kvm->arch.pgd + stage2_pgd_index(addr);
-       if (WARN_ON(stage2_pgd_none(*pgd))) {
+       pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+       if (stage2_pgd_none(kvm, *pgd)) {
                if (!cache)
                        return NULL;
                pud = mmu_memory_cache_alloc(cache);
-               stage2_pgd_populate(pgd, pud);
+               stage2_pgd_populate(kvm, pgd, pud);
                get_page(virt_to_page(pgd));
        }
 
-       return stage2_pud_offset(pgd, addr);
+       return stage2_pud_offset(kvm, pgd, addr);
 }
 
 static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
@@ -1025,15 +1025,15 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
        if (!pud)
                return NULL;
 
-       if (stage2_pud_none(*pud)) {
+       if (stage2_pud_none(kvm, *pud)) {
                if (!cache)
                        return NULL;
                pmd = mmu_memory_cache_alloc(cache);
-               stage2_pud_populate(pud, pmd);
+               stage2_pud_populate(kvm, pud, pmd);
                get_page(virt_to_page(pud));
        }
 
-       return stage2_pmd_offset(pud, addr);
+       return stage2_pmd_offset(kvm, pud, addr);
 }
 
 static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
@@ -1207,8 +1207,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
                if (writable)
                        pte = kvm_s2pte_mkwrite(pte);
 
-               ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
-                                               KVM_NR_MEM_OBJS);
+               ret = mmu_topup_memory_cache(&cache,
+                                            kvm_mmu_cache_min_pages(kvm),
+                                            KVM_NR_MEM_OBJS);
                if (ret)
                        goto out;
                spin_lock(&kvm->mmu_lock);
@@ -1230,8 +1231,14 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
 {
        kvm_pfn_t pfn = *pfnp;
        gfn_t gfn = *ipap >> PAGE_SHIFT;
+       struct page *page = pfn_to_page(pfn);
 
-       if (PageTransCompoundMap(pfn_to_page(pfn))) {
+       /*
+        * PageTransCompoundMap() returns true for THP and
+        * hugetlbfs. Make sure the adjustment is done only for THP
+        * pages.
+        */
+       if (!PageHuge(page) && PageTransCompoundMap(page)) {
                unsigned long mask;
                /*
                 * The address we faulted on is backed by a transparent huge
@@ -1296,19 +1303,21 @@ static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
 
 /**
  * stage2_wp_pmds - write protect PUD range
+ * kvm:                kvm instance for the VM
  * @pud:       pointer to pud entry
  * @addr:      range start address
  * @end:       range end address
  */
-static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
+                          phys_addr_t addr, phys_addr_t end)
 {
        pmd_t *pmd;
        phys_addr_t next;
 
-       pmd = stage2_pmd_offset(pud, addr);
+       pmd = stage2_pmd_offset(kvm, pud, addr);
 
        do {
-               next = stage2_pmd_addr_end(addr, end);
+               next = stage2_pmd_addr_end(kvm, addr, end);
                if (!pmd_none(*pmd)) {
                        if (pmd_thp_or_huge(*pmd)) {
                                if (!kvm_s2pmd_readonly(pmd))
@@ -1328,18 +1337,19 @@ static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
   *
   * Process PUD entries, for a huge PUD we cause a panic.
   */
-static void  stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
+static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
+                           phys_addr_t addr, phys_addr_t end)
 {
        pud_t *pud;
        phys_addr_t next;
 
-       pud = stage2_pud_offset(pgd, addr);
+       pud = stage2_pud_offset(kvm, pgd, addr);
        do {
-               next = stage2_pud_addr_end(addr, end);
-               if (!stage2_pud_none(*pud)) {
+               next = stage2_pud_addr_end(kvm, addr, end);
+               if (!stage2_pud_none(kvm, *pud)) {
                        /* TODO:PUD not supported, revisit later if supported */
-                       BUG_ON(stage2_pud_huge(*pud));
-                       stage2_wp_pmds(pud, addr, next);
+                       BUG_ON(stage2_pud_huge(kvm, *pud));
+                       stage2_wp_pmds(kvm, pud, addr, next);
                }
        } while (pud++, addr = next, addr != end);
 }
@@ -1355,7 +1365,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
        pgd_t *pgd;
        phys_addr_t next;
 
-       pgd = kvm->arch.pgd + stage2_pgd_index(addr);
+       pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
        do {
                /*
                 * Release kvm_mmu_lock periodically if the memory region is
@@ -1369,9 +1379,9 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
                cond_resched_lock(&kvm->mmu_lock);
                if (!READ_ONCE(kvm->arch.pgd))
                        break;
-               next = stage2_pgd_addr_end(addr, end);
-               if (stage2_pgd_present(*pgd))
-                       stage2_wp_puds(pgd, addr, next);
+               next = stage2_pgd_addr_end(kvm, addr, end);
+               if (stage2_pgd_present(kvm, *pgd))
+                       stage2_wp_puds(kvm, pgd, addr, next);
        } while (pgd++, addr = next, addr != end);
 }
 
@@ -1514,7 +1524,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        up_read(&current->mm->mmap_sem);
 
        /* We need minimum second+third level pages */
-       ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
+       ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
                                     KVM_NR_MEM_OBJS);
        if (ret)
                return ret;
@@ -1757,7 +1767,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
        }
 
        /* Userspace should not be able to register out-of-bounds IPAs */
-       VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);
+       VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
 
        if (fault_status == FSC_ACCESS) {
                handle_access_fault(vcpu, fault_ipa);
@@ -2056,7 +2066,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
         * space addressable by the KVM guest IPA space.
         */
        if (memslot->base_gfn + memslot->npages >=
-           (KVM_PHYS_SIZE >> PAGE_SHIFT))
+           (kvm_phys_size(kvm) >> PAGE_SHIFT))
                return -EFAULT;
 
        down_read(&current->mm->mmap_sem);
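
For context, the KVM_PHYS_SIZE/KVM_PHYS_MASK constants and KVM_MMU_CACHE_MIN_PAGES replaced in the hunks above become per-VM helpers driven by the guest's configurable IPA size. A minimal sketch of the assumed shape on the arm64 side (the authoritative definitions live under arch/arm64/include/asm/; treat the exact expressions below as illustrative):

    /* Sketch: per-VM IPA limits derived from the VM's VTCR_EL2 value (assumed). */
    #define kvm_phys_shift(kvm)          VTCR_EL2_IPA((kvm)->arch.vtcr)
    #define kvm_phys_size(kvm)           (_AC(1, ULL) << kvm_phys_shift(kvm))
    #define kvm_phys_mask(kvm)           (kvm_phys_size(kvm) - _AC(1, ULL))

    /* The page-table cache minimum scales with the per-VM stage2 level count. */
    #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1)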
index 1250225..eb2a390 100644
@@ -241,13 +241,6 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id,
        list_for_each_entry(dev, &(its)->device_list, dev_list) \
                list_for_each_entry(ite, &(dev)->itt_head, ite_list)
 
-/*
- * We only implement 48 bits of PA at the moment, although the ITS
- * supports more. Let's be restrictive here.
- */
-#define BASER_ADDRESS(x)       ((x) & GENMASK_ULL(47, 16))
-#define CBASER_ADDRESS(x)      ((x) & GENMASK_ULL(47, 12))
-
 #define GIC_LPI_OFFSET 8192
 
 #define VITS_TYPER_IDBITS 16
@@ -759,6 +752,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
 {
        int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
        u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
+       phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
        int esz = GITS_BASER_ENTRY_SIZE(baser);
        int index;
        gfn_t gfn;
@@ -783,7 +777,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
                if (id >= (l1_tbl_size / esz))
                        return false;
 
-               addr = BASER_ADDRESS(baser) + id * esz;
+               addr = base + id * esz;
                gfn = addr >> PAGE_SHIFT;
 
                if (eaddr)
@@ -798,7 +792,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
 
        /* Each 1st level entry is represented by a 64-bit value. */
        if (kvm_read_guest_lock(its->dev->kvm,
-                          BASER_ADDRESS(baser) + index * sizeof(indirect_ptr),
+                          base + index * sizeof(indirect_ptr),
                           &indirect_ptr, sizeof(indirect_ptr)))
                return false;
 
@@ -808,11 +802,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
        if (!(indirect_ptr & BIT_ULL(63)))
                return false;
 
-       /*
-        * Mask the guest physical address and calculate the frame number.
-        * Any address beyond our supported 48 bits of PA will be caught
-        * by the actual check in the final step.
-        */
+       /* Mask the guest physical address and calculate the frame number. */
        indirect_ptr &= GENMASK_ULL(51, 16);
 
        /* Find the address of the actual entry */
@@ -1304,9 +1294,6 @@ static u64 vgic_sanitise_its_baser(u64 reg)
                                  GITS_BASER_OUTER_CACHEABILITY_SHIFT,
                                  vgic_sanitise_outer_cacheability);
 
-       /* Bits 15:12 contain bits 51:48 of the PA, which we don't support. */
-       reg &= ~GENMASK_ULL(15, 12);
-
        /* We support only one (ITS) page size: 64K */
        reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K;
 
@@ -1325,11 +1312,8 @@ static u64 vgic_sanitise_its_cbaser(u64 reg)
                                  GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
                                  vgic_sanitise_outer_cacheability);
 
-       /*
-        * Sanitise the physical address to be 64k aligned.
-        * Also limit the physical addresses to 48 bits.
-        */
-       reg &= ~(GENMASK_ULL(51, 48) | GENMASK_ULL(15, 12));
+       /* Sanitise the physical address to be 64k aligned. */
+       reg &= ~GENMASK_ULL(15, 12);
 
        return reg;
 }
@@ -1375,7 +1359,7 @@ static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its)
        if (!its->enabled)
                return;
 
-       cbaser = CBASER_ADDRESS(its->cbaser);
+       cbaser = GITS_CBASER_ADDRESS(its->cbaser);
 
        while (its->cwriter != its->creadr) {
                int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr,
@@ -2233,7 +2217,7 @@ static int vgic_its_restore_device_tables(struct vgic_its *its)
        if (!(baser & GITS_BASER_VALID))
                return 0;
 
-       l1_gpa = BASER_ADDRESS(baser);
+       l1_gpa = GITS_BASER_ADDR_48_to_52(baser);
 
        if (baser & GITS_BASER_INDIRECT) {
                l1_esz = GITS_LVL1_ENTRY_SIZE;
@@ -2305,7 +2289,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its)
 {
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
        u64 baser = its->baser_coll_table;
-       gpa_t gpa = BASER_ADDRESS(baser);
+       gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser);
        struct its_collection *collection;
        u64 val;
        size_t max_size, filled = 0;
@@ -2354,7 +2338,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its)
        if (!(baser & GITS_BASER_VALID))
                return 0;
 
-       gpa = BASER_ADDRESS(baser);
+       gpa = GITS_BASER_ADDR_48_to_52(baser);
 
        max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
 
index 6ada243..114dce9 100644
@@ -25,7 +25,7 @@
 int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
                      phys_addr_t addr, phys_addr_t alignment)
 {
-       if (addr & ~KVM_PHYS_MASK)
+       if (addr & ~kvm_phys_mask(kvm))
                return -E2BIG;
 
        if (!IS_ALIGNED(addr, alignment))
index a2a175b..b3d1f09 100644
@@ -364,7 +364,6 @@ static u64 vgic_sanitise_pendbaser(u64 reg)
                                  vgic_sanitise_outer_cacheability);
 
        reg &= ~PENDBASER_RES0_MASK;
-       reg &= ~GENMASK_ULL(51, 48);
 
        return reg;
 }
@@ -382,7 +381,6 @@ static u64 vgic_sanitise_propbaser(u64 reg)
                                  vgic_sanitise_outer_cacheability);
 
        reg &= ~PROPBASER_RES0_MASK;
-       reg &= ~GENMASK_ULL(51, 48);
        return reg;
 }
 
index 9e65feb..3710342 100644
@@ -83,6 +83,7 @@ static int coalesced_mmio_write(struct kvm_vcpu *vcpu,
        ring->coalesced_mmio[ring->last].phys_addr = addr;
        ring->coalesced_mmio[ring->last].len = len;
        memcpy(ring->coalesced_mmio[ring->last].data, val, len);
+       ring->coalesced_mmio[ring->last].pio = dev->zone.pio;
        smp_wmb();
        ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
        spin_unlock(&dev->kvm->ring_lock);
@@ -140,6 +141,9 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
        int ret;
        struct kvm_coalesced_mmio_dev *dev;
 
+       if (zone->pio != 1 && zone->pio != 0)
+               return -EINVAL;
+
        dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
        if (!dev)
                return -ENOMEM;
@@ -149,8 +153,9 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
        dev->zone = *zone;
 
        mutex_lock(&kvm->slots_lock);
-       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr,
-                                     zone->size, &dev->dev);
+       ret = kvm_io_bus_register_dev(kvm,
+                               zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS,
+                               zone->addr, zone->size, &dev->dev);
        if (ret < 0)
                goto out_free_dev;
        list_add_tail(&dev->list, &kvm->coalesced_zones);
@@ -174,7 +179,8 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
 
        list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
                if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
-                       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+                       kvm_io_bus_unregister_dev(kvm,
+                               zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev);
                        kvm_iodevice_destructor(&dev->dev);
                }
 
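
With the zone's pio flag set, the coalesced-I/O device is registered on KVM_PIO_BUS instead of KVM_MMIO_BUS, and completed port writes land in the same shared ring tagged with .pio = 1; availability is advertised by the new KVM_CAP_COALESCED_PIO capability (see the kvm_main.c hunk further down). A hedged userspace sketch, with error handling trimmed and the pio field layout assumed from this series:

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    /* vm_fd: a VM file descriptor obtained from KVM_CREATE_VM. */
    static int register_coalesced_pio(int vm_fd)
    {
            struct kvm_coalesced_mmio_zone zone = {
                    .addr = 0x3f8,  /* example: UART data port */
                    .size = 1,
                    .pio  = 1,      /* new: treat this zone as port I/O */
            };

            /* Older kernels reject pio != 0, so probe the capability first. */
            if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_PIO) <= 0)
                    return -1;

            return ioctl(vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);
    }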
index f986e31..786ade1 100644
@@ -219,7 +219,7 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
        me = get_cpu();
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (!test_bit(i, vcpu_bitmap))
+               if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
                        continue;
 
                kvm_make_request(req, vcpu);
@@ -243,12 +243,10 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 {
        cpumask_var_t cpus;
        bool called;
-       static unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]
-               = {[0 ... BITS_TO_LONGS(KVM_MAX_VCPUS)-1] = ULONG_MAX};
 
        zalloc_cpumask_var(&cpus, GFP_ATOMIC);
 
-       called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap, cpus);
+       called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus);
 
        free_cpumask_var(cpus);
        return called;
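
A NULL vcpu_bitmap now means "request on every vCPU", which lets kvm_make_all_cpus_request() drop its statically sized all-ones bitmap. Hypothetical callers would look roughly like this (KVM_REQ_TLB_FLUSH used purely as an example request):

    /* Target every vCPU in the VM. */
    kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH, NULL, cpus);

    /* Target only the vCPUs set in a caller-provided bitmap. */
    kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH, vcpu_bitmap, cpus);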
@@ -807,20 +805,25 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
  * sorted array and known changed memslot position.
  */
 static void update_memslots(struct kvm_memslots *slots,
-                           struct kvm_memory_slot *new)
+                           struct kvm_memory_slot *new,
+                           enum kvm_mr_change change)
 {
        int id = new->id;
        int i = slots->id_to_index[id];
        struct kvm_memory_slot *mslots = slots->memslots;
 
        WARN_ON(mslots[i].id != id);
-       if (!new->npages) {
-               WARN_ON(!mslots[i].npages);
-               if (mslots[i].npages)
-                       slots->used_slots--;
-       } else {
-               if (!mslots[i].npages)
-                       slots->used_slots++;
+       switch (change) {
+       case KVM_MR_CREATE:
+               slots->used_slots++;
+               WARN_ON(mslots[i].npages || !new->npages);
+               break;
+       case KVM_MR_DELETE:
+               slots->used_slots--;
+               WARN_ON(new->npages || !mslots[i].npages);
+               break;
+       default:
+               break;
        }
 
        while (i < KVM_MEM_SLOTS_NUM - 1 &&
@@ -1056,7 +1059,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
                memset(&new.arch, 0, sizeof(new.arch));
        }
 
-       update_memslots(slots, &new);
+       update_memslots(slots, &new, change);
        old_memslots = install_new_memslots(kvm, as_id, slots);
 
        kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
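
update_memslots() now keys its used_slots bookkeeping on the memslot change type instead of inferring creation or deletion from npages. For reference, the change types passed down from __kvm_set_memory_region() are the usual enum kvm_mr_change values:

    enum kvm_mr_change {
            KVM_MR_CREATE,          /* a new slot is being created */
            KVM_MR_DELETE,          /* an existing slot is being removed */
            KVM_MR_MOVE,            /* the slot's guest-physical base moves */
            KVM_MR_FLAGS_ONLY,      /* only the slot's flags change */
    };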
@@ -1311,8 +1314,12 @@ unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
 
 /*
- * If writable is set to false, the hva returned by this function is only
- * allowed to be read.
+ * Return the hva of a @gfn and the R/W attribute if possible.
+ *
+ * @slot: the kvm_memory_slot which contains @gfn
+ * @gfn: the gfn to be translated
+ * @writable: used to return the read/write attribute of the @slot if the hva
+ * is valid and @writable is not NULL
  */
 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
                                      gfn_t gfn, bool *writable)
@@ -2946,6 +2953,8 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 #ifdef CONFIG_KVM_MMIO
        case KVM_CAP_COALESCED_MMIO:
                return KVM_COALESCED_MMIO_PAGE_OFFSET;
+       case KVM_CAP_COALESCED_PIO:
+               return 1;
 #endif
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
        case KVM_CAP_IRQ_ROUTING: