Merge tag 'kvmarm-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm...
authorPaolo Bonzini <pbonzini@redhat.com>
Tue, 31 Mar 2020 14:44:53 +0000 (10:44 -0400)
committerPaolo Bonzini <pbonzini@redhat.com>
Tue, 31 Mar 2020 14:44:53 +0000 (10:44 -0400)
KVM/arm updates for Linux 5.7

- GICv4.1 support
- 32bit host removal

126 files changed:
Documentation/admin-guide/kernel-parameters.txt
Documentation/virt/kvm/api.rst
Documentation/virt/kvm/devices/s390_flic.rst
Documentation/virt/kvm/index.rst
Documentation/virt/kvm/locking.rst
Documentation/virt/kvm/s390-pv-boot.rst [new file with mode: 0644]
Documentation/virt/kvm/s390-pv.rst [new file with mode: 0644]
MAINTAINERS
arch/arm64/kvm/fpsimd.c
arch/arm64/kvm/guest.c
arch/arm64/kvm/hyp/switch.c
arch/arm64/kvm/sys_regs.c
arch/arm64/kvm/sys_regs_generic_v8.c
arch/mips/include/asm/kvm_host.h
arch/mips/kvm/mips.c
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_64_vio.c
arch/powerpc/kvm/book3s_64_vio_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/mpic.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/timing.h
arch/s390/boot/Makefile
arch/s390/boot/uv.c
arch/s390/include/asm/gmap.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/mmu.h
arch/s390/include/asm/mmu_context.h
arch/s390/include/asm/page.h
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/uv.h
arch/s390/kernel/Makefile
arch/s390/kernel/entry.h
arch/s390/kernel/pgm_check.S
arch/s390/kernel/setup.c
arch/s390/kernel/uv.c [new file with mode: 0644]
arch/s390/kvm/Makefile
arch/s390/kvm/diag.c
arch/s390/kvm/gaccess.c
arch/s390/kvm/intercept.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/s390/kvm/pv.c [new file with mode: 0644]
arch/s390/mm/fault.c
arch/s390/mm/gmap.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/kvm_page_track.h
arch/x86/include/asm/vmx.h
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/emulate.c
arch/x86/kvm/i8254.c
arch/x86/kvm/kvm_emulate.h [moved from arch/x86/include/asm/kvm_emulate.h with 93% similarity]
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/page_track.c
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/pmu.c
arch/x86/kvm/pmu.h
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx/capabilities.h
arch/x86/kvm/vmx/evmcs.h
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/nested.h
arch/x86/kvm/vmx/pmu_intel.c
arch/x86/kvm/vmx/vmenter.S
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
include/linux/kvm_host.h
include/uapi/linux/kvm.h
tools/arch/x86/include/asm/unistd_64.h
tools/kvm/kvm_stat/kvm_stat
tools/kvm/kvm_stat/kvm_stat.txt
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/clear_dirty_log_test.c
tools/testing/selftests/kvm/demand_paging_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/dirty_log_test.c
tools/testing/selftests/kvm/include/evmcs.h
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/include/test_util.h
tools/testing/selftests/kvm/kvm_create_max_vcpus.c
tools/testing/selftests/kvm/lib/aarch64/processor.c
tools/testing/selftests/kvm/lib/aarch64/ucall.c
tools/testing/selftests/kvm/lib/assert.c
tools/testing/selftests/kvm/lib/io.c
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/lib/kvm_util_internal.h
tools/testing/selftests/kvm/lib/s390x/processor.c
tools/testing/selftests/kvm/lib/test_util.c [new file with mode: 0644]
tools/testing/selftests/kvm/lib/x86_64/processor.c
tools/testing/selftests/kvm/lib/x86_64/svm.c
tools/testing/selftests/kvm/lib/x86_64/vmx.c
tools/testing/selftests/kvm/s390x/memop.c
tools/testing/selftests/kvm/s390x/resets.c
tools/testing/selftests/kvm/s390x/sync_regs_test.c
tools/testing/selftests/kvm/steal_time.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
tools/testing/selftests/kvm/x86_64/evmcs_test.c
tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
tools/testing/selftests/kvm/x86_64/platform_info_test.c
tools/testing/selftests/kvm/x86_64/set_memory_region_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/smm_test.c
tools/testing/selftests/kvm/x86_64/state_test.c
tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c
tools/testing/selftests/kvm/x86_64/sync_regs_test.c
tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c
tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c
tools/testing/selftests/kvm/x86_64/xss_msr_test.c
virt/kvm/arm/arm.c
virt/kvm/arm/mmu.c
virt/kvm/arm/psci.c
virt/kvm/kvm_main.c

index c07815d..144c130 100644 (file)
                        before loading.
                        See Documentation/admin-guide/blockdev/ramdisk.rst.
 
+       prot_virt=      [S390] enable hosting protected virtual machines
+                       isolated from the hypervisor (if hardware supports
+                       that).
+                       Format: <bool>
+
        psi=            [KNL] Enable or disable pressure stall information
                        tracking.
                        Format: <bool>
index ebd383f..acebf6c 100644 (file)
@@ -1574,8 +1574,8 @@ This ioctl would set vcpu's xcr to the value userspace specified.
   };
 
   #define KVM_CPUID_FLAG_SIGNIFCANT_INDEX              BIT(0)
-  #define KVM_CPUID_FLAG_STATEFUL_FUNC         BIT(1)
-  #define KVM_CPUID_FLAG_STATE_READ_NEXT               BIT(2)
+  #define KVM_CPUID_FLAG_STATEFUL_FUNC         BIT(1) /* deprecated */
+  #define KVM_CPUID_FLAG_STATE_READ_NEXT               BIT(2) /* deprecated */
 
   struct kvm_cpuid_entry2 {
        __u32 function;
@@ -1626,13 +1626,6 @@ emulate them efficiently. The fields in each entry are defined as follows:
 
         KVM_CPUID_FLAG_SIGNIFCANT_INDEX:
            if the index field is valid
-        KVM_CPUID_FLAG_STATEFUL_FUNC:
-           if cpuid for this function returns different values for successive
-           invocations; there will be several entries with the same function,
-           all with this flag set
-        KVM_CPUID_FLAG_STATE_READ_NEXT:
-           for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is
-           the first entry to be read by a cpu
 
    eax, ebx, ecx, edx:
          the values returned by the cpuid instruction for
@@ -2117,7 +2110,8 @@ Errors:
 
   ======   ============================================================
   ENOENT   no such register
-  EINVAL   invalid register ID, or no such register
+  EINVAL   invalid register ID, or no such register or used with VMs in
+           protected virtualization mode on s390
   EPERM    (arm64) register access not allowed before vcpu finalization
   ======   ============================================================
 
@@ -2552,7 +2546,8 @@ Errors include:
 
   ======== ============================================================
   ENOENT   no such register
-  EINVAL   invalid register ID, or no such register
+  EINVAL   invalid register ID, or no such register or used with VMs in
+           protected virtualization mode on s390
   EPERM    (arm64) register access not allowed before vcpu finalization
   ======== ============================================================
 
@@ -3347,8 +3342,8 @@ The member 'flags' is used for passing flags from userspace.
 ::
 
   #define KVM_CPUID_FLAG_SIGNIFCANT_INDEX              BIT(0)
-  #define KVM_CPUID_FLAG_STATEFUL_FUNC         BIT(1)
-  #define KVM_CPUID_FLAG_STATE_READ_NEXT               BIT(2)
+  #define KVM_CPUID_FLAG_STATEFUL_FUNC         BIT(1) /* deprecated */
+  #define KVM_CPUID_FLAG_STATE_READ_NEXT               BIT(2) /* deprecated */
 
   struct kvm_cpuid_entry2 {
        __u32 function;
@@ -3394,13 +3389,6 @@ The fields in each entry are defined as follows:
 
         KVM_CPUID_FLAG_SIGNIFCANT_INDEX:
            if the index field is valid
-        KVM_CPUID_FLAG_STATEFUL_FUNC:
-           if cpuid for this function returns different values for successive
-           invocations; there will be several entries with the same function,
-           all with this flag set
-        KVM_CPUID_FLAG_STATE_READ_NEXT:
-           for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is
-           the first entry to be read by a cpu
 
    eax, ebx, ecx, edx:
 
@@ -4649,6 +4637,60 @@ the clear cpu reset definition in the POP. However, the cpu is not put
 into ESA mode. This reset is a superset of the initial reset.
 
 
+4.125 KVM_S390_PV_COMMAND
+-------------------------
+
+:Capability: KVM_CAP_S390_PROTECTED
+:Architectures: s390
+:Type: vm ioctl
+:Parameters: struct kvm_pv_cmd
+:Returns: 0 on success, < 0 on error
+
+::
+
+  struct kvm_pv_cmd {
+       __u32 cmd;      /* Command to be executed */
+       __u16 rc;       /* Ultravisor return code */
+       __u16 rrc;      /* Ultravisor return reason code */
+       __u64 data;     /* Data or address */
+       __u32 flags;    /* flags for future extensions. Must be 0 for now */
+       __u32 reserved[3];
+  };
+
+cmd values:
+
+KVM_PV_ENABLE
+  Allocate memory and register the VM with the Ultravisor, thereby
+  donating memory to the Ultravisor that will become inaccessible to
+  KVM. All existing CPUs are converted to protected ones. After this
+  command has succeeded, any CPU added via hotplug will become
+  protected during its creation as well.
+
+  Errors:
+
+  =====      =============================
+  EINTR      an unmasked signal is pending
+  =====      =============================
+
+KVM_PV_DISABLE
+
+  Deregister the VM from the Ultravisor and reclaim the memory that
+  had been donated to the Ultravisor, making it usable by the kernel
+  again.  All registered VCPUs are converted back to non-protected
+  ones.
+
+KVM_PV_VM_SET_SEC_PARMS
+  Pass the image header from VM memory to the Ultravisor in
+  preparation of image unpacking and verification.
+
+KVM_PV_VM_UNPACK
+  Unpack (protect and decrypt) a page of the encrypted boot image.
+
+KVM_PV_VM_VERIFY
+  Verify the integrity of the unpacked image. Only if this succeeds,
+  KVM is allowed to start protected VCPUs.
+
+
 5. The kvm_run structure
 ========================
 
@@ -5707,8 +5749,13 @@ and injected exceptions.
 :Architectures: x86, arm, arm64, mips
 :Parameters: args[0] whether feature should be enabled or not
 
-With this capability enabled, KVM_GET_DIRTY_LOG will not automatically
-clear and write-protect all pages that are returned as dirty.
+Valid flags are::
+
+  #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE   (1 << 0)
+  #define KVM_DIRTY_LOG_INITIALLY_SET           (1 << 1)
+
+With KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE set, KVM_GET_DIRTY_LOG will not
+automatically clear and write-protect all pages that are returned as dirty.
 Rather, userspace will have to do this operation separately using
 KVM_CLEAR_DIRTY_LOG.
 
@@ -5719,12 +5766,19 @@ than requiring to sync a full memslot; this ensures that KVM does not
 take spinlocks for an extended period of time.  Second, in some cases a
 large amount of time can pass between a call to KVM_GET_DIRTY_LOG and
 userspace actually using the data in the page.  Pages can be modified
-during this time, which is inefficint for both the guest and userspace:
+during this time, which is inefficient for both the guest and userspace:
 the guest will incur a higher penalty due to write protection faults,
 while userspace can see false reports of dirty pages.  Manual reprotection
 helps reducing this time, improving guest performance and reducing the
 number of dirty log false positives.
 
+With KVM_DIRTY_LOG_INITIALLY_SET set, all the bits of the dirty bitmap
+will be initialized to 1 when created.  This also improves performance because
+dirty logging can be enabled gradually in small chunks on the first call
+to KVM_CLEAR_DIRTY_LOG.  KVM_DIRTY_LOG_INITIALLY_SET depends on
+KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (it is also only available on
+x86 for now).
+
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make
 it hard or impossible to use it correctly.  The availability of
@@ -6027,3 +6081,14 @@ Architectures: s390
 
 This capability indicates that the KVM_S390_NORMAL_RESET and
 KVM_S390_CLEAR_RESET ioctls are available.
+
+8.23 KVM_CAP_S390_PROTECTED
+
+Architecture: s390
+
+
+This capability indicates that the Ultravisor has been initialized and
+KVM can therefore start protected VMs.
+This capability governs the KVM_S390_PV_COMMAND ioctl and the
+KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected
+guests when the state change is invalid.
index 954190d..ea96559 100644 (file)
@@ -108,16 +108,9 @@ Groups:
       mask or unmask the adapter, as specified in mask
 
     KVM_S390_IO_ADAPTER_MAP
-      perform a gmap translation for the guest address provided in addr,
-      pin a userspace page for the translated address and add it to the
-      list of mappings
-
-      .. note:: A new mapping will be created unconditionally; therefore,
-               the calling code should avoid making duplicate mappings.
-
+      This is now a no-op. The mapping is purely done by the irq route.
     KVM_S390_IO_ADAPTER_UNMAP
-      release a userspace page for the translated address specified in addr
-      from the list of mappings
+      This is now a no-op. The mapping is purely done by the irq route.
 
   KVM_DEV_FLIC_AISM
     modify the adapter-interruption-suppression mode for a given isc if the
index 774deae..dcc2526 100644 (file)
@@ -18,6 +18,8 @@ KVM
    nested-vmx
    ppc-pv
    s390-diag
+   s390-pv
+   s390-pv-boot
    timekeeping
    vcpu-requests
 
index c02291b..b21a34c 100644 (file)
@@ -96,19 +96,18 @@ will happen:
 We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap.
 
 For direct sp, we can easily avoid it since the spte of direct sp is fixed
-to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic()
-to pin gfn to pfn, because after gfn_to_pfn_atomic():
+to gfn.  For indirect sp, we disabled fast page fault for simplicity.
+
+A solution for indirect sp could be to pin the gfn, for example via
+kvm_vcpu_gfn_to_pfn_atomic, before the cmpxchg.  After the pinning:
 
 - We have held the refcount of pfn that means the pfn can not be freed and
   be reused for another gfn.
-- The pfn is writable that means it can not be shared between different gfns
+- The pfn is writable and therefore it cannot be shared between different gfns
   by KSM.
 
 Then, we can ensure the dirty bitmap is correctly set for a gfn.
 
-Currently, to simplify the whole things, we disable fast page fault for
-indirect shadow page.
-
 2) Dirty bit tracking
 
 In the original code, the spte can be fast updated (non-atomically) if the
diff --git a/Documentation/virt/kvm/s390-pv-boot.rst b/Documentation/virt/kvm/s390-pv-boot.rst
new file mode 100644 (file)
index 0000000..8b8fa03
--- /dev/null
@@ -0,0 +1,84 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================================
+s390 (IBM Z) Boot/IPL of Protected VMs
+======================================
+
+Summary
+-------
+The memory of Protected Virtual Machines (PVMs) is not accessible to
+I/O or the hypervisor. In those cases where the hypervisor needs to
+access the memory of a PVM, that memory must be made accessible.
+Memory made accessible to the hypervisor will be encrypted. See
+:doc:`s390-pv` for details.
+
+On IPL (boot) a small plaintext bootloader is started, which provides
+information about the encrypted components and necessary metadata to
+KVM to decrypt the protected virtual machine.
+
+Based on this data, KVM will make the protected virtual machine known
+to the Ultravisor (UV) and instruct it to secure the memory of the
+PVM, decrypt the components and verify the data and address list
+hashes, to ensure integrity. Afterwards KVM can run the PVM via the
+SIE instruction which the UV will intercept and execute on KVM's
+behalf.
+
+As the guest image is just like an opaque kernel image that does the
+switch into PV mode itself, the user can load encrypted guest
+executables and data via every available method (network, dasd, scsi,
+direct kernel, ...) without the need to change the boot process.
+
+
+Diag308
+-------
+This diagnose instruction is the basic mechanism to handle IPL and
+related operations for virtual machines. The VM can set and retrieve
+IPL information blocks, that specify the IPL method/devices and
+request VM memory and subsystem resets, as well as IPLs.
+
+For PVMs this concept has been extended with new subcodes:
+
+Subcode 8: Set an IPL Information Block of type 5 (information block
+for PVMs)
+Subcode 9: Store the saved block in guest memory
+Subcode 10: Move into Protected Virtualization mode
+
+The new PV load-device-specific-parameters field specifies all data
+that is necessary to move into PV mode.
+
+* PV Header origin
+* PV Header length
+* List of Components composed of
+   * AES-XTS Tweak prefix
+   * Origin
+   * Size
+
+The PV header contains the keys and hashes, which the UV will use to
+decrypt and verify the PV, as well as control flags and a start PSW.
+
+The components are for instance an encrypted kernel, kernel parameters
+and initrd. The components are decrypted by the UV.
+
+After the initial import of the encrypted data, all defined pages will
+contain the guest content. All non-specified pages will start out as
+zero pages on first access.
+
+
+When running in protected virtualization mode, some subcodes will result in
+exceptions or return error codes.
+
+Subcodes 4 and 7, which specify operations that do not clear the guest
+memory, will result in specification exceptions. This is because the
+UV will clear all memory when a secure VM is removed, and therefore
+non-clearing IPL subcodes are not allowed.
+
+Subcodes 8, 9, 10 will result in specification exceptions.
+Re-IPL into a protected mode is only possible via a detour into
+non-protected mode.
+
+Keys
+----
+Every CEC will have a unique public key to enable tooling to build
+encrypted images.
+See  `s390-tools <https://github.com/ibm-s390-tools/s390-tools/>`_
+for the tooling.
diff --git a/Documentation/virt/kvm/s390-pv.rst b/Documentation/virt/kvm/s390-pv.rst
new file mode 100644 (file)
index 0000000..774a8c6
--- /dev/null
@@ -0,0 +1,116 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+s390 (IBM Z) Ultravisor and Protected VMs
+=========================================
+
+Summary
+-------
+Protected virtual machines (PVM) are KVM VMs that do not allow KVM to
+access VM state like guest memory or guest registers. Instead, the
+PVMs are mostly managed by a new entity called Ultravisor (UV). The UV
+provides an API that can be used by PVMs and KVM to request management
+actions.
+
+Each guest starts in non-protected mode and then may make a request to
+transition into protected mode. On transition, KVM registers the guest
+and its VCPUs with the Ultravisor and prepares everything for running
+it.
+
+The Ultravisor will secure and decrypt the guest's boot memory
+(i.e. kernel/initrd). It will safeguard state changes like VCPU
+starts/stops and injected interrupts while the guest is running.
+
+As access to the guest's state, such as the SIE state description, is
+normally needed to be able to run a VM, some changes have been made in
+the behavior of the SIE instruction. A new format 4 state description
+has been introduced, where some fields have different meanings for a
+PVM. SIE exits are minimized as much as possible to improve speed and
+reduce exposed guest state.
+
+
+Interrupt injection
+-------------------
+Interrupt injection is safeguarded by the Ultravisor. As KVM doesn't
+have access to the VCPUs' lowcores, injection is handled via the
+format 4 state description.
+
+Machine check, external, IO and restart interruptions each can be
+injected on SIE entry via a bit in the interrupt injection control
+field (offset 0x54). If the guest cpu is not enabled for the interrupt
+at the time of injection, a validity interception is recognized. The
+format 4 state description contains fields in the interception data
+block where data associated with the interrupt can be transported.
+
+Program and Service Call exceptions have another layer of
+safeguarding; they can only be injected for instructions that have
+been intercepted into KVM. The exceptions need to be a valid outcome
+of an instruction emulation by KVM, e.g. we can never inject an
+addressing exception as they are reported by SIE since KVM has no
+access to the guest memory.
+
+
+Mask notification interceptions
+-------------------------------
+KVM cannot intercept lctl(g) and lpsw(e) anymore in order to be
+notified when a PVM enables a certain class of interrupt.  As a
+replacement, two new interception codes have been introduced: One
+indicating that the contents of CRs 0, 6, or 14 have been changed,
+indicating different interruption subclasses; and one indicating that
+PSW bit 13 has been changed, indicating that a machine check
+intervention was requested and those are now enabled.
+
+Instruction emulation
+---------------------
+With the format 4 state description for PVMs, the SIE instruction already
+interprets more instructions than it does with format 2. It is not able
+to interpret every instruction, but needs to hand some tasks to KVM;
+therefore, the SIE and the ultravisor safeguard emulation inputs and outputs.
+
+The control structures associated with SIE provide the Secure
+Instruction Data Area (SIDA), the Interception Parameters (IP) and the
+Secure Interception General Register Save Area.  Guest GRs and most of
+the instruction data, such as I/O data structures, are filtered.
+Instruction data is copied to and from the SIDA when needed.  Guest
+GRs are put into / retrieved from the Secure Interception General
+Register Save Area.
+
+Only GR values needed to emulate an instruction will be copied into this
+save area and the real register numbers will be hidden.
+
+The Interception Parameters state description field still contains the
+bytes of the instruction text, but with pre-set register values
+instead of the actual ones. I.e. each instruction always uses the same
+instruction text, in order not to leak guest instruction text.
+This also implies that the register content that a guest had in r<n>
+may be in r<m> from the hypervisor's point of view.
+
+The Secure Instruction Data Area contains instruction storage
+data. Instruction data, i.e. data being referenced by an instruction
+like the SCCB for sclp, is moved via the SIDA. When an instruction is
+intercepted, the SIE will only allow data and program interrupts for
+this instruction to be moved to the guest via the two data areas
+discussed before. Other data is either ignored or results in validity
+interceptions.
+
+
+Instruction emulation interceptions
+-----------------------------------
+There are two types of SIE secure instruction intercepts: the normal
+and the notification type. Normal secure instruction intercepts will
+make the guest pending for instruction completion of the intercepted
+instruction type, i.e. on SIE entry it is attempted to complete
+emulation of the instruction with the data provided by KVM. That might
+be a program exception or instruction completion.
+
+The notification type intercepts inform KVM about guest environment
+changes due to guest instruction interpretation. Such an interception
+is recognized, for example, for the store prefix instruction to provide
+the new lowcore location. On SIE reentry, any KVM data in the data areas
+is ignored and execution continues as if the guest instruction had
+completed. For that reason KVM is not allowed to inject a program
+interrupt.
+
+Links
+-----
+`KVM Forum 2019 presentation <https://static.sched.com/hosted_files/kvmforum2019/3b/ibm_protected_vms_s390x.pdf>`_
index e84a94e..d87d009 100644 (file)
@@ -9207,6 +9207,7 @@ L:        kvm@vger.kernel.org
 W:     http://www.ibm.com/developerworks/linux/linux390/
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git
 S:     Supported
+F:     Documentation/virt/kvm/s390*
 F:     arch/s390/include/uapi/asm/kvm*
 F:     arch/s390/include/asm/gmap.h
 F:     arch/s390/include/asm/kvm*
index 5250105..e329a36 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/kvm_host.h>
 #include <asm/fpsimd.h>
 #include <asm/kvm_asm.h>
-#include <asm/kvm_host.h>
 #include <asm/kvm_mmu.h>
 #include <asm/sysreg.h>
 
index 2bd9230..23ebe51 100644 (file)
@@ -25,7 +25,6 @@
 #include <asm/kvm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_coproc.h>
-#include <asm/kvm_host.h>
 #include <asm/sigcontext.h>
 
 #include "trace.h"
index 925086b..600010c 100644 (file)
@@ -17,7 +17,6 @@
 #include <asm/kprobes.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
-#include <asm/kvm_host.h>
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 #include <asm/fpsimd.h>
index 3e909b1..b95f7b7 100644 (file)
@@ -22,7 +22,6 @@
 #include <asm/kvm_arm.h>
 #include <asm/kvm_coproc.h>
 #include <asm/kvm_emulate.h>
-#include <asm/kvm_host.h>
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 #include <asm/perf_event.h>
index 2b4a3e2..9cb6b4c 100644 (file)
@@ -12,7 +12,6 @@
 #include <asm/cputype.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_asm.h>
-#include <asm/kvm_host.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_coproc.h>
 #include <asm/sysreg.h>
index 41204a4..2c343c3 100644 (file)
@@ -1133,7 +1133,7 @@ extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm);
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
-               struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
+                                        struct kvm_memory_slot *slot) {}
 static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
index 71244bf..7850775 100644 (file)
@@ -188,12 +188,6 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl,
        return -ENOIOCTLCMD;
 }
 
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
-                           unsigned long npages)
-{
-       return 0;
-}
-
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
        /* Flush whole GPA */
@@ -230,7 +224,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                                   const struct kvm_userspace_memory_region *mem,
-                                  const struct kvm_memory_slot *old,
+                                  struct kvm_memory_slot *old,
                                   const struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
 {
@@ -984,69 +978,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
        return r;
 }
 
-/**
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
- * @kvm: kvm instance
- * @log: slot id and address to which we copy the log
- *
- * Steps 1-4 below provide general overview of dirty page logging. See
- * kvm_get_dirty_log_protect() function description for additional details.
- *
- * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
- * always flush the TLB (step 4) even if previous step failed  and the dirty
- * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
- * does not preclude user space subsequent dirty log read. Flushing TLB ensures
- * writes will be marked dirty for next log read.
- *
- *   1. Take a snapshot of the bit and clear it if needed.
- *   2. Write protect the corresponding page.
- *   3. Copy the snapshot to the userspace.
- *   4. Flush TLB's if needed.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-       bool flush = false;
-       int r;
-
-       mutex_lock(&kvm->slots_lock);
-
-       r = kvm_get_dirty_log_protect(kvm, log, &flush);
-
-       if (flush) {
-               slots = kvm_memslots(kvm);
-               memslot = id_to_memslot(slots, log->slot);
 
-               /* Let implementation handle TLB/GVA invalidation */
-               kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
-       }
-
-       mutex_unlock(&kvm->slots_lock);
-       return r;
 }
 
-int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
+void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
+                                       struct kvm_memory_slot *memslot)
 {
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-       bool flush = false;
-       int r;
-
-       mutex_lock(&kvm->slots_lock);
-
-       r = kvm_clear_dirty_log_protect(kvm, log, &flush);
-
-       if (flush) {
-               slots = kvm_memslots(kvm);
-               memslot = id_to_memslot(slots, log->slot);
-
-               /* Let implementation handle TLB/GVA invalidation */
-               kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
-       }
-
-       mutex_unlock(&kvm->slots_lock);
-       return r;
+       /* Let implementation handle TLB/GVA invalidation */
+       kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
 }
 
 long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
index bc2494e..406ec46 100644 (file)
@@ -200,14 +200,11 @@ extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern void kvmppc_core_free_memslot(struct kvm *kvm,
-                                    struct kvm_memory_slot *free,
-                                    struct kvm_memory_slot *dont);
-extern int kvmppc_core_create_memslot(struct kvm *kvm,
-                                     struct kvm_memory_slot *slot,
-                                     unsigned long npages);
+                                    struct kvm_memory_slot *slot);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
-                               const struct kvm_userspace_memory_region *mem);
+                               const struct kvm_userspace_memory_region *mem,
+                               enum kvm_mr_change change);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
                                const struct kvm_userspace_memory_region *mem,
                                const struct kvm_memory_slot *old,
@@ -280,7 +277,8 @@ struct kvmppc_ops {
        void (*flush_memslot)(struct kvm *kvm, struct kvm_memory_slot *memslot);
        int (*prepare_memory_region)(struct kvm *kvm,
                                     struct kvm_memory_slot *memslot,
-                                    const struct kvm_userspace_memory_region *mem);
+                                    const struct kvm_userspace_memory_region *mem,
+                                    enum kvm_mr_change change);
        void (*commit_memory_region)(struct kvm *kvm,
                                     const struct kvm_userspace_memory_region *mem,
                                     const struct kvm_memory_slot *old,
@@ -292,10 +290,7 @@ struct kvmppc_ops {
        int (*test_age_hva)(struct kvm *kvm, unsigned long hva);
        void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte);
        void (*mmu_destroy)(struct kvm_vcpu *vcpu);
-       void (*free_memslot)(struct kvm_memory_slot *free,
-                            struct kvm_memory_slot *dont);
-       int (*create_memslot)(struct kvm_memory_slot *slot,
-                             unsigned long npages);
+       void (*free_memslot)(struct kvm_memory_slot *slot);
        int (*init_vm)(struct kvm *kvm);
        void (*destroy_vm)(struct kvm *kvm);
        int (*get_smmu_info)(struct kvm *kvm, struct kvm_ppc_smmu_info *info);
index d07a8e1..0adaf47 100644 (file)
@@ -799,21 +799,19 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
        return vcpu->kvm->arch.kvm_ops->check_requests(vcpu);
 }
 
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
-       return kvm->arch.kvm_ops->get_dirty_log(kvm, log);
+
 }
 
-void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-                             struct kvm_memory_slot *dont)
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
-       kvm->arch.kvm_ops->free_memslot(free, dont);
+       return kvm->arch.kvm_ops->get_dirty_log(kvm, log);
 }
 
-int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
-                              unsigned long npages)
+void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
-       return kvm->arch.kvm_ops->create_memslot(slot, npages);
+       kvm->arch.kvm_ops->free_memslot(slot);
 }
 
 void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
@@ -823,9 +821,11 @@ void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
-                               const struct kvm_userspace_memory_region *mem)
+                               const struct kvm_userspace_memory_region *mem,
+                               enum kvm_mr_change change)
 {
-       return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem);
+       return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem,
+                                                       change);
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
index ee6c103..50555ad 100644 (file)
@@ -27,7 +27,6 @@
 #include <asm/hvcall.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
-#include <asm/kvm_host.h>
 #include <asm/udbg.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
index ab6eeb8..6fcaf1f 100644 (file)
@@ -24,7 +24,6 @@
 #include <asm/hvcall.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
-#include <asm/kvm_host.h>
 #include <asm/udbg.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
index 2cefd07..fbc55a1 100644 (file)
@@ -72,7 +72,6 @@
 #include <asm/xics.h>
 #include <asm/xive.h>
 #include <asm/hw_breakpoint.h>
-#include <asm/kvm_host.h>
 #include <asm/kvm_book3s_uvmem.h>
 #include <asm/ultravisor.h>
 
@@ -4400,7 +4399,7 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
        slots = kvm_memslots(kvm);
        memslot = id_to_memslot(slots, log->slot);
        r = -ENOENT;
-       if (!memslot->dirty_bitmap)
+       if (!memslot || !memslot->dirty_bitmap)
                goto out;
 
        /*
@@ -4447,29 +4446,26 @@ out:
        return r;
 }
 
-static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
-                                       struct kvm_memory_slot *dont)
+static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *slot)
 {
-       if (!dont || free->arch.rmap != dont->arch.rmap) {
-               vfree(free->arch.rmap);
-               free->arch.rmap = NULL;
-       }
+       vfree(slot->arch.rmap);
+       slot->arch.rmap = NULL;
 }
 
-static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
-                                        unsigned long npages)
+static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
+                                       struct kvm_memory_slot *slot,
+                                       const struct kvm_userspace_memory_region *mem,
+                                       enum kvm_mr_change change)
 {
-       slot->arch.rmap = vzalloc(array_size(npages, sizeof(*slot->arch.rmap)));
-       if (!slot->arch.rmap)
-               return -ENOMEM;
+       unsigned long npages = mem->memory_size >> PAGE_SHIFT;
 
-       return 0;
-}
+       if (change == KVM_MR_CREATE) {
+               slot->arch.rmap = vzalloc(array_size(npages,
+                                         sizeof(*slot->arch.rmap)));
+               if (!slot->arch.rmap)
+                       return -ENOMEM;
+       }
 
-static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot,
-                                       const struct kvm_userspace_memory_region *mem)
-{
        return 0;
 }
 
@@ -5528,7 +5524,6 @@ static struct kvmppc_ops kvm_ops_hv = {
        .set_spte_hva = kvm_set_spte_hva_hv,
        .mmu_destroy  = kvmppc_mmu_destroy_hv,
        .free_memslot = kvmppc_core_free_memslot_hv,
-       .create_memslot = kvmppc_core_create_memslot_hv,
        .init_vm =  kvmppc_core_init_vm_hv,
        .destroy_vm = kvmppc_core_destroy_vm_hv,
        .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
index 729a0f1..3bc2f5d 100644 (file)
@@ -1884,7 +1884,6 @@ out:
 static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm,
                                         struct kvm_dirty_log *log)
 {
-       struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        struct kvm_vcpu *vcpu;
        ulong ga, ga_end;
@@ -1894,15 +1893,12 @@ static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm,
 
        mutex_lock(&kvm->slots_lock);
 
-       r = kvm_get_dirty_log(kvm, log, &is_dirty);
+       r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot);
        if (r)
                goto out;
 
        /* If nothing is dirty, don't bother messing with page tables. */
        if (is_dirty) {
-               slots = kvm_memslots(kvm);
-               memslot = id_to_memslot(slots, log->slot);
-
                ga = memslot->base_gfn << PAGE_SHIFT;
                ga_end = ga + (memslot->npages << PAGE_SHIFT);
 
@@ -1927,7 +1923,8 @@ static void kvmppc_core_flush_memslot_pr(struct kvm *kvm,
 
 static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm,
                                        struct kvm_memory_slot *memslot,
-                                       const struct kvm_userspace_memory_region *mem)
+                                       const struct kvm_userspace_memory_region *mem,
+                                       enum kvm_mr_change change)
 {
        return 0;
 }
@@ -1941,19 +1938,11 @@ static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm,
        return;
 }
 
-static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *free,
-                                       struct kvm_memory_slot *dont)
+static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *slot)
 {
        return;
 }
 
-static int kvmppc_core_create_memslot_pr(struct kvm_memory_slot *slot,
-                                        unsigned long npages)
-{
-       return 0;
-}
-
-
 #ifdef CONFIG_PPC64
 static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm,
                                         struct kvm_ppc_smmu_info *info)
@@ -2099,7 +2088,6 @@ static struct kvmppc_ops kvm_ops_pr = {
        .set_spte_hva = kvm_set_spte_hva_pr,
        .mmu_destroy  = kvmppc_mmu_destroy_pr,
        .free_memslot = kvmppc_core_free_memslot_pr,
-       .create_memslot = kvmppc_core_create_memslot_pr,
        .init_vm = kvmppc_core_init_vm_pr,
        .destroy_vm = kvmppc_core_destroy_vm_pr,
        .get_smmu_info = kvm_vm_ioctl_get_smmu_info_pr,
index 7b27604..c9f4b37 100644 (file)
@@ -1766,25 +1766,24 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
        return r;
 }
 
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
-       return -ENOTSUPP;
+
 }
 
-void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-                             struct kvm_memory_slot *dont)
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
+       return -ENOTSUPP;
 }
 
-int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
-                              unsigned long npages)
+void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
-       return 0;
 }
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
                                      struct kvm_memory_slot *memslot,
-                                     const struct kvm_userspace_memory_region *mem)
+                                     const struct kvm_userspace_memory_region *mem,
+                                     enum kvm_mr_change change)
 {
        return 0;
 }
index fe312c1..23e9c2b 100644 (file)
@@ -32,7 +32,6 @@
 #include <linux/uaccess.h>
 #include <asm/mpic.h>
 #include <asm/kvm_para.h>
-#include <asm/kvm_host.h>
 #include <asm/kvm_ppc.h>
 #include <kvm/iodev.h>
 
index 1af96fb..62ee66d 100644 (file)
@@ -32,7 +32,6 @@
 #include <asm/plpar_wrappers.h>
 #endif
 #include <asm/ultravisor.h>
-#include <asm/kvm_host.h>
 
 #include "timing.h"
 #include "irq.h"
@@ -685,16 +684,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
        return -EINVAL;
 }
 
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-                          struct kvm_memory_slot *dont)
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
-       kvmppc_core_free_memslot(kvm, free, dont);
-}
-
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
-                           unsigned long npages)
-{
-       return kvmppc_core_create_memslot(kvm, slot, npages);
+       kvmppc_core_free_memslot(kvm, slot);
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -702,12 +694,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   const struct kvm_userspace_memory_region *mem,
                                   enum kvm_mr_change change)
 {
-       return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
+       return kvmppc_core_prepare_memory_region(kvm, memslot, mem, change);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                                   const struct kvm_userspace_memory_region *mem,
-                                  const struct kvm_memory_slot *old,
+                                  struct kvm_memory_slot *old,
                                   const struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
 {
index ace65f9..feef788 100644 (file)
@@ -10,7 +10,6 @@
 #define __POWERPC_KVM_EXITTIMING_H__
 
 #include <linux/kvm_host.h>
-#include <asm/kvm_host.h>
 
 #ifdef CONFIG_KVM_EXIT_TIMING
 void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu);
index 0ff9261..45b33b8 100644 (file)
@@ -37,7 +37,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
 obj-y  := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o
 obj-y  += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
 obj-y  += version.o pgm_check_info.o ctype.o text_dma.o
-obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST)   += uv.o
+obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))  += uv.o
 obj-$(CONFIG_RELOCATABLE)      += machine_kexec_reloc.o
 obj-$(CONFIG_RANDOMIZE_BASE)   += kaslr.o
 targets        := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
index 3f50115..8fde561 100644 (file)
@@ -3,7 +3,13 @@
 #include <asm/facility.h>
 #include <asm/sections.h>
 
+/* will be used in arch/s390/kernel/uv.c */
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
 int __bootdata_preserved(prot_virt_guest);
+#endif
+#if IS_ENABLED(CONFIG_KVM)
+struct uv_info __bootdata_preserved(uv_info);
+#endif
 
 void uv_query_info(void)
 {
@@ -19,7 +25,21 @@ void uv_query_info(void)
        if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != 0x100)
                return;
 
+       if (IS_ENABLED(CONFIG_KVM)) {
+               memcpy(uv_info.inst_calls_list, uvcb.inst_calls_list, sizeof(uv_info.inst_calls_list));
+               uv_info.uv_base_stor_len = uvcb.uv_base_stor_len;
+               uv_info.guest_base_stor_len = uvcb.conf_base_phys_stor_len;
+               uv_info.guest_virt_base_stor_len = uvcb.conf_base_virt_stor_len;
+               uv_info.guest_virt_var_stor_len = uvcb.conf_virt_var_stor_len;
+               uv_info.guest_cpu_stor_len = uvcb.cpu_stor_len;
+               uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE);
+               uv_info.max_num_sec_conf = uvcb.max_num_sec_conf;
+               uv_info.max_guest_cpus = uvcb.max_guest_cpus;
+       }
+
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
        if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) &&
            test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list))
                prot_virt_guest = 1;
+#endif
 }
index 37f96b6..a816fb4 100644 (file)
@@ -9,6 +9,7 @@
 #ifndef _ASM_S390_GMAP_H
 #define _ASM_S390_GMAP_H
 
+#include <linux/radix-tree.h>
 #include <linux/refcount.h>
 
 /* Generic bits for GMAP notification on DAT table entry changes. */
@@ -31,6 +32,7 @@
  * @table: pointer to the page directory
  * @asce: address space control element for gmap page table
  * @pfault_enabled: defines if pfaults are applicable for the guest
+ * @guest_handle: protected virtual machine handle for the ultravisor
  * @host_to_rmap: radix tree with gmap_rmap lists
  * @children: list of shadow gmap structures
  * @pt_list: list of all page tables used in the shadow guest address space
@@ -54,6 +56,8 @@ struct gmap {
        unsigned long asce_end;
        void *private;
        bool pfault_enabled;
+       /* only set for protected virtual machines */
+       unsigned long guest_handle;
        /* Additional data for shadow guest address spaces */
        struct radix_tree_root host_to_rmap;
        struct list_head children;
@@ -144,4 +148,6 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start,
 
 void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
                             unsigned long gaddr, unsigned long vmaddr);
+int gmap_mark_unmergeable(void);
+void s390_reset_acc(struct mm_struct *mm);
 #endif /* _ASM_S390_GMAP_H */
index 1726224..d6bcd34 100644 (file)
@@ -127,6 +127,12 @@ struct mcck_volatile_info {
 #define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \
                           CR14_EXTERNAL_DAMAGE_SUBMASK)
 
+#define SIDAD_SIZE_MASK                0xff
+#define sida_origin(sie_block) \
+       ((sie_block)->sidad & PAGE_MASK)
+#define sida_size(sie_block) \
+       ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE)
+
 #define CPUSTAT_STOPPED    0x80000000
 #define CPUSTAT_WAIT       0x10000000
 #define CPUSTAT_ECALL_PEND 0x08000000
@@ -160,7 +166,13 @@ struct kvm_s390_sie_block {
        __u8    reserved08[4];          /* 0x0008 */
 #define PROG_IN_SIE (1<<0)
        __u32   prog0c;                 /* 0x000c */
-       __u8    reserved10[16];         /* 0x0010 */
+       union {
+               __u8    reserved10[16];         /* 0x0010 */
+               struct {
+                       __u64   pv_handle_cpu;
+                       __u64   pv_handle_config;
+               };
+       };
 #define PROG_BLOCK_SIE (1<<0)
 #define PROG_REQUEST   (1<<1)
        atomic_t prog20;                /* 0x0020 */
@@ -209,10 +221,23 @@ struct kvm_s390_sie_block {
 #define ICPT_PARTEXEC  0x38
 #define ICPT_IOINST    0x40
 #define ICPT_KSS       0x5c
+#define ICPT_MCHKREQ   0x60
+#define ICPT_INT_ENABLE        0x64
+#define ICPT_PV_INSTR  0x68
+#define ICPT_PV_NOTIFY 0x6c
+#define ICPT_PV_PREF   0x70
        __u8    icptcode;               /* 0x0050 */
        __u8    icptstatus;             /* 0x0051 */
        __u16   ihcpu;                  /* 0x0052 */
-       __u8    reserved54[2];          /* 0x0054 */
+       __u8    reserved54;             /* 0x0054 */
+#define IICTL_CODE_NONE                 0x00
+#define IICTL_CODE_MCHK                 0x01
+#define IICTL_CODE_EXT          0x02
+#define IICTL_CODE_IO           0x03
+#define IICTL_CODE_RESTART      0x04
+#define IICTL_CODE_SPECIFICATION 0x10
+#define IICTL_CODE_OPERAND      0x11
+       __u8    iictl;                  /* 0x0055 */
        __u16   ipa;                    /* 0x0056 */
        __u32   ipb;                    /* 0x0058 */
        __u32   scaoh;                  /* 0x005c */
@@ -233,7 +258,7 @@ struct kvm_s390_sie_block {
 #define ECB3_RI  0x01
        __u8    ecb3;                   /* 0x0063 */
        __u32   scaol;                  /* 0x0064 */
-       __u8    reserved68;             /* 0x0068 */
+       __u8    sdf;                    /* 0x0068 */
        __u8    epdx;                   /* 0x0069 */
        __u8    reserved6a[2];          /* 0x006a */
        __u32   todpr;                  /* 0x006c */
@@ -249,31 +274,58 @@ struct kvm_s390_sie_block {
 #define HPID_KVM       0x4
 #define HPID_VSIE      0x5
        __u8    hpid;                   /* 0x00b8 */
-       __u8    reservedb9[11];         /* 0x00b9 */
-       __u16   extcpuaddr;             /* 0x00c4 */
-       __u16   eic;                    /* 0x00c6 */
+       __u8    reservedb9[7];          /* 0x00b9 */
+       union {
+               struct {
+                       __u32   eiparams;       /* 0x00c0 */
+                       __u16   extcpuaddr;     /* 0x00c4 */
+                       __u16   eic;            /* 0x00c6 */
+               };
+               __u64   mcic;                   /* 0x00c0 */
+       } __packed;
        __u32   reservedc8;             /* 0x00c8 */
-       __u16   pgmilc;                 /* 0x00cc */
-       __u16   iprcc;                  /* 0x00ce */
-       __u32   dxc;                    /* 0x00d0 */
-       __u16   mcn;                    /* 0x00d4 */
-       __u8    perc;                   /* 0x00d6 */
-       __u8    peratmid;               /* 0x00d7 */
+       union {
+               struct {
+                       __u16   pgmilc;         /* 0x00cc */
+                       __u16   iprcc;          /* 0x00ce */
+               };
+               __u32   edc;                    /* 0x00cc */
+       } __packed;
+       union {
+               struct {
+                       __u32   dxc;            /* 0x00d0 */
+                       __u16   mcn;            /* 0x00d4 */
+                       __u8    perc;           /* 0x00d6 */
+                       __u8    peratmid;       /* 0x00d7 */
+               };
+               __u64   faddr;                  /* 0x00d0 */
+       } __packed;
        __u64   peraddr;                /* 0x00d8 */
        __u8    eai;                    /* 0x00e0 */
        __u8    peraid;                 /* 0x00e1 */
        __u8    oai;                    /* 0x00e2 */
        __u8    armid;                  /* 0x00e3 */
        __u8    reservede4[4];          /* 0x00e4 */
-       __u64   tecmc;                  /* 0x00e8 */
-       __u8    reservedf0[12];         /* 0x00f0 */
+       union {
+               __u64   tecmc;          /* 0x00e8 */
+               struct {
+                       __u16   subchannel_id;  /* 0x00e8 */
+                       __u16   subchannel_nr;  /* 0x00ea */
+                       __u32   io_int_parm;    /* 0x00ec */
+                       __u32   io_int_word;    /* 0x00f0 */
+               };
+       } __packed;
+       __u8    reservedf4[8];          /* 0x00f4 */
 #define CRYCB_FORMAT_MASK 0x00000003
 #define CRYCB_FORMAT0 0x00000000
 #define CRYCB_FORMAT1 0x00000001
 #define CRYCB_FORMAT2 0x00000003
        __u32   crycbd;                 /* 0x00fc */
        __u64   gcr[16];                /* 0x0100 */
-       __u64   gbea;                   /* 0x0180 */
+       union {
+               __u64   gbea;           /* 0x0180 */
+               __u64   sidad;
+       };
        __u8    reserved188[8];         /* 0x0188 */
        __u64   sdnxo;                  /* 0x0190 */
        __u8    reserved198[8];         /* 0x0198 */
@@ -292,7 +344,7 @@ struct kvm_s390_sie_block {
        __u64   itdba;                  /* 0x01e8 */
        __u64   riccbd;                 /* 0x01f0 */
        __u64   gvrd;                   /* 0x01f8 */
-} __attribute__((packed));
+} __packed __aligned(512);
 
 struct kvm_s390_itdb {
        __u8    data[256];
@@ -301,7 +353,9 @@ struct kvm_s390_itdb {
 struct sie_page {
        struct kvm_s390_sie_block sie_block;
        struct mcck_volatile_info mcck_info;    /* 0x0200 */
-       __u8 reserved218[1000];         /* 0x0218 */
+       __u8 reserved218[360];          /* 0x0218 */
+       __u64 pv_grregs[16];            /* 0x0380 */
+       __u8 reserved400[512];          /* 0x0400 */
        struct kvm_s390_itdb itdb;      /* 0x0600 */
        __u8 reserved700[2304];         /* 0x0700 */
 };
@@ -476,6 +530,7 @@ enum irq_types {
        IRQ_PEND_PFAULT_INIT,
        IRQ_PEND_EXT_HOST,
        IRQ_PEND_EXT_SERVICE,
+       IRQ_PEND_EXT_SERVICE_EV,
        IRQ_PEND_EXT_TIMING,
        IRQ_PEND_EXT_CPU_TIMER,
        IRQ_PEND_EXT_CLOCK_COMP,
@@ -520,6 +575,7 @@ enum irq_types {
                           (1UL << IRQ_PEND_EXT_TIMING)     | \
                           (1UL << IRQ_PEND_EXT_HOST)       | \
                           (1UL << IRQ_PEND_EXT_SERVICE)    | \
+                          (1UL << IRQ_PEND_EXT_SERVICE_EV) | \
                           (1UL << IRQ_PEND_VIRTIO)         | \
                           (1UL << IRQ_PEND_PFAULT_INIT)    | \
                           (1UL << IRQ_PEND_PFAULT_DONE))
@@ -536,6 +592,13 @@ enum irq_types {
 #define IRQ_PEND_MCHK_MASK ((1UL << IRQ_PEND_MCHK_REP) | \
                            (1UL << IRQ_PEND_MCHK_EX))
 
+#define IRQ_PEND_EXT_II_MASK ((1UL << IRQ_PEND_EXT_CPU_TIMER)  | \
+                             (1UL << IRQ_PEND_EXT_CLOCK_COMP) | \
+                             (1UL << IRQ_PEND_EXT_EMERGENCY)  | \
+                             (1UL << IRQ_PEND_EXT_EXTERNAL)   | \
+                             (1UL << IRQ_PEND_EXT_SERVICE)    | \
+                             (1UL << IRQ_PEND_EXT_SERVICE_EV))
+
 struct kvm_s390_interrupt_info {
        struct list_head list;
        u64     type;
@@ -594,6 +657,7 @@ struct kvm_s390_local_interrupt {
 
 struct kvm_s390_float_interrupt {
        unsigned long pending_irqs;
+       unsigned long masked_irqs;
        spinlock_t lock;
        struct list_head lists[FIRQ_LIST_COUNT];
        int counters[FIRQ_MAX_COUNT];
@@ -645,6 +709,11 @@ struct kvm_guestdbg_info_arch {
        unsigned long last_bp;
 };
 
+struct kvm_s390_pv_vcpu {
+       u64 handle;
+       unsigned long stor_base;
+};
+
 struct kvm_vcpu_arch {
        struct kvm_s390_sie_block *sie_block;
        /* if vsie is active, currently executed shadow sie control block */
@@ -673,6 +742,7 @@ struct kvm_vcpu_arch {
        __u64 cputm_start;
        bool gs_enabled;
        bool skey_enabled;
+       struct kvm_s390_pv_vcpu pv;
 };
 
 struct kvm_vm_stat {
@@ -701,9 +771,6 @@ struct s390_io_adapter {
        bool masked;
        bool swap;
        bool suppressible;
-       struct rw_semaphore maps_lock;
-       struct list_head maps;
-       atomic_t nr_maps;
 };
 
 #define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
@@ -846,6 +913,13 @@ struct kvm_s390_gisa_interrupt {
        DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS);
 };
 
+struct kvm_s390_pv {
+       u64 handle;
+       u64 guest_len;
+       unsigned long stor_base;
+       void *stor_var;
+};
+
 struct kvm_arch{
        void *sca;
        int use_esca;
@@ -881,6 +955,7 @@ struct kvm_arch{
        DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
        DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
        struct kvm_s390_gisa_interrupt gisa_int;
+       struct kvm_s390_pv pv;
 };
 
 #define KVM_HVA_ERR_BAD                (-1UL)
@@ -921,7 +996,7 @@ static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
-               struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
+                                        struct kvm_memory_slot *slot) {}
 static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
index bcfb637..e21b618 100644 (file)
@@ -16,6 +16,8 @@ typedef struct {
        unsigned long asce;
        unsigned long asce_limit;
        unsigned long vdso_base;
+       /* The mmu context belongs to a secure guest. */
+       atomic_t is_protected;
        /*
         * The following bitfields need a down_write on the mm
         * semaphore when they are written to. As they are only
index 8d04e6f..afa8360 100644 (file)
@@ -23,6 +23,7 @@ static inline int init_new_context(struct task_struct *tsk,
        INIT_LIST_HEAD(&mm->context.gmap_list);
        cpumask_clear(&mm->context.cpu_attach_mask);
        atomic_set(&mm->context.flush_count, 0);
+       atomic_set(&mm->context.is_protected, 0);
        mm->context.gmap_asce = 0;
        mm->context.flush_mm = 0;
        mm->context.compat_mm = test_thread_flag(TIF_31BIT);
index 1019efd..62440a8 100644 (file)
@@ -153,6 +153,11 @@ static inline int devmem_is_allowed(unsigned long pfn)
 #define HAVE_ARCH_FREE_PAGE
 #define HAVE_ARCH_ALLOC_PAGE
 
+#if IS_ENABLED(CONFIG_PGSTE)
+int arch_make_page_accessible(struct page *page);
+#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #define __PAGE_OFFSET          0x0UL
index 137a392..cc7a1ad 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/atomic.h>
 #include <asm/bug.h>
 #include <asm/page.h>
+#include <asm/uv.h>
 
 extern pgd_t swapper_pg_dir[];
 extern void paging_init(void);
@@ -520,6 +521,15 @@ static inline int mm_has_pgste(struct mm_struct *mm)
        return 0;
 }
 
+static inline int mm_is_protected(struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+       if (unlikely(atomic_read(&mm->context.is_protected)))
+               return 1;
+#endif
+       return 0;
+}
+
 static inline int mm_alloc_pgste(struct mm_struct *mm)
 {
 #ifdef CONFIG_PGSTE
@@ -1061,7 +1071,12 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
                                       unsigned long addr, pte_t *ptep)
 {
-       return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
+       pte_t res;
+
+       res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
+       if (mm_is_protected(mm) && pte_present(res))
+               uv_convert_from_secure(pte_val(res) & PAGE_MASK);
+       return res;
 }
 
 #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@ -1073,7 +1088,12 @@ void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
 static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
                                     unsigned long addr, pte_t *ptep)
 {
-       return ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID));
+       pte_t res;
+
+       res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID));
+       if (mm_is_protected(vma->vm_mm) && pte_present(res))
+               uv_convert_from_secure(pte_val(res) & PAGE_MASK);
+       return res;
 }
 
 /*
@@ -1088,12 +1108,17 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long addr,
                                            pte_t *ptep, int full)
 {
+       pte_t res;
+
        if (full) {
-               pte_t pte = *ptep;
+               res = *ptep;
                *ptep = __pte(_PAGE_INVALID);
-               return pte;
+       } else {
+               res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
        }
-       return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
+       if (mm_is_protected(mm) && pte_present(res))
+               uv_convert_from_secure(pte_val(res) & PAGE_MASK);
+       return res;
 }
 
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
index 4093a28..cff4b4c 100644 (file)
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/bug.h>
+#include <linux/sched.h>
 #include <asm/page.h>
+#include <asm/gmap.h>
 
 #define UVC_RC_EXECUTED                0x0001
 #define UVC_RC_INV_CMD         0x0002
 #define UVC_RC_INV_STATE       0x0003
 #define UVC_RC_INV_LEN         0x0005
 #define UVC_RC_NO_RESUME       0x0007
+#define UVC_RC_NEED_DESTROY    0x8000
 
 #define UVC_CMD_QUI                    0x0001
+#define UVC_CMD_INIT_UV                        0x000f
+#define UVC_CMD_CREATE_SEC_CONF                0x0100
+#define UVC_CMD_DESTROY_SEC_CONF       0x0101
+#define UVC_CMD_CREATE_SEC_CPU         0x0120
+#define UVC_CMD_DESTROY_SEC_CPU                0x0121
+#define UVC_CMD_CONV_TO_SEC_STOR       0x0200
+#define UVC_CMD_CONV_FROM_SEC_STOR     0x0201
+#define UVC_CMD_SET_SEC_CONF_PARAMS    0x0300
+#define UVC_CMD_UNPACK_IMG             0x0301
+#define UVC_CMD_VERIFY_IMG             0x0302
+#define UVC_CMD_CPU_RESET              0x0310
+#define UVC_CMD_CPU_RESET_INITIAL      0x0311
+#define UVC_CMD_PREPARE_RESET          0x0320
+#define UVC_CMD_CPU_RESET_CLEAR                0x0321
+#define UVC_CMD_CPU_SET_STATE          0x0330
+#define UVC_CMD_SET_UNSHARE_ALL                0x0340
+#define UVC_CMD_PIN_PAGE_SHARED                0x0341
+#define UVC_CMD_UNPIN_PAGE_SHARED      0x0342
 #define UVC_CMD_SET_SHARED_ACCESS      0x1000
 #define UVC_CMD_REMOVE_SHARED_ACCESS   0x1001
 
 /* Bits in installed uv calls */
 enum uv_cmds_inst {
        BIT_UVC_CMD_QUI = 0,
+       BIT_UVC_CMD_INIT_UV = 1,
+       BIT_UVC_CMD_CREATE_SEC_CONF = 2,
+       BIT_UVC_CMD_DESTROY_SEC_CONF = 3,
+       BIT_UVC_CMD_CREATE_SEC_CPU = 4,
+       BIT_UVC_CMD_DESTROY_SEC_CPU = 5,
+       BIT_UVC_CMD_CONV_TO_SEC_STOR = 6,
+       BIT_UVC_CMD_CONV_FROM_SEC_STOR = 7,
        BIT_UVC_CMD_SET_SHARED_ACCESS = 8,
        BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9,
+       BIT_UVC_CMD_SET_SEC_PARMS = 11,
+       BIT_UVC_CMD_UNPACK_IMG = 13,
+       BIT_UVC_CMD_VERIFY_IMG = 14,
+       BIT_UVC_CMD_CPU_RESET = 15,
+       BIT_UVC_CMD_CPU_RESET_INITIAL = 16,
+       BIT_UVC_CMD_CPU_SET_STATE = 17,
+       BIT_UVC_CMD_PREPARE_RESET = 18,
+       BIT_UVC_CMD_CPU_PERFORM_CLEAR_RESET = 19,
+       BIT_UVC_CMD_UNSHARE_ALL = 20,
+       BIT_UVC_CMD_PIN_PAGE_SHARED = 21,
+       BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
 };
 
 struct uv_cb_header {
@@ -40,13 +79,127 @@ struct uv_cb_header {
        u16 rrc;        /* Return Reason Code */
 } __packed __aligned(8);
 
+/* Query Ultravisor Information */
 struct uv_cb_qui {
        struct uv_cb_header header;
        u64 reserved08;
        u64 inst_calls_list[4];
-       u64 reserved30[15];
+       u64 reserved30[2];
+       u64 uv_base_stor_len;
+       u64 reserved48;
+       u64 conf_base_phys_stor_len;
+       u64 conf_base_virt_stor_len;
+       u64 conf_virt_var_stor_len;
+       u64 cpu_stor_len;
+       u32 reserved70[3];
+       u32 max_num_sec_conf;
+       u64 max_guest_stor_addr;
+       u8  reserved88[158 - 136];
+       u16 max_guest_cpus;
+       u8  reserveda0[200 - 160];
 } __packed __aligned(8);
 
+/* Initialize Ultravisor */
+struct uv_cb_init {
+       struct uv_cb_header header;
+       u64 reserved08[2];
+       u64 stor_origin;
+       u64 stor_len;
+       u64 reserved28[4];
+} __packed __aligned(8);
+
+/* Create Guest Configuration */
+struct uv_cb_cgc {
+       struct uv_cb_header header;
+       u64 reserved08[2];
+       u64 guest_handle;
+       u64 conf_base_stor_origin;
+       u64 conf_virt_stor_origin;
+       u64 reserved30;
+       u64 guest_stor_origin;
+       u64 guest_stor_len;
+       u64 guest_sca;
+       u64 guest_asce;
+       u64 reserved58[5];
+} __packed __aligned(8);
+
+/* Create Secure CPU */
+struct uv_cb_csc {
+       struct uv_cb_header header;
+       u64 reserved08[2];
+       u64 cpu_handle;
+       u64 guest_handle;
+       u64 stor_origin;
+       u8  reserved30[6];
+       u16 num;
+       u64 state_origin;
+       u64 reserved40[4];
+} __packed __aligned(8);
+
+/* Convert to Secure */
+struct uv_cb_cts {
+       struct uv_cb_header header;
+       u64 reserved08[2];
+       u64 guest_handle;
+       u64 gaddr;
+} __packed __aligned(8);
+
+/* Convert from Secure / Pin Page Shared */
+struct uv_cb_cfs {
+       struct uv_cb_header header;
+       u64 reserved08[2];
+       u64 paddr;
+} __packed __aligned(8);
+
+/* Set Secure Config Parameter */
+struct uv_cb_ssc {
+       struct uv_cb_header header;
+       u64 reserved08[2];
+       u64 guest_handle;
+       u64 sec_header_origin;
+       u32 sec_header_len;
+       u32 reserved2c;
+       u64 reserved30[4];
+} __packed __aligned(8);
+
+/* Unpack */
+struct uv_cb_unp {
+       struct uv_cb_header header;
+       u64 reserved08[2];
+       u64 guest_handle;
+       u64 gaddr;
+       u64 tweak[2];
+       u64 reserved38[3];
+} __packed __aligned(8);
+
+#define PV_CPU_STATE_OPR       1
+#define PV_CPU_STATE_STP       2
+#define PV_CPU_STATE_CHKSTP    3
+#define PV_CPU_STATE_OPR_LOAD  5
+
+/*
+ * Control block carrying a CPU handle and a one-byte state value
+ * (presumably one of the PV_CPU_STATE_* values -- confirm against callers).
+ * Packed and 8-byte aligned like every other UV control block in this header.
+ */
+struct uv_cb_cpu_set_state {
+       struct uv_cb_header header;
+       u64 reserved08[2];
+       u64 cpu_handle;
+       u8  reserved20[7];
+       u8  state;
+       u64 reserved28[5];
+} __packed __aligned(8);
+
+/*
+ * A common UV call struct for calls that take no payload
+ * Examples:
+ * Destroy cpu/config
+ * Verify
+ */
+struct uv_cb_nodata {
+       struct uv_cb_header header;
+       u64 reserved08[2];
+       u64 handle;
+       u64 reserved20[4];
+} __packed __aligned(8);
+
+/* Set Shared Access */
 struct uv_cb_share {
        struct uv_cb_header header;
        u64 reserved08[3];
@@ -54,21 +207,76 @@ struct uv_cb_share {
        u64 reserved28;
 } __packed __aligned(8);
 
-static inline int uv_call(unsigned long r1, unsigned long r2)
+static inline int __uv_call(unsigned long r1, unsigned long r2)
 {
        int cc;
 
        asm volatile(
-               "0:     .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n"
-               "               brc     3,0b\n"
-               "               ipm     %[cc]\n"
-               "               srl     %[cc],28\n"
+               "       .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n"
+               "       ipm     %[cc]\n"
+               "       srl     %[cc],28\n"
                : [cc] "=d" (cc)
                : [r1] "a" (r1), [r2] "a" (r2)
                : "memory", "cc");
        return cc;
 }
 
+/*
+ * Issue a UV call and re-issue it for as long as the returned condition
+ * code is greater than 1. Returns the final condition code (0 or 1).
+ */
+static inline int uv_call(unsigned long r1, unsigned long r2)
+{
+       int cond = __uv_call(r1, r2);
+
+       while (cond > 1)
+               cond = __uv_call(r1, r2);
+       return cond;
+}
+
+/*
+ * Low level uv_call that avoids stalls for long running busy conditions:
+ * cond_resched() is invoked after every attempt, and the call is repeated
+ * while the condition code is greater than 1.
+ */
+static inline int uv_call_sched(unsigned long r1, unsigned long r2)
+{
+       int cond;
+
+       for (;;) {
+               cond = __uv_call(r1, r2);
+               cond_resched();
+               if (cond <= 1)
+                       break;
+       }
+       return cond;
+}
+
+/*
+ * Variant of uv_call for commands whose only payload is a cpu or guest
+ * handle, like destroy or verify.
+ *
+ * The return code and return reason code of the control block are stored
+ * to *rc and *rrc in all cases. A zero @handle triggers a WARN but the
+ * call is still issued.
+ *
+ * Returns 0 if the call completed with condition code 0, -EINVAL otherwise.
+ */
+static inline int uv_cmd_nodata(u64 handle, u16 cmd, u16 *rc, u16 *rrc)
+{
+       struct uv_cb_nodata cb = {
+               .header.cmd = cmd,
+               .header.len = sizeof(cb),
+               .handle = handle,
+       };
+       int cond;
+
+       WARN(!handle, "No handle provided to Ultravisor call cmd %x\n", cmd);
+       cond = uv_call_sched(0, (u64)&cb);
+       *rc = cb.header.rc;
+       *rrc = cb.header.rrc;
+       if (cond)
+               return -EINVAL;
+       return 0;
+}
+
+struct uv_info {
+       unsigned long inst_calls_list[4];
+       unsigned long uv_base_stor_len;
+       unsigned long guest_base_stor_len;
+       unsigned long guest_virt_base_stor_len;
+       unsigned long guest_virt_var_stor_len;
+       unsigned long guest_cpu_stor_len;
+       unsigned long max_sec_stor_addr;
+       unsigned int max_num_sec_conf;
+       unsigned short max_guest_cpus;
+};
+
+extern struct uv_info uv_info;
+
 #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
 extern int prot_virt_guest;
 
@@ -121,11 +329,40 @@ static inline int uv_remove_shared(unsigned long addr)
        return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS);
 }
 
-void uv_query_info(void);
 #else
 #define is_prot_virt_guest() 0
 static inline int uv_set_shared(unsigned long addr) { return 0; }
 static inline int uv_remove_shared(unsigned long addr) { return 0; }
+#endif
+
+#if IS_ENABLED(CONFIG_KVM)
+extern int prot_virt_host;
+
+static inline int is_prot_virt_host(void)
+{
+       return prot_virt_host;
+}
+
+int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
+int uv_convert_from_secure(unsigned long paddr);
+int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr);
+
+void setup_uv(void);
+void adjust_to_uv_max(unsigned long *vmax);
+#else
+#define is_prot_virt_host() 0
+static inline void setup_uv(void) {}
+static inline void adjust_to_uv_max(unsigned long *vmax) {}
+
+static inline int uv_convert_from_secure(unsigned long paddr)
+{
+       return 0;
+}
+#endif
+
+#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM)
+void uv_query_info(void);
+#else
 static inline void uv_query_info(void) {}
 #endif
 
index 2b1203c..22bfb8d 100644 (file)
@@ -78,6 +78,7 @@ obj-$(CONFIG_PERF_EVENTS)     += perf_cpum_cf_events.o perf_regs.o
 obj-$(CONFIG_PERF_EVENTS)      += perf_cpum_cf_diag.o
 
 obj-$(CONFIG_TRACEPOINTS)      += trace.o
+obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))  += uv.o
 
 # vdso
 obj-y                          += vdso64/
index 1d3927e..faca269 100644 (file)
@@ -24,6 +24,8 @@ asmlinkage void do_syscall_trace_exit(struct pt_regs *regs);
 
 void do_protection_exception(struct pt_regs *regs);
 void do_dat_exception(struct pt_regs *regs);
+void do_secure_storage_access(struct pt_regs *regs);
+void do_non_secure_storage_access(struct pt_regs *regs);
 
 void addressing_exception(struct pt_regs *regs);
 void data_exception(struct pt_regs *regs);
index eee3a48..2c27907 100644 (file)
@@ -78,8 +78,8 @@ PGM_CHECK(do_dat_exception)           /* 39 */
 PGM_CHECK(do_dat_exception)            /* 3a */
 PGM_CHECK(do_dat_exception)            /* 3b */
 PGM_CHECK_DEFAULT                      /* 3c */
-PGM_CHECK_DEFAULT                      /* 3d */
-PGM_CHECK_DEFAULT                      /* 3e */
+PGM_CHECK(do_secure_storage_access)    /* 3d */
+PGM_CHECK(do_non_secure_storage_access)        /* 3e */
 PGM_CHECK_DEFAULT                      /* 3f */
 PGM_CHECK(monitor_event_exception)     /* 40 */
 PGM_CHECK_DEFAULT                      /* 41 */
index b2c2f75..1423090 100644 (file)
@@ -92,10 +92,6 @@ char elf_platform[ELF_PLATFORM_SIZE];
 
 unsigned long int_hwcap = 0;
 
-#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
-int __bootdata_preserved(prot_virt_guest);
-#endif
-
 int __bootdata(noexec_disabled);
 int __bootdata(memory_end_set);
 unsigned long __bootdata(memory_end);
@@ -564,6 +560,9 @@ static void __init setup_memory_end(void)
                        vmax = _REGION1_SIZE; /* 4-level kernel page table */
        }
 
+       if (is_prot_virt_host())
+               adjust_to_uv_max(&vmax);
+
        /* module area is at the end of the kernel address space. */
        MODULES_END = vmax;
        MODULES_VADDR = MODULES_END - MODULES_LEN;
@@ -1138,6 +1137,8 @@ void __init setup_arch(char **cmdline_p)
         */
        memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT));
 
+       if (is_prot_virt_host())
+               setup_uv();
        setup_memory_end();
        setup_memory();
        dma_contiguous_reserve(memory_end);
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
new file mode 100644 (file)
index 0000000..c86d654
--- /dev/null
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Ultravisor functions and initialization
+ *
+ * Copyright IBM Corp. 2019, 2020
+ */
+#define KMSG_COMPONENT "prot_virt"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/sizes.h>
+#include <linux/bitmap.h>
+#include <linux/memblock.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <asm/facility.h>
+#include <asm/sections.h>
+#include <asm/uv.h>
+
+/* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
+int __bootdata_preserved(prot_virt_guest);
+#endif
+
+#if IS_ENABLED(CONFIG_KVM)
+int prot_virt_host;
+EXPORT_SYMBOL(prot_virt_host);
+struct uv_info __bootdata_preserved(uv_info);
+EXPORT_SYMBOL(uv_info);
+
+/*
+ * Handler for the "prot_virt" early parameter.
+ *
+ * A value that kstrtobool() accepts as true turns on prot_virt_host. The
+ * flag is cleared again, with a warning, when the kernel itself runs as a
+ * protected guest or when facility 158 is not available.
+ *
+ * Returns the result of kstrtobool(): 0 on success, a negative error code
+ * if the value could not be parsed.
+ */
+static int __init prot_virt_setup(char *val)
+{
+       bool enabled;
+       int rc;
+
+       rc = kstrtobool(val, &enabled);
+       if (!rc && enabled)
+               prot_virt_host = 1;
+
+       if (is_prot_virt_guest() && prot_virt_host) {
+               prot_virt_host = 0;
+               /* terminate the message so it is not merged with the next one */
+               pr_warn("Protected virtualization not available in protected guests.\n");
+       }
+
+       if (prot_virt_host && !test_facility(158)) {
+               prot_virt_host = 0;
+               pr_warn("Protected virtualization not supported by the hardware.\n");
+       }
+
+       return rc;
+}
+early_param("prot_virt", prot_virt_setup);
+
+/*
+ * Issue the UVC_CMD_INIT_UV call, passing @stor_base and @stor_len as the
+ * storage origin and length in the control block.
+ *
+ * Returns 0 on success, -1 after logging rc and rrc if the call failed.
+ */
+static int __init uv_init(unsigned long stor_base, unsigned long stor_len)
+{
+       struct uv_cb_init uvcb = {
+               .header.cmd = UVC_CMD_INIT_UV,
+               .header.len = sizeof(uvcb),
+               .stor_origin = stor_base,
+               .stor_len = stor_len,
+       };
+
+       if (uv_call(0, (uint64_t)&uvcb)) {
+               /* both codes are printed as 0x-prefixed hex */
+               pr_err("Ultravisor init failed with rc: 0x%x rrc: 0x%x\n",
+                      uvcb.header.rc, uvcb.header.rrc);
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Allocate the Ultravisor base storage from memblock and initialize the
+ * Ultravisor with it.
+ *
+ * The area is uv_info.uv_base_stor_len bytes large, aligned to 1 MB and
+ * requested with a minimum address of 2 GB. If either the allocation or
+ * the init call fails, the storage is given back (where it was allocated)
+ * and prot_virt_host is cleared.
+ */
+void __init setup_uv(void)
+{
+       unsigned long uv_stor_base;
+
+       uv_stor_base = (unsigned long)memblock_alloc_try_nid(
+               uv_info.uv_base_stor_len, SZ_1M, SZ_2G,
+               MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
+       if (!uv_stor_base) {
+               pr_warn("Failed to reserve %lu bytes for ultravisor base storage\n",
+                       uv_info.uv_base_stor_len);
+               goto fail;
+       }
+
+       if (uv_init(uv_stor_base, uv_info.uv_base_stor_len)) {
+               memblock_free(uv_stor_base, uv_info.uv_base_stor_len);
+               goto fail;
+       }
+
+       pr_info("Reserving %luMB as ultravisor base storage\n",
+               uv_info.uv_base_stor_len >> 20);
+       return;
+fail:
+       /* terminate the message so it is not merged with the next one */
+       pr_info("Disabling support for protected virtualization\n");
+       prot_virt_host = 0;
+}
+
+/*
+ * Clamp *vmax to uv_info.max_sec_stor_addr.
+ *
+ * A value of 0 (e.g. uv_info was never filled in) is not a usable limit:
+ * applying it would shrink *vmax to 0, so *vmax is left untouched then.
+ */
+void adjust_to_uv_max(unsigned long *vmax)
+{
+       if (uv_info.max_sec_stor_addr)
+               *vmax = min_t(unsigned long, *vmax, uv_info.max_sec_stor_addr);
+}
+
+/*
+ * Ask the Ultravisor to pin the page at @paddr in the shared state. An
+ * attempt of the guest to unshare a pinned page will cause an intercept.
+ *
+ * Returns 0 on success, -EINVAL if the UV call failed.
+ */
+static int uv_pin_shared(unsigned long paddr)
+{
+       struct uv_cb_cfs pin_cb = {
+               .header.cmd = UVC_CMD_PIN_PAGE_SHARED,
+               .header.len = sizeof(pin_cb),
+               .paddr = paddr,
+       };
+       int cond = uv_call(0, (u64)&pin_cb);
+
+       return cond ? -EINVAL : 0;
+}
+
+/*
+ * Ask the Ultravisor to encrypt a guest page and make it accessible to
+ * the host for paging (export).
+ *
+ * @paddr: Absolute host address of page to be exported
+ *
+ * Returns 0 on success, -EINVAL if the UV call failed.
+ */
+int uv_convert_from_secure(unsigned long paddr)
+{
+       struct uv_cb_cfs export_cb = {
+               .header.cmd = UVC_CMD_CONV_FROM_SEC_STOR,
+               .header.len = sizeof(export_cb),
+               .paddr = paddr
+       };
+       int cond = uv_call(0, (u64)&export_cb);
+
+       return cond ? -EINVAL : 0;
+}
+
+/*
+ * Compute the reference count a page is expected to have when nobody
+ * holds an additional pin on it. Modelled on similar helpers elsewhere in
+ * the kernel, simplified because a secure page can never be a huge page.
+ */
+static int expected_page_refs(struct page *page)
+{
+       int refs = page_mapcount(page);
+
+       if (PageSwapCache(page))
+               return refs + 1;
+       if (!page_mapping(page))
+               return refs;
+       /* +1 for the mapping, +1 more when page_has_private() */
+       return refs + (page_has_private(page) ? 2 : 1);
+}
+
+/*
+ * Issue the UV call described by @uvcb for the page mapped by @ptep.
+ *
+ * The page reference count is frozen at its expected value around the
+ * call and PG_arch_1 is set before the call is issued.
+ *
+ * Returns 0 on success, -ENXIO if the pte is not present/valid, maps a
+ * page other than @exp_page, or the UV call reported rc 0x10a, -EAGAIN if
+ * the page is under writeback, -EBUSY if the reference count could not be
+ * frozen, and -EINVAL for any other UV call failure.
+ */
+static int make_secure_pte(pte_t *ptep, unsigned long addr,
+                          struct page *exp_page, struct uv_cb_header *uvcb)
+{
+       pte_t pte = READ_ONCE(*ptep);
+       struct page *page;
+       int refs, cond;
+
+       if (!pte_present(pte) || (pte_val(pte) & _PAGE_INVALID))
+               return -ENXIO;
+
+       page = pte_page(pte);
+       if (page != exp_page)
+               return -ENXIO;
+       if (PageWriteback(page))
+               return -EAGAIN;
+
+       refs = expected_page_refs(page);
+       if (!page_ref_freeze(page, refs))
+               return -EBUSY;
+       set_bit(PG_arch_1, &page->flags);
+       cond = uv_call(0, (u64)uvcb);
+       page_ref_unfreeze(page, refs);
+       if (!cond)
+               return 0;
+       /* Return -ENXIO if the page was not mapped, -EINVAL otherwise */
+       return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
+}
+
+/*
+ * Requests the Ultravisor to make a page accessible to a guest.
+ * If it's brought in the first time, it will be cleared. If
+ * it has been exported before, it will be decrypted and integrity
+ * checked.
+ *
+ * @gmap:  guest address space that @gaddr belongs to
+ * @gaddr: guest address of the page
+ * @uvcb:  prepared UV control block, handed on to the UV call
+ *
+ * Returns 0 on success, -EAGAIN if the caller should retry, -EFAULT if
+ * the address cannot be translated, lies in no VMA or in a hugetlb VMA,
+ * or cannot be faulted in, and -EINVAL if the UV call itself failed.
+ */
+int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
+{
+       struct vm_area_struct *vma;
+       bool local_drain = false;
+       spinlock_t *ptelock;
+       unsigned long uaddr;
+       struct page *page;
+       pte_t *ptep;
+       int rc;
+
+again:
+       rc = -EFAULT;
+       down_read(&gmap->mm->mmap_sem);
+
+       uaddr = __gmap_translate(gmap, gaddr);
+       if (IS_ERR_VALUE(uaddr))
+               goto out;
+       vma = find_vma(gmap->mm, uaddr);
+       if (!vma)
+               goto out;
+       /*
+        * Secure pages cannot be huge and userspace should not combine both.
+        * In case userspace does it anyway this will result in an -EFAULT for
+        * the unpack. The guest is thus never reaching secure mode. If
+        * userspace is playing dirty tricks with mapping huge pages later
+        * on this will result in a segmentation fault.
+        */
+       if (is_vm_hugetlb_page(vma))
+               goto out;
+
+       rc = -ENXIO;
+       page = follow_page(vma, uaddr, FOLL_WRITE);
+       if (IS_ERR_OR_NULL(page))
+               goto out;
+
+       lock_page(page);
+       ptep = get_locked_pte(gmap->mm, uaddr, &ptelock);
+       /*
+        * get_locked_pte() may fail and return NULL, in which case no pte
+        * lock is held either. Keep rc == -ENXIO then, so that the page is
+        * faulted in below and the caller retries.
+        */
+       if (ptep) {
+               rc = make_secure_pte(ptep, uaddr, page, uvcb);
+               pte_unmap_unlock(ptep, ptelock);
+       }
+       unlock_page(page);
+out:
+       up_read(&gmap->mm->mmap_sem);
+
+       if (rc == -EAGAIN) {
+               wait_on_page_writeback(page);
+       } else if (rc == -EBUSY) {
+               /*
+                * If we have tried a local drain and the page refcount
+                * still does not match our expected safe value, try with a
+                * system wide drain. This is needed if the pagevecs holding
+                * the page are on a different CPU.
+                */
+               if (local_drain) {
+                       lru_add_drain_all();
+                       /* We give up here, and let the caller try again */
+                       return -EAGAIN;
+               }
+               /*
+                * We are here if the page refcount does not match the
+                * expected safe value. The main culprits are usually
+                * pagevecs. With lru_add_drain() we drain the pagevecs
+                * on the local CPU so that hopefully the refcount will
+                * reach the expected safe value.
+                */
+               lru_add_drain();
+               local_drain = true;
+               /* And now we try again immediately after draining */
+               goto again;
+       } else if (rc == -ENXIO) {
+               if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE))
+                       return -EFAULT;
+               return -EAGAIN;
+       }
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_make_secure);
+
+/*
+ * Build a UVC_CMD_CONV_TO_SEC_STOR control block for @gaddr in the guest
+ * identified by gmap->guest_handle and run it through gmap_make_secure().
+ * Returns whatever gmap_make_secure() returns.
+ */
+int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
+{
+       struct uv_cb_cts cts = {
+               .header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
+               .header.len = sizeof(cts),
+               .guest_handle = gmap->guest_handle,
+               .gaddr = gaddr,
+       };
+       int rc = gmap_make_secure(gmap, gaddr, &cts);
+
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_convert_to_secure);
+
+/*
+ * To be called with the page locked or with an extra reference! This will
+ * prevent gmap_make_secure from touching the page concurrently. Having 2
+ * parallel make_page_accessible is fine, as the UV calls will become a
+ * no-op if the page is already exported.
+ *
+ * Returns 0 if the page is (or already was) accessible, otherwise the
+ * error of the failed export.
+ */
+int arch_make_page_accessible(struct page *page)
+{
+       int rc;
+
+       /* Hugepage cannot be protected, so nothing to do */
+       if (PageHuge(page))
+               return 0;
+
+       /*
+        * PG_arch_1 is used in 3 places:
+        * 1. for kernel page tables during early boot
+        * 2. for storage keys of huge pages and KVM
+        * 3. As an indication that this page might be secure. This can
+        *    overindicate, e.g. we set the bit before calling
+        *    convert_to_secure.
+        * As secure pages are never huge, all 3 variants can co-exist.
+        */
+       if (!test_bit(PG_arch_1, &page->flags))
+               return 0;
+
+       /* Try pinning the page as shared first, fall back to an export */
+       rc = uv_pin_shared(page_to_phys(page));
+       if (rc)
+               rc = uv_convert_from_secure(page_to_phys(page));
+       if (rc)
+               return rc;
+
+       clear_bit(PG_arch_1, &page->flags);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(arch_make_page_accessible);
+
+#endif
+
+#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM)
+/*
+ * sysfs show routine: prints the four words of uv_info.inst_calls_list,
+ * one hexadecimal value per line.
+ */
+static ssize_t uv_query_facilities(struct kobject *kobj,
+                                  struct kobj_attribute *attr, char *page)
+{
+       const unsigned long *calls = uv_info.inst_calls_list;
+
+       return snprintf(page, PAGE_SIZE, "%lx\n%lx\n%lx\n%lx\n",
+                       calls[0], calls[1], calls[2], calls[3]);
+}
+
+static struct kobj_attribute uv_query_facilities_attr =
+       __ATTR(facilities, 0444, uv_query_facilities, NULL);
+
+/* sysfs show routine: prints uv_info.max_guest_cpus as a decimal number */
+static ssize_t uv_query_max_guest_cpus(struct kobject *kobj,
+                                      struct kobj_attribute *attr, char *page)
+{
+       unsigned short max_cpus = uv_info.max_guest_cpus;
+
+       return snprintf(page, PAGE_SIZE, "%d\n", max_cpus);
+}
+
+static struct kobj_attribute uv_query_max_guest_cpus_attr =
+       __ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL);
+
+/*
+ * sysfs show routine: prints uv_info.max_num_sec_conf as a decimal number.
+ * The field is an unsigned int, hence %u rather than %d.
+ */
+static ssize_t uv_query_max_guest_vms(struct kobject *kobj,
+                                     struct kobj_attribute *attr, char *page)
+{
+       return snprintf(page, PAGE_SIZE, "%u\n",
+                       uv_info.max_num_sec_conf);
+}
+
+static struct kobj_attribute uv_query_max_guest_vms_attr =
+       __ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL);
+
+/* sysfs show routine: prints uv_info.max_sec_stor_addr in hexadecimal */
+static ssize_t uv_query_max_guest_addr(struct kobject *kobj,
+                                      struct kobj_attribute *attr, char *page)
+{
+       unsigned long max_addr = uv_info.max_sec_stor_addr;
+
+       return snprintf(page, PAGE_SIZE, "%lx\n", max_addr);
+}
+
+static struct kobj_attribute uv_query_max_guest_addr_attr =
+       __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL);
+
+static struct attribute *uv_query_attrs[] = {
+       &uv_query_facilities_attr.attr,
+       &uv_query_max_guest_cpus_attr.attr,
+       &uv_query_max_guest_vms_attr.attr,
+       &uv_query_max_guest_addr_attr.attr,
+       NULL,
+};
+
+static struct attribute_group uv_query_attr_group = {
+       .attrs = uv_query_attrs,
+};
+
+static struct kset *uv_query_kset;
+static struct kobject *uv_kobj;
+
+/*
+ * Create the "uv" kobject below firmware_kobj, a "query" kset inside it,
+ * and populate the kset with the uv_query attribute group. Does nothing
+ * when facility 158 is not available.
+ *
+ * Returns 0 on success or when there is nothing to do, -ENOMEM if a
+ * kobject or kset could not be created, or the sysfs_create_group() error.
+ */
+static int __init uv_info_init(void)
+{
+       int rc;
+
+       if (!test_facility(158))
+               return 0;
+
+       uv_kobj = kobject_create_and_add("uv", firmware_kobj);
+       if (!uv_kobj)
+               return -ENOMEM;
+
+       uv_query_kset = kset_create_and_add("query", NULL, uv_kobj);
+       if (!uv_query_kset) {
+               rc = -ENOMEM;
+               goto out_kobj;
+       }
+
+       rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group);
+       if (rc) {
+               kset_unregister(uv_query_kset);
+               goto out_kobj;
+       }
+       return 0;
+
+out_kobj:
+       kobject_del(uv_kobj);
+       kobject_put(uv_kobj);
+       return rc;
+}
+device_initcall(uv_info_init);
+#endif
index 05ee90a..12decca 100644 (file)
@@ -9,6 +9,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqch
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
 kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o vsie.o
+kvm-objs += diag.o gaccess.o guestdbg.o vsie.o pv.o
 
 obj-$(CONFIG_KVM) += kvm.o
index 3fb54ec..563429d 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * handling diagnose instructions
  *
- * Copyright IBM Corp. 2008, 2011
+ * Copyright IBM Corp. 2008, 2020
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
@@ -201,6 +201,10 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
                return -EOPNOTSUPP;
        }
 
+       /*
+        * no need to check the return value of vcpu_stop as it can only have
+        * an error for protvirt, but protvirt means user cpu state
+        */
        if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm))
                kvm_s390_vcpu_stop(vcpu);
        vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM;
index 07d30ff..47a67a9 100644 (file)
@@ -505,7 +505,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
                switch (prot) {
                case PROT_TYPE_IEP:
                        tec->b61 = 1;
-                       /* FALL THROUGH */
+                       fallthrough;
                case PROT_TYPE_LA:
                        tec->b56 = 1;
                        break;
@@ -514,12 +514,12 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
                        break;
                case PROT_TYPE_ALC:
                        tec->b60 = 1;
-                       /* FALL THROUGH */
+                       fallthrough;
                case PROT_TYPE_DAT:
                        tec->b61 = 1;
                        break;
                }
-               /* FALL THROUGH */
+               fallthrough;
        case PGM_ASCE_TYPE:
        case PGM_PAGE_TRANSLATION:
        case PGM_REGION_FIRST_TRANS:
@@ -534,7 +534,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
                tec->addr = gva >> PAGE_SHIFT;
                tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
                tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
-               /* FALL THROUGH */
+               fallthrough;
        case PGM_ALEN_TRANSLATION:
        case PGM_ALE_SEQUENCE:
        case PGM_ASTE_VALIDITY:
@@ -677,7 +677,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
                        dat_protection |= rfte.p;
                ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8;
        }
-               /* fallthrough */
+               fallthrough;
        case ASCE_TYPE_REGION2: {
                union region2_table_entry rste;
 
@@ -695,7 +695,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
                        dat_protection |= rste.p;
                ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8;
        }
-               /* fallthrough */
+               fallthrough;
        case ASCE_TYPE_REGION3: {
                union region3_table_entry rtte;
 
@@ -723,7 +723,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
                        dat_protection |= rtte.fc0.p;
                ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8;
        }
-               /* fallthrough */
+               fallthrough;
        case ASCE_TYPE_SEGMENT: {
                union segment_table_entry ste;
 
@@ -1050,7 +1050,8 @@ shadow_r2t:
                rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
                if (rc)
                        return rc;
-       } /* fallthrough */
+       }
+               fallthrough;
        case ASCE_TYPE_REGION2: {
                union region2_table_entry rste;
 
@@ -1076,7 +1077,8 @@ shadow_r3t:
                rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
                if (rc)
                        return rc;
-       } /* fallthrough */
+       }
+               fallthrough;
        case ASCE_TYPE_REGION3: {
                union region3_table_entry rtte;
 
@@ -1111,7 +1113,8 @@ shadow_sgt:
                rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
                if (rc)
                        return rc;
-       } /* fallthrough */
+       }
+               fallthrough;
        case ASCE_TYPE_SEGMENT: {
                union segment_table_entry ste;
 
index a389fa8..e7a7c49 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * in-kernel handling for sie intercepts
  *
- * Copyright IBM Corp. 2008, 2014
+ * Copyright IBM Corp. 2008, 2020
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 
-#include <asm/kvm_host.h>
 #include <asm/asm-offsets.h>
 #include <asm/irq.h>
 #include <asm/sysinfo.h>
+#include <asm/uv.h>
 
 #include "kvm-s390.h"
 #include "gaccess.h"
@@ -79,6 +79,10 @@ static int handle_stop(struct kvm_vcpu *vcpu)
                        return rc;
        }
 
+       /*
+        * no need to check the return value of vcpu_stop as it can only have
+        * an error for protvirt, but protvirt means user cpu state
+        */
        if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm))
                kvm_s390_vcpu_stop(vcpu);
        return -EOPNOTSUPP;
@@ -231,6 +235,13 @@ static int handle_prog(struct kvm_vcpu *vcpu)
 
        vcpu->stat.exit_program_interruption++;
 
+       /*
+        * Intercept 8 indicates a loop of specification exceptions
+        * for protected guests.
+        */
+       if (kvm_s390_pv_cpu_is_protected(vcpu))
+               return -EOPNOTSUPP;
+
        if (guestdbg_enabled(vcpu) && per_event(vcpu)) {
                rc = kvm_s390_handle_per_event(vcpu);
                if (rc)
@@ -384,7 +395,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
                goto out;
        }
 
-       if (addr & ~PAGE_MASK)
+       if (!kvm_s390_pv_cpu_is_protected(vcpu) && (addr & ~PAGE_MASK))
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
        sctns = (void *)get_zeroed_page(GFP_KERNEL);
@@ -395,10 +406,15 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
 
 out:
        if (!cc) {
-               r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
-               if (r) {
-                       free_page((unsigned long)sctns);
-                       return kvm_s390_inject_prog_cond(vcpu, r);
+               if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+                       memcpy((void *)(sida_origin(vcpu->arch.sie_block)),
+                              sctns, PAGE_SIZE);
+               } else {
+                       r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
+                       if (r) {
+                               free_page((unsigned long)sctns);
+                               return kvm_s390_inject_prog_cond(vcpu, r);
+                       }
                }
        }
 
@@ -444,6 +460,77 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
        return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
 }
 
+/*
+ * Read a 32-bit prefix value from the address held in the SIE block's
+ * sidad field, install it as the vcpu's prefix and trace it.
+ * Always returns 0.
+ */
+static int handle_pv_spx(struct kvm_vcpu *vcpu)
+{
+       u32 *src = (u32 *)vcpu->arch.sie_block->sidad;
+       u32 pref = *src;
+
+       kvm_s390_set_prefix(vcpu, pref);
+       trace_kvm_s390_handle_prefix(vcpu, 1, pref);
+       return 0;
+}
+
+/*
+ * Mark a service-signal external interrupt as pending and unmasked in the
+ * VM's floating interrupt state, under the floating interrupt lock.
+ * Always returns 0.
+ */
+static int handle_pv_sclp(struct kvm_vcpu *vcpu)
+{
+       struct kvm_s390_float_interrupt *float_int;
+
+       float_int = &vcpu->kvm->arch.float_int;
+       spin_lock(&float_int->lock);
+       /*
+        * 2 cases:
+        * a: an sccb answering interrupt was already pending or in flight.
+        *    As the sccb value is not known we can simply set some value to
+        *    trigger delivery of a saved SCCB. UV will then use its saved
+        *    copy of the SCCB value.
+        * b: an error SCCB interrupt needs to be injected so we also inject
+        *    a fake SCCB address. Firmware will use the proper one.
+        * This makes sure, that both errors and real sccb returns will only
+        * be delivered after a notification intercept (instruction has
+        * finished) but not after others.
+        */
+       float_int->srv_signal.ext_params |= 0x43000;
+       set_bit(IRQ_PEND_EXT_SERVICE, &float_int->pending_irqs);
+       clear_bit(IRQ_PEND_EXT_SERVICE, &float_int->masked_irqs);
+       spin_unlock(&float_int->lock);
+       return 0;
+}
+
+/*
+ * Handle a notification intercept for a guest UV call: for
+ * UVC_CMD_REMOVE_SHARED_ACCESS, issue UVC_CMD_UNPIN_PAGE_SHARED for the
+ * address found in the guest's control block. Any other guest command
+ * triggers a one-time warning and is otherwise ignored.
+ *
+ * Returns 0 on success, for unexpected commands and for -EINVAL from
+ * gmap_make_secure(); any other gmap_make_secure() result is passed on.
+ */
+static int handle_pv_uvc(struct kvm_vcpu *vcpu)
+{
+       struct uv_cb_share *guest_uvcb = (void *)vcpu->arch.sie_block->sidad;
+       struct uv_cb_cts unpin = {
+               .header.cmd     = UVC_CMD_UNPIN_PAGE_SHARED,
+               .header.len     = sizeof(unpin),
+               .guest_handle   = kvm_s390_pv_get_handle(vcpu->kvm),
+               .gaddr          = guest_uvcb->paddr,
+       };
+       int rc;
+
+       if (guest_uvcb->header.cmd != UVC_CMD_REMOVE_SHARED_ACCESS) {
+               WARN_ONCE(1, "Unexpected notification intercept for UVC 0x%x\n",
+                         guest_uvcb->header.cmd);
+               return 0;
+       }
+       rc = gmap_make_secure(vcpu->arch.gmap, unpin.gaddr, &unpin);
+       /*
+        * If the unpin did not succeed, the guest will exit again for the UVC
+        * and we will retry the unpin.
+        */
+       return rc == -EINVAL ? 0 : rc;
+}
+
+/*
+ * Dispatch a notification intercept on the intercepted instruction code
+ * (ipa) to the spx, sclp or uvc handler; everything else goes through
+ * the regular instruction handling.
+ */
+static int handle_pv_notification(struct kvm_vcpu *vcpu)
+{
+       switch (vcpu->arch.sie_block->ipa) {
+       case 0xb210:
+               return handle_pv_spx(vcpu);
+       case 0xb220:
+               return handle_pv_sclp(vcpu);
+       case 0xb9a4:
+               return handle_pv_uvc(vcpu);
+       default:
+               return handle_instruction(vcpu);
+       }
+}
+
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
 {
        int rc, per_rc = 0;
@@ -480,6 +567,28 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
        case ICPT_KSS:
                rc = kvm_s390_skey_check_enable(vcpu);
                break;
+       case ICPT_MCHKREQ:
+       case ICPT_INT_ENABLE:
+               /*
+                * PSW bit 13 or a CR (0, 6, 14) changed and we might
+                * now be able to deliver interrupts. The pre-run code
+                * will take care of this.
+                */
+               rc = 0;
+               break;
+       case ICPT_PV_INSTR:
+               rc = handle_instruction(vcpu);
+               break;
+       case ICPT_PV_NOTIFY:
+               rc = handle_pv_notification(vcpu);
+               break;
+       case ICPT_PV_PREF:
+               rc = 0;
+               gmap_convert_to_secure(vcpu->arch.gmap,
+                                      kvm_s390_get_prefix(vcpu));
+               gmap_convert_to_secure(vcpu->arch.gmap,
+                                      kvm_s390_get_prefix(vcpu) + PAGE_SIZE);
+               break;
        default:
                return -EOPNOTSUPP;
        }
index c06c89d..8191106 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * handling kvm guest interrupts
  *
- * Copyright IBM Corp. 2008, 2015
+ * Copyright IBM Corp. 2008, 2020
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  */
@@ -324,8 +324,11 @@ static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
 
 static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu)
 {
-       return vcpu->kvm->arch.float_int.pending_irqs |
-               vcpu->arch.local_int.pending_irqs;
+       unsigned long pending = vcpu->kvm->arch.float_int.pending_irqs |
+                               vcpu->arch.local_int.pending_irqs;
+
+       pending &= ~vcpu->kvm->arch.float_int.masked_irqs;
+       return pending;
 }
 
 static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
@@ -383,10 +386,18 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
                __clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &active_mask);
        if (!(vcpu->arch.sie_block->gcr[0] & CR0_CPU_TIMER_SUBMASK))
                __clear_bit(IRQ_PEND_EXT_CPU_TIMER, &active_mask);
-       if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK))
+       if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) {
                __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask);
+               __clear_bit(IRQ_PEND_EXT_SERVICE_EV, &active_mask);
+       }
        if (psw_mchk_disabled(vcpu))
                active_mask &= ~IRQ_PEND_MCHK_MASK;
+       /* PV guest cpus can have a single interruption injected at a time. */
+       if (kvm_s390_pv_cpu_is_protected(vcpu) &&
+           vcpu->arch.sie_block->iictl != IICTL_CODE_NONE)
+               active_mask &= ~(IRQ_PEND_EXT_II_MASK |
+                                IRQ_PEND_IO_MASK |
+                                IRQ_PEND_MCHK_MASK);
        /*
         * Check both floating and local interrupt's cr14 because
         * bit IRQ_PEND_MCHK_REP could be set in both cases.
@@ -479,19 +490,23 @@ static void set_intercept_indicators(struct kvm_vcpu *vcpu)
 static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
-       int rc;
+       int rc = 0;
 
        vcpu->stat.deliver_cputm++;
        trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER,
                                         0, 0);
-
-       rc  = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER,
-                          (u16 *)__LC_EXT_INT_CODE);
-       rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
-       rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
-                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
-                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+               vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+               vcpu->arch.sie_block->eic = EXT_IRQ_CPU_TIMER;
+       } else {
+               rc  = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER,
+                                  (u16 *)__LC_EXT_INT_CODE);
+               rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
+               rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       }
        clear_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
        return rc ? -EFAULT : 0;
 }
@@ -499,19 +514,23 @@ static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
 static int __must_check __deliver_ckc(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
-       int rc;
+       int rc = 0;
 
        vcpu->stat.deliver_ckc++;
        trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP,
                                         0, 0);
-
-       rc  = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP,
-                          (u16 __user *)__LC_EXT_INT_CODE);
-       rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
-       rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
-                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
-                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+               vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+               vcpu->arch.sie_block->eic = EXT_IRQ_CLK_COMP;
+       } else {
+               rc  = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP,
+                                  (u16 __user *)__LC_EXT_INT_CODE);
+               rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
+               rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       }
        clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs);
        return rc ? -EFAULT : 0;
 }
@@ -553,6 +572,20 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
        union mci mci;
        int rc;
 
+       /*
+        * All other possible payload for a machine check (e.g. the register
+        * contents in the save area) will be handled by the ultravisor, as
+        * the hypervisor does not have the needed information for
+        * protected guests.
+        */
+       if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+               vcpu->arch.sie_block->iictl = IICTL_CODE_MCHK;
+               vcpu->arch.sie_block->mcic = mchk->mcic;
+               vcpu->arch.sie_block->faddr = mchk->failing_storage_address;
+               vcpu->arch.sie_block->edc = mchk->ext_damage_code;
+               return 0;
+       }
+
        mci.val = mchk->mcic;
        /* take care of lazy register loading */
        save_fpu_regs();
@@ -696,17 +729,21 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
 static int __must_check __deliver_restart(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
-       int rc;
+       int rc = 0;
 
        VCPU_EVENT(vcpu, 3, "%s", "deliver: cpu restart");
        vcpu->stat.deliver_restart_signal++;
        trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0);
 
-       rc  = write_guest_lc(vcpu,
-                            offsetof(struct lowcore, restart_old_psw),
-                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw),
-                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+               vcpu->arch.sie_block->iictl = IICTL_CODE_RESTART;
+       } else {
+               rc  = write_guest_lc(vcpu,
+                                    offsetof(struct lowcore, restart_old_psw),
+                                    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw),
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       }
        clear_bit(IRQ_PEND_RESTART, &li->pending_irqs);
        return rc ? -EFAULT : 0;
 }
@@ -748,6 +785,12 @@ static int __must_check __deliver_emergency_signal(struct kvm_vcpu *vcpu)
        vcpu->stat.deliver_emergency_signal++;
        trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY,
                                         cpu_addr, 0);
+       if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+               vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+               vcpu->arch.sie_block->eic = EXT_IRQ_EMERGENCY_SIG;
+               vcpu->arch.sie_block->extcpuaddr = cpu_addr;
+               return 0;
+       }
 
        rc  = put_guest_lc(vcpu, EXT_IRQ_EMERGENCY_SIG,
                           (u16 *)__LC_EXT_INT_CODE);
@@ -776,6 +819,12 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu)
        trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
                                         KVM_S390_INT_EXTERNAL_CALL,
                                         extcall.code, 0);
+       if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+               vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+               vcpu->arch.sie_block->eic = EXT_IRQ_EXTERNAL_CALL;
+               vcpu->arch.sie_block->extcpuaddr = extcall.code;
+               return 0;
+       }
 
        rc  = put_guest_lc(vcpu, EXT_IRQ_EXTERNAL_CALL,
                           (u16 *)__LC_EXT_INT_CODE);
@@ -787,6 +836,21 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu)
        return rc ? -EFAULT : 0;
 }
 
+static int __deliver_prog_pv(struct kvm_vcpu *vcpu, u16 code)
+{
+       switch (code) {
+       case PGM_SPECIFICATION:
+               vcpu->arch.sie_block->iictl = IICTL_CODE_SPECIFICATION;
+               break;
+       case PGM_OPERAND:
+               vcpu->arch.sie_block->iictl = IICTL_CODE_OPERAND;
+               break;
+       default:
+               return -EINVAL;
+       }
+       return 0;
+}
+
 static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -807,6 +871,10 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
        trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
                                         pgm_info.code, 0);
 
+       /* PER is handled by the ultravisor */
+       if (kvm_s390_pv_cpu_is_protected(vcpu))
+               return __deliver_prog_pv(vcpu, pgm_info.code & ~PGM_PER);
+
        switch (pgm_info.code & ~PGM_PER) {
        case PGM_AFX_TRANSLATION:
        case PGM_ASX_TRANSLATION:
@@ -818,7 +886,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
        case PGM_PRIMARY_AUTHORITY:
        case PGM_SECONDARY_AUTHORITY:
                nullifying = true;
-               /* fall through */
+               fallthrough;
        case PGM_SPACE_SWITCH:
                rc = put_guest_lc(vcpu, pgm_info.trans_exc_code,
                                  (u64 *)__LC_TRANS_EXC_CODE);
@@ -902,20 +970,49 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
        return rc ? -EFAULT : 0;
 }
 
+#define SCCB_MASK 0xFFFFFFF8
+#define SCCB_EVENT_PENDING 0x3
+
+static int write_sclp(struct kvm_vcpu *vcpu, u32 parm)
+{
+       int rc;
+
+       if (kvm_s390_pv_cpu_get_handle(vcpu)) {
+               vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+               vcpu->arch.sie_block->eic = EXT_IRQ_SERVICE_SIG;
+               vcpu->arch.sie_block->eiparams = parm;
+               return 0;
+       }
+
+       rc  = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE);
+       rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
+       rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       rc |= put_guest_lc(vcpu, parm,
+                          (u32 *)__LC_EXT_PARAMS);
+
+       return rc ? -EFAULT : 0;
+}
+
 static int __must_check __deliver_service(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        struct kvm_s390_ext_info ext;
-       int rc = 0;
 
        spin_lock(&fi->lock);
-       if (!(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) {
+       if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs) ||
+           !(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) {
                spin_unlock(&fi->lock);
                return 0;
        }
        ext = fi->srv_signal;
        memset(&fi->srv_signal, 0, sizeof(ext));
        clear_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs);
+       clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs);
+       if (kvm_s390_pv_cpu_is_protected(vcpu))
+               set_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs);
        spin_unlock(&fi->lock);
 
        VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x",
@@ -924,16 +1021,31 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu)
        trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE,
                                         ext.ext_params, 0);
 
-       rc  = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE);
-       rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
-       rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
-                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
-                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= put_guest_lc(vcpu, ext.ext_params,
-                          (u32 *)__LC_EXT_PARAMS);
+       return write_sclp(vcpu, ext.ext_params);
+}
 
-       return rc ? -EFAULT : 0;
+static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu)
+{
+       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+       struct kvm_s390_ext_info ext;
+
+       spin_lock(&fi->lock);
+       if (!(test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs))) {
+               spin_unlock(&fi->lock);
+               return 0;
+       }
+       ext = fi->srv_signal;
+       /* only clear the event bit */
+       fi->srv_signal.ext_params &= ~SCCB_EVENT_PENDING;
+       clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs);
+       spin_unlock(&fi->lock);
+
+       VCPU_EVENT(vcpu, 4, "%s", "deliver: sclp parameter event");
+       vcpu->stat.deliver_service_signal++;
+       trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE,
+                                        ext.ext_params, 0);
+
+       return write_sclp(vcpu, SCCB_EVENT_PENDING);
 }
 
 static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu)
@@ -1028,6 +1140,15 @@ static int __do_deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_io_info *io)
 {
        int rc;
 
+       if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+               vcpu->arch.sie_block->iictl = IICTL_CODE_IO;
+               vcpu->arch.sie_block->subchannel_id = io->subchannel_id;
+               vcpu->arch.sie_block->subchannel_nr = io->subchannel_nr;
+               vcpu->arch.sie_block->io_int_parm = io->io_int_parm;
+               vcpu->arch.sie_block->io_int_word = io->io_int_word;
+               return 0;
+       }
+
        rc  = put_guest_lc(vcpu, io->subchannel_id, (u16 *)__LC_SUBCHANNEL_ID);
        rc |= put_guest_lc(vcpu, io->subchannel_nr, (u16 *)__LC_SUBCHANNEL_NR);
        rc |= put_guest_lc(vcpu, io->io_int_parm, (u32 *)__LC_IO_INT_PARM);
@@ -1329,6 +1450,9 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
                case IRQ_PEND_EXT_SERVICE:
                        rc = __deliver_service(vcpu);
                        break;
+               case IRQ_PEND_EXT_SERVICE_EV:
+                       rc = __deliver_service_ev(vcpu);
+                       break;
                case IRQ_PEND_PFAULT_DONE:
                        rc = __deliver_pfault_done(vcpu);
                        break;
@@ -1421,7 +1545,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
        if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL)
                return -EINVAL;
 
-       if (sclp.has_sigpif)
+       if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu))
                return sca_inject_ext_call(vcpu, src_id);
 
        if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
@@ -1681,9 +1805,6 @@ out:
        return inti;
 }
 
-#define SCCB_MASK 0xFFFFFFF8
-#define SCCB_EVENT_PENDING 0x3
-
 static int __inject_service(struct kvm *kvm,
                             struct kvm_s390_interrupt_info *inti)
 {
@@ -1692,6 +1813,11 @@ static int __inject_service(struct kvm *kvm,
        kvm->stat.inject_service_signal++;
        spin_lock(&fi->lock);
        fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING;
+
+       /* We always allow events, track them separately from the sccb ints */
+       if (fi->srv_signal.ext_params & SCCB_EVENT_PENDING)
+               set_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs);
+
        /*
         * Early versions of the QEMU s390 bios will inject several
         * service interrupts after another without handling a
@@ -1773,7 +1899,14 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
        kvm->stat.inject_io++;
        isc = int_word_to_isc(inti->io.io_int_word);
 
-       if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) {
+       /*
+        * Do not make use of gisa in protected mode. We do not use the lock
+        * checking variant as this is just a performance optimization and we
+        * do not hold the lock here. This is ok as the code will pick
+        * interrupts from both "lists" for delivery.
+        */
+       if (!kvm_s390_pv_get_handle(kvm) &&
+           gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) {
                VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc);
                gisa_set_ipm_gisc(gi->origin, isc);
                kfree(inti);
@@ -1834,7 +1967,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
                break;
        case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
                if (!(type & KVM_S390_INT_IO_AI_MASK &&
-                     kvm->arch.gisa_int.origin))
+                     kvm->arch.gisa_int.origin) ||
+                     kvm_s390_pv_cpu_get_handle(dst_vcpu))
                        kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT);
                break;
        default:
@@ -2080,6 +2214,10 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm)
        struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
        int i;
 
+       mutex_lock(&kvm->lock);
+       if (!kvm_s390_pv_is_protected(kvm))
+               fi->masked_irqs = 0;
+       mutex_unlock(&kvm->lock);
        spin_lock(&fi->lock);
        fi->pending_irqs = 0;
        memset(&fi->srv_signal, 0, sizeof(fi->srv_signal));
@@ -2146,7 +2284,8 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
                        n++;
                }
        }
-       if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs)) {
+       if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs) ||
+           test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs)) {
                if (n == max_irqs) {
                        /* signal userspace to try again */
                        ret = -ENOMEM;
@@ -2327,9 +2466,6 @@ static int register_io_adapter(struct kvm_device *dev,
        if (!adapter)
                return -ENOMEM;
 
-       INIT_LIST_HEAD(&adapter->maps);
-       init_rwsem(&adapter->maps_lock);
-       atomic_set(&adapter->nr_maps, 0);
        adapter->id = adapter_info.id;
        adapter->isc = adapter_info.isc;
        adapter->maskable = adapter_info.maskable;
@@ -2354,87 +2490,12 @@ int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked)
        return ret;
 }
 
-static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
-{
-       struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
-       struct s390_map_info *map;
-       int ret;
-
-       if (!adapter || !addr)
-               return -EINVAL;
-
-       map = kzalloc(sizeof(*map), GFP_KERNEL);
-       if (!map) {
-               ret = -ENOMEM;
-               goto out;
-       }
-       INIT_LIST_HEAD(&map->list);
-       map->guest_addr = addr;
-       map->addr = gmap_translate(kvm->arch.gmap, addr);
-       if (map->addr == -EFAULT) {
-               ret = -EFAULT;
-               goto out;
-       }
-       ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page);
-       if (ret < 0)
-               goto out;
-       BUG_ON(ret != 1);
-       down_write(&adapter->maps_lock);
-       if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) {
-               list_add_tail(&map->list, &adapter->maps);
-               ret = 0;
-       } else {
-               put_page(map->page);
-               ret = -EINVAL;
-       }
-       up_write(&adapter->maps_lock);
-out:
-       if (ret)
-               kfree(map);
-       return ret;
-}
-
-static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
-{
-       struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
-       struct s390_map_info *map, *tmp;
-       int found = 0;
-
-       if (!adapter || !addr)
-               return -EINVAL;
-
-       down_write(&adapter->maps_lock);
-       list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
-               if (map->guest_addr == addr) {
-                       found = 1;
-                       atomic_dec(&adapter->nr_maps);
-                       list_del(&map->list);
-                       put_page(map->page);
-                       kfree(map);
-                       break;
-               }
-       }
-       up_write(&adapter->maps_lock);
-
-       return found ? 0 : -EINVAL;
-}
-
 void kvm_s390_destroy_adapters(struct kvm *kvm)
 {
        int i;
-       struct s390_map_info *map, *tmp;
 
-       for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
-               if (!kvm->arch.adapters[i])
-                       continue;
-               list_for_each_entry_safe(map, tmp,
-                                        &kvm->arch.adapters[i]->maps, list) {
-                       list_del(&map->list);
-                       put_page(map->page);
-                       kfree(map);
-               }
+       for (i = 0; i < MAX_S390_IO_ADAPTERS; i++)
                kfree(kvm->arch.adapters[i]);
-       }
 }
 
 static int modify_io_adapter(struct kvm_device *dev,
@@ -2456,11 +2517,14 @@ static int modify_io_adapter(struct kvm_device *dev,
                if (ret > 0)
                        ret = 0;
                break;
+       /*
+        * The following operations are no longer needed and therefore no-ops.
+        * The gpa to hva translation is done when an IRQ route is set up. The
+        * set_irq code uses get_user_pages_remote() to do the actual write.
+        */
        case KVM_S390_IO_ADAPTER_MAP:
-               ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr);
-               break;
        case KVM_S390_IO_ADAPTER_UNMAP:
-               ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr);
+               ret = 0;
                break;
        default:
                ret = -EINVAL;
@@ -2699,19 +2763,15 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
        return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
 }
 
-static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter,
-                                         u64 addr)
+static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
 {
-       struct s390_map_info *map;
+       struct page *page = NULL;
 
-       if (!adapter)
-               return NULL;
-
-       list_for_each_entry(map, &adapter->maps, list) {
-               if (map->guest_addr == addr)
-                       return map;
-       }
-       return NULL;
+       down_read(&kvm->mm->mmap_sem);
+       get_user_pages_remote(NULL, kvm->mm, uaddr, 1, FOLL_WRITE,
+                             &page, NULL, NULL);
+       up_read(&kvm->mm->mmap_sem);
+       return page;
 }
 
 static int adapter_indicators_set(struct kvm *kvm,
@@ -2720,30 +2780,35 @@ static int adapter_indicators_set(struct kvm *kvm,
 {
        unsigned long bit;
        int summary_set, idx;
-       struct s390_map_info *info;
+       struct page *ind_page, *summary_page;
        void *map;
 
-       info = get_map_info(adapter, adapter_int->ind_addr);
-       if (!info)
+       ind_page = get_map_page(kvm, adapter_int->ind_addr);
+       if (!ind_page)
                return -1;
-       map = page_address(info->page);
-       bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap);
-       set_bit(bit, map);
-       idx = srcu_read_lock(&kvm->srcu);
-       mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
-       set_page_dirty_lock(info->page);
-       info = get_map_info(adapter, adapter_int->summary_addr);
-       if (!info) {
-               srcu_read_unlock(&kvm->srcu, idx);
+       summary_page = get_map_page(kvm, adapter_int->summary_addr);
+       if (!summary_page) {
+               put_page(ind_page);
                return -1;
        }
-       map = page_address(info->page);
-       bit = get_ind_bit(info->addr, adapter_int->summary_offset,
-                         adapter->swap);
+
+       idx = srcu_read_lock(&kvm->srcu);
+       map = page_address(ind_page);
+       bit = get_ind_bit(adapter_int->ind_addr,
+                         adapter_int->ind_offset, adapter->swap);
+       set_bit(bit, map);
+       mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT);
+       set_page_dirty_lock(ind_page);
+       map = page_address(summary_page);
+       bit = get_ind_bit(adapter_int->summary_addr,
+                         adapter_int->summary_offset, adapter->swap);
        summary_set = test_and_set_bit(bit, map);
-       mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
-       set_page_dirty_lock(info->page);
+       mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT);
+       set_page_dirty_lock(summary_page);
        srcu_read_unlock(&kvm->srcu, idx);
+
+       put_page(ind_page);
+       put_page(summary_page);
        return summary_set ? 0 : 1;
 }
 
@@ -2765,9 +2830,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
        adapter = get_io_adapter(kvm, e->adapter.adapter_id);
        if (!adapter)
                return -1;
-       down_read(&adapter->maps_lock);
        ret = adapter_indicators_set(kvm, adapter, &e->adapter);
-       up_read(&adapter->maps_lock);
        if ((ret > 0) && !adapter->masked) {
                ret = kvm_s390_inject_airq(kvm, adapter);
                if (ret == 0)
@@ -2818,23 +2881,27 @@ int kvm_set_routing_entry(struct kvm *kvm,
                          struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue)
 {
-       int ret;
+       u64 uaddr;
 
        switch (ue->type) {
+       /* we store the userspace addresses instead of the guest addresses */
        case KVM_IRQ_ROUTING_S390_ADAPTER:
                e->set = set_adapter_int;
-               e->adapter.summary_addr = ue->u.adapter.summary_addr;
-               e->adapter.ind_addr = ue->u.adapter.ind_addr;
+               uaddr =  gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr);
+               if (uaddr == -EFAULT)
+                       return -EFAULT;
+               e->adapter.summary_addr = uaddr;
+               uaddr =  gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr);
+               if (uaddr == -EFAULT)
+                       return -EFAULT;
+               e->adapter.ind_addr = uaddr;
                e->adapter.summary_offset = ue->u.adapter.summary_offset;
                e->adapter.ind_offset = ue->u.adapter.ind_offset;
                e->adapter.adapter_id = ue->u.adapter.adapter_id;
-               ret = 0;
-               break;
+               return 0;
        default:
-               ret = -EINVAL;
+               return -EINVAL;
        }
-
-       return ret;
 }
 
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
index d7ff30e..6b2649b 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * hosting IBM Z kernel virtual machines (s390x)
  *
- * Copyright IBM Corp. 2008, 2018
+ * Copyright IBM Corp. 2008, 2020
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
@@ -44,6 +44,7 @@
 #include <asm/cpacf.h>
 #include <asm/timex.h>
 #include <asm/ap.h>
+#include <asm/uv.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 
@@ -184,6 +185,11 @@ static u8 halt_poll_max_steal = 10;
 module_param(halt_poll_max_steal, byte, 0644);
 MODULE_PARM_DESC(halt_poll_max_steal, "Maximum percentage of steal time to allow polling");
 
+/* if set to true, the GISA will be initialized and used if available */
+static bool use_gisa  = true;
+module_param(use_gisa, bool, 0644);
+MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it.");
+
 /*
  * For now we handle at most 16 double words as this is what the s390 base
  * kernel handles and stores in the prefix page. If we ever need to go beyond
@@ -220,6 +226,7 @@ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
 static struct gmap_notifier gmap_notifier;
 static struct gmap_notifier vsie_gmap_notifier;
 debug_info_t *kvm_s390_dbf;
+debug_info_t *kvm_s390_dbf_uv;
 
 /* Section: not file related */
 int kvm_arch_hardware_enable(void)
@@ -233,8 +240,10 @@ int kvm_arch_check_processor_compat(void)
        return 0;
 }
 
+/* forward declarations */
 static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
                              unsigned long end);
+static int sca_switch_to_extended(struct kvm *kvm);
 
 static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta)
 {
@@ -460,7 +469,12 @@ int kvm_arch_init(void *opaque)
        if (!kvm_s390_dbf)
                return -ENOMEM;
 
-       if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view))
+       kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long));
+       if (!kvm_s390_dbf_uv)
+               goto out;
+
+       if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) ||
+           debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view))
                goto out;
 
        kvm_s390_cpu_feat_init();
@@ -487,6 +501,7 @@ void kvm_arch_exit(void)
 {
        kvm_s390_gib_destroy();
        debug_unregister(kvm_s390_dbf);
+       debug_unregister(kvm_s390_dbf_uv);
 }
 
 /* Section: device related */
@@ -564,14 +579,16 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_S390_BPB:
                r = test_facility(82);
                break;
+       case KVM_CAP_S390_PROTECTED:
+               r = is_prot_virt_host();
+               break;
        default:
                r = 0;
        }
        return r;
 }
 
-static void kvm_s390_sync_dirty_log(struct kvm *kvm,
-                                   struct kvm_memory_slot *memslot)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
        int i;
        gfn_t cur_gfn, last_gfn;
@@ -612,9 +629,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 {
        int r;
        unsigned long n;
-       struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
-       int is_dirty = 0;
+       int is_dirty;
 
        if (kvm_is_ucontrol(kvm))
                return -EINVAL;
@@ -625,14 +641,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        if (log->slot >= KVM_USER_MEM_SLOTS)
                goto out;
 
-       slots = kvm_memslots(kvm);
-       memslot = id_to_memslot(slots, log->slot);
-       r = -ENOENT;
-       if (!memslot->dirty_bitmap)
-               goto out;
-
-       kvm_s390_sync_dirty_log(kvm, memslot);
-       r = kvm_get_dirty_log(kvm, log, &is_dirty);
+       r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot);
        if (r)
                goto out;
 
@@ -1993,6 +2002,9 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
        struct kvm_memslots *slots = kvm_memslots(kvm);
        struct kvm_memory_slot *ms;
 
+       if (unlikely(!slots->used_slots))
+               return 0;
+
        cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn);
        ms = gfn_to_memslot(kvm, cur_gfn);
        args->count = 0;
@@ -2158,6 +2170,194 @@ out:
        return r;
 }
 
+/*
+ * Call kvm_s390_pv_destroy_cpu() for every vCPU of @kvm, each under that
+ * vCPU's mutex.
+ *
+ * Returns 0 if every call succeeded. Otherwise returns -EIO and stores the
+ * rc/rrc of the first failing vCPU in *rcp / *rrcp; these are left
+ * untouched when nothing fails.
+ */
+static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp)
+{
+       u16 cpu_rc, cpu_rrc;
+       struct kvm_vcpu *vcpu;
+       int first_err = 0;
+       int idx;
+
+       /*
+        * A failure does not stop the walk: we still try to destroy as
+        * many CPUs as possible. The resources assigned to a CPU whose
+        * destroy failed must not be freed, because the ultravisor can
+        * still access that memory, so kvm_s390_pv_destroy_cpu may leave
+        * a deliberate memory leak behind.
+        * Only the first failure's rc and rrc are reported to the caller.
+        */
+       kvm_for_each_vcpu(idx, vcpu, kvm) {
+               mutex_lock(&vcpu->mutex);
+               if (kvm_s390_pv_destroy_cpu(vcpu, &cpu_rc, &cpu_rrc)) {
+                       if (!first_err) {
+                               *rcp = cpu_rc;
+                               *rrcp = cpu_rrc;
+                               first_err = -EIO;
+                       }
+               }
+               mutex_unlock(&vcpu->mutex);
+       }
+       return first_err;
+}
+
+/*
+ * Call kvm_s390_pv_create_cpu() for every vCPU of @kvm, each under that
+ * vCPU's mutex, passing @rc / @rrc through to the create call.
+ *
+ * On the first failure the walk stops, kvm_s390_cpus_from_pv() is run over
+ * all vCPUs as rollback (its rc/rrc are discarded) and the create error is
+ * returned. Returns 0 when every vCPU was converted.
+ */
+static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+       struct kvm_vcpu *vcpu;
+       u16 ignored;
+       int idx, err;
+
+       kvm_for_each_vcpu(idx, vcpu, kvm) {
+               mutex_lock(&vcpu->mutex);
+               err = kvm_s390_pv_create_cpu(vcpu, rc, rrc);
+               mutex_unlock(&vcpu->mutex);
+               if (err) {
+                       /* undo the conversion; keep the create error */
+                       kvm_s390_cpus_from_pv(kvm, &ignored, &ignored);
+                       return err;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Dispatch one protected-virtualization sub-command (cmd->cmd) for @kvm.
+ *
+ * cmd->data is interpreted as a userspace pointer by the commands that take
+ * a parameter block. The rc/rrc reported by the called helpers are stored in
+ * cmd->rc / cmd->rrc. KVM_PV_ENABLE requires that the VM is not yet
+ * protected; every other command requires that it is, and fails with
+ * -EINVAL otherwise. Unknown commands return -ENOTTY.
+ */
+static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
+{
+       int r = 0;
+       u16 dummy;
+       void __user *argp = (void __user *)cmd->data;
+
+       switch (cmd->cmd) {
+       case KVM_PV_ENABLE: {
+               r = -EINVAL;
+               if (kvm_s390_pv_is_protected(kvm))
+                       break;
+
+               /*
+                * FMT 4 SIE needs esca. As we never switch back to bsca from
+                * esca, we need no cleanup in the error cases below
+                */
+               r = sca_switch_to_extended(kvm);
+               if (r)
+                       break;
+
+               down_write(&current->mm->mmap_sem);
+               r = gmap_mark_unmergeable();
+               up_write(&current->mm->mmap_sem);
+               if (r)
+                       break;
+
+               r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc);
+               if (r)
+                       break;
+
+               /* on failure undo the VM init; rc/rrc of the undo are dropped */
+               r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc);
+               if (r)
+                       kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
+
+               /*
+                * we need to block service interrupts from now on
+                * NOTE(review): this is also reached when the CPU conversion
+                * above failed and the VM was deinitialized again - confirm
+                * that masking is intended on that path.
+                */
+               set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
+               break;
+       }
+       case KVM_PV_DISABLE: {
+               r = -EINVAL;
+               if (!kvm_s390_pv_is_protected(kvm))
+                       break;
+
+               r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc);
+               /*
+                * If a CPU could not be destroyed, destroy VM will also fail.
+                * There is no point in trying to destroy it. Instead return
+                * the rc and rrc from the first CPU that failed destroying.
+                */
+               if (r)
+                       break;
+               r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc);
+
+               /* no need to block service interrupts any more */
+               clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
+               break;
+       }
+       case KVM_PV_SET_SEC_PARMS: {
+               struct kvm_s390_pv_sec_parm parms = {};
+               void *hdr;
+
+               r = -EINVAL;
+               if (!kvm_s390_pv_is_protected(kvm))
+                       break;
+
+               r = -EFAULT;
+               if (copy_from_user(&parms, argp, sizeof(parms)))
+                       break;
+
+               /* Currently restricted to 8KB */
+               r = -EINVAL;
+               if (parms.length > PAGE_SIZE * 2)
+                       break;
+
+               r = -ENOMEM;
+               hdr = vmalloc(parms.length);
+               if (!hdr)
+                       break;
+
+               /* r stays -EFAULT unless the header was copied in completely */
+               r = -EFAULT;
+               if (!copy_from_user(hdr, (void __user *)parms.origin,
+                                   parms.length))
+                       r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length,
+                                                     &cmd->rc, &cmd->rrc);
+
+               vfree(hdr);
+               break;
+       }
+       case KVM_PV_UNPACK: {
+               struct kvm_s390_pv_unp unp = {};
+
+               r = -EINVAL;
+               if (!kvm_s390_pv_is_protected(kvm))
+                       break;
+
+               r = -EFAULT;
+               if (copy_from_user(&unp, argp, sizeof(unp)))
+                       break;
+
+               r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak,
+                                      &cmd->rc, &cmd->rrc);
+               break;
+       }
+       /* The next three commands are parameterless calls on the VM handle. */
+       case KVM_PV_VERIFY: {
+               r = -EINVAL;
+               if (!kvm_s390_pv_is_protected(kvm))
+                       break;
+
+               r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+                                 UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc);
+               KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc,
+                            cmd->rrc);
+               break;
+       }
+       case KVM_PV_PREP_RESET: {
+               r = -EINVAL;
+               if (!kvm_s390_pv_is_protected(kvm))
+                       break;
+
+               r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+                                 UVC_CMD_PREPARE_RESET, &cmd->rc, &cmd->rrc);
+               KVM_UV_EVENT(kvm, 3, "PROTVIRT PREP RESET: rc %x rrc %x",
+                            cmd->rc, cmd->rrc);
+               break;
+       }
+       case KVM_PV_UNSHARE_ALL: {
+               r = -EINVAL;
+               if (!kvm_s390_pv_is_protected(kvm))
+                       break;
+
+               r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+                                 UVC_CMD_SET_UNSHARE_ALL, &cmd->rc, &cmd->rrc);
+               KVM_UV_EVENT(kvm, 3, "PROTVIRT UNSHARE: rc %x rrc %x",
+                            cmd->rc, cmd->rrc);
+               break;
+       }
+       default:
+               r = -ENOTTY;
+       }
+       return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
@@ -2255,6 +2455,33 @@ long kvm_arch_vm_ioctl(struct file *filp,
                mutex_unlock(&kvm->slots_lock);
                break;
        }
+       case KVM_S390_PV_COMMAND: {
+               struct kvm_pv_cmd args;
+
+               /* protvirt means user sigp */
+               kvm->arch.user_cpu_state_ctrl = 1;
+               r = 0;
+               if (!is_prot_virt_host()) {
+                       r = -EINVAL;
+                       break;
+               }
+               if (copy_from_user(&args, argp, sizeof(args))) {
+                       r = -EFAULT;
+                       break;
+               }
+               if (args.flags) {
+                       r = -EINVAL;
+                       break;
+               }
+               mutex_lock(&kvm->lock);
+               r = kvm_s390_handle_pv(kvm, &args);
+               mutex_unlock(&kvm->lock);
+               if (copy_to_user(argp, &args, sizeof(args))) {
+                       r = -EFAULT;
+                       break;
+               }
+               break;
+       }
        default:
                r = -ENOTTY;
        }
@@ -2504,7 +2731,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm->arch.use_skf = sclp.has_skey;
        spin_lock_init(&kvm->arch.start_stop_lock);
        kvm_s390_vsie_init(kvm);
-       kvm_s390_gisa_init(kvm);
+       if (use_gisa)
+               kvm_s390_gisa_init(kvm);
        KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
 
        return 0;
@@ -2518,6 +2746,8 @@ out_err:
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
+       u16 rc, rrc;
+
        VCPU_EVENT(vcpu, 3, "%s", "free cpu");
        trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
        kvm_s390_clear_local_irqs(vcpu);
@@ -2530,6 +2760,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
        if (vcpu->kvm->arch.use_cmma)
                kvm_s390_vcpu_unsetup_cmma(vcpu);
+       /* We can not hold the vcpu mutex here, we are already dying */
+       if (kvm_s390_pv_cpu_get_handle(vcpu))
+               kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc);
        free_page((unsigned long)(vcpu->arch.sie_block));
 }
 
@@ -2551,10 +2784,20 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+       u16 rc, rrc;
+
        kvm_free_vcpus(kvm);
        sca_dispose(kvm);
-       debug_unregister(kvm->arch.dbf);
        kvm_s390_gisa_destroy(kvm);
+       /*
+        * We are already at the end of life and kvm->lock is not taken.
+        * This is ok as the file descriptor is closed by now and nobody
+        * can mess with the pv state. To avoid lockdep_assert_held from
+        * complaining we do not use kvm_s390_pv_is_protected.
+        */
+       if (kvm_s390_pv_get_handle(kvm))
+               kvm_s390_pv_deinit_vm(kvm, &rc, &rrc);
+       debug_unregister(kvm->arch.dbf);
        free_page((unsigned long)kvm->arch.sie_page2);
        if (!kvm_is_ucontrol(kvm))
                gmap_remove(kvm->arch.gmap);
@@ -2650,6 +2893,9 @@ static int sca_switch_to_extended(struct kvm *kvm)
        unsigned int vcpu_idx;
        u32 scaol, scaoh;
 
+       if (kvm->arch.use_esca)
+               return 0;
+
        new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL|__GFP_ZERO);
        if (!new_sca)
                return -ENOMEM;
@@ -2901,6 +3147,7 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu)
 static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
 {
        int rc = 0;
+       u16 uvrc, uvrrc;
 
        atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
                                                    CPUSTAT_SM |
@@ -2968,6 +3215,14 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
 
        kvm_s390_vcpu_crypto_setup(vcpu);
 
+       mutex_lock(&vcpu->kvm->lock);
+       if (kvm_s390_pv_is_protected(vcpu->kvm)) {
+               rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc);
+               if (rc)
+                       kvm_s390_vcpu_unsetup_cmma(vcpu);
+       }
+       mutex_unlock(&vcpu->kvm->lock);
+
        return rc;
 }
 
@@ -3274,14 +3529,21 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
        kvm_s390_set_prefix(vcpu, 0);
        kvm_s390_set_cpu_timer(vcpu, 0);
        vcpu->arch.sie_block->ckc = 0;
-       vcpu->arch.sie_block->todpr = 0;
        memset(vcpu->arch.sie_block->gcr, 0, sizeof(vcpu->arch.sie_block->gcr));
        vcpu->arch.sie_block->gcr[0] = CR0_INITIAL_MASK;
        vcpu->arch.sie_block->gcr[14] = CR14_INITIAL_MASK;
        vcpu->run->s.regs.fpc = 0;
-       vcpu->arch.sie_block->gbea = 1;
-       vcpu->arch.sie_block->pp = 0;
-       vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
+       /*
+        * Do not reset these registers in the protected case, as some of
+        * them are overlaid and they are not accessible in this case
+        * anyway.
+        */
+       if (!kvm_s390_pv_cpu_is_protected(vcpu)) {
+               vcpu->arch.sie_block->gbea = 1;
+               vcpu->arch.sie_block->pp = 0;
+               vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
+               vcpu->arch.sie_block->todpr = 0;
+       }
 }
 
 static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu)
@@ -3471,14 +3733,20 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 
        switch (mp_state->mp_state) {
        case KVM_MP_STATE_STOPPED:
-               kvm_s390_vcpu_stop(vcpu);
+               rc = kvm_s390_vcpu_stop(vcpu);
                break;
        case KVM_MP_STATE_OPERATING:
-               kvm_s390_vcpu_start(vcpu);
+               rc = kvm_s390_vcpu_start(vcpu);
                break;
        case KVM_MP_STATE_LOAD:
+               if (!kvm_s390_pv_cpu_is_protected(vcpu)) {
+                       rc = -ENXIO;
+                       break;
+               }
+               rc = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR_LOAD);
+               break;
        case KVM_MP_STATE_CHECK_STOP:
-               /* fall through - CHECK_STOP and LOAD are not supported yet */
+               fallthrough;    /* CHECK_STOP is not supported yet */
        default:
                rc = -ENXIO;
        }
@@ -3828,9 +4096,11 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
        return vcpu_post_run_fault_in_sie(vcpu);
 }
 
+#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK)
 static int __vcpu_run(struct kvm_vcpu *vcpu)
 {
        int rc, exit_reason;
+       struct sie_page *sie_page = (struct sie_page *)vcpu->arch.sie_block;
 
        /*
         * We try to hold kvm->srcu during most of vcpu_run (except when run-
@@ -3852,8 +4122,28 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                guest_enter_irqoff();
                __disable_cpu_timer_accounting(vcpu);
                local_irq_enable();
+               if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+                       memcpy(sie_page->pv_grregs,
+                              vcpu->run->s.regs.gprs,
+                              sizeof(sie_page->pv_grregs));
+               }
                exit_reason = sie64a(vcpu->arch.sie_block,
                                     vcpu->run->s.regs.gprs);
+               if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+                       memcpy(vcpu->run->s.regs.gprs,
+                              sie_page->pv_grregs,
+                              sizeof(sie_page->pv_grregs));
+                       /*
+                        * We're not allowed to inject interrupts on intercepts
+                        * that leave the guest state in an "in-between" state
+                        * where the next SIE entry will do a continuation.
+                        * Fence interrupts in our "internal" PSW.
+                        */
+                       if (vcpu->arch.sie_block->icptcode == ICPT_PV_INSTR ||
+                           vcpu->arch.sie_block->icptcode == ICPT_PV_PREF) {
+                               vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK;
+                       }
+               }
                local_irq_disable();
                __enable_cpu_timer_accounting(vcpu);
                guest_exit_irqoff();
@@ -3867,7 +4157,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
        return rc;
 }
 
-static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void sync_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        struct runtime_instr_cb *riccb;
        struct gs_cb *gscb;
@@ -3876,16 +4166,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        gscb = (struct gs_cb *) &kvm_run->s.regs.gscb;
        vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
        vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
-       if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX)
-               kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix);
-       if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) {
-               memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128);
-               /* some control register changes require a tlb flush */
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-       }
        if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) {
-               kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm);
-               vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc;
                vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr;
                vcpu->arch.sie_block->pp = kvm_run->s.regs.pp;
                vcpu->arch.sie_block->gbea = kvm_run->s.regs.gbea;
@@ -3926,6 +4207,36 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
                vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0;
        }
+       if (MACHINE_HAS_GS) {
+               preempt_disable();
+               __ctl_set_bit(2, 4);
+               if (current->thread.gs_cb) {
+                       vcpu->arch.host_gscb = current->thread.gs_cb;
+                       save_gs_cb(vcpu->arch.host_gscb);
+               }
+               if (vcpu->arch.gs_enabled) {
+                       current->thread.gs_cb = (struct gs_cb *)
+                                               &vcpu->run->s.regs.gscb;
+                       restore_gs_cb(current->thread.gs_cb);
+               }
+               preempt_enable();
+       }
+       /* SIE will load etoken directly from SDNX and therefore kvm_run */
+}
+
+static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX)
+               kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix);
+       if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) {
+               memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128);
+               /* some control register changes require a tlb flush */
+               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+       }
+       if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) {
+               kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm);
+               vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc;
+       }
        save_access_regs(vcpu->arch.host_acrs);
        restore_access_regs(vcpu->run->s.regs.acrs);
        /* save host (userspace) fprs/vrs */
@@ -3940,23 +4251,47 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        if (test_fp_ctl(current->thread.fpu.fpc))
                /* User space provided an invalid FPC, let's clear it */
                current->thread.fpu.fpc = 0;
+
+       /* Sync fmt2 only data */
+       if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) {
+               sync_regs_fmt2(vcpu, kvm_run);
+       } else {
+               /*
+                * In several places we have to modify our internal view to
+                * not do things that are disallowed by the ultravisor. For
+                * example we must not inject interrupts after specific exits
+                * (e.g. 112 prefix page not secure). We do this by turning
+                * off the machine check, external and I/O interrupt bits
+                * of our PSW copy. To avoid getting validity intercepts, we
+                * do only accept the condition code from userspace.
+                */
+               vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_CC;
+               vcpu->arch.sie_block->gpsw.mask |= kvm_run->psw_mask &
+                                                  PSW_MASK_CC;
+       }
+
+       kvm_run->kvm_dirty_regs = 0;
+}
+
+static void store_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr;
+       kvm_run->s.regs.pp = vcpu->arch.sie_block->pp;
+       kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea;
+       kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
        if (MACHINE_HAS_GS) {
-               preempt_disable();
                __ctl_set_bit(2, 4);
-               if (current->thread.gs_cb) {
-                       vcpu->arch.host_gscb = current->thread.gs_cb;
-                       save_gs_cb(vcpu->arch.host_gscb);
-               }
-               if (vcpu->arch.gs_enabled) {
-                       current->thread.gs_cb = (struct gs_cb *)
-                                               &vcpu->run->s.regs.gscb;
-                       restore_gs_cb(current->thread.gs_cb);
-               }
+               if (vcpu->arch.gs_enabled)
+                       save_gs_cb(current->thread.gs_cb);
+               preempt_disable();
+               current->thread.gs_cb = vcpu->arch.host_gscb;
+               restore_gs_cb(vcpu->arch.host_gscb);
                preempt_enable();
+               if (!vcpu->arch.host_gscb)
+                       __ctl_clear_bit(2, 4);
+               vcpu->arch.host_gscb = NULL;
        }
-       /* SIE will load etoken directly from SDNX and therefore kvm_run */
-
-       kvm_run->kvm_dirty_regs = 0;
+       /* SIE will save etoken directly into SDNX and therefore kvm_run */
 }
 
 static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -3967,13 +4302,9 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128);
        kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu);
        kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc;
-       kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr;
-       kvm_run->s.regs.pp = vcpu->arch.sie_block->pp;
-       kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea;
        kvm_run->s.regs.pft = vcpu->arch.pfault_token;
        kvm_run->s.regs.pfs = vcpu->arch.pfault_select;
        kvm_run->s.regs.pfc = vcpu->arch.pfault_compare;
-       kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
        save_access_regs(vcpu->run->s.regs.acrs);
        restore_access_regs(vcpu->arch.host_acrs);
        /* Save guest register state */
@@ -3982,19 +4313,8 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        /* Restore will be done lazily at return */
        current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc;
        current->thread.fpu.regs = vcpu->arch.host_fpregs.regs;
-       if (MACHINE_HAS_GS) {
-               __ctl_set_bit(2, 4);
-               if (vcpu->arch.gs_enabled)
-                       save_gs_cb(current->thread.gs_cb);
-               preempt_disable();
-               current->thread.gs_cb = vcpu->arch.host_gscb;
-               restore_gs_cb(vcpu->arch.host_gscb);
-               preempt_enable();
-               if (!vcpu->arch.host_gscb)
-                       __ctl_clear_bit(2, 4);
-               vcpu->arch.host_gscb = NULL;
-       }
-       /* SIE will save etoken directly into SDNX and therefore kvm_run */
+       if (likely(!kvm_s390_pv_cpu_is_protected(vcpu)))
+               store_regs_fmt2(vcpu, kvm_run);
 }
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -4018,6 +4338,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        kvm_sigset_activate(vcpu);
 
+       /*
+        * no need to check the return value of vcpu_start as it can only have
+        * an error for protvirt, but protvirt means user cpu state
+        */
        if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) {
                kvm_s390_vcpu_start(vcpu);
        } else if (is_vcpu_stopped(vcpu)) {
@@ -4155,18 +4479,27 @@ static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
        kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
 }
 
-void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
+int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
 {
-       int i, online_vcpus, started_vcpus = 0;
+       int i, online_vcpus, r = 0, started_vcpus = 0;
 
        if (!is_vcpu_stopped(vcpu))
-               return;
+               return 0;
 
        trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1);
        /* Only one cpu at a time may enter/leave the STOPPED state. */
        spin_lock(&vcpu->kvm->arch.start_stop_lock);
        online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
 
+       /* Let's tell the UV that we want to change into the operating state */
+       if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+               r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR);
+               if (r) {
+                       spin_unlock(&vcpu->kvm->arch.start_stop_lock);
+                       return r;
+               }
+       }
+
        for (i = 0; i < online_vcpus; i++) {
                if (!is_vcpu_stopped(vcpu->kvm->vcpus[i]))
                        started_vcpus++;
@@ -4186,27 +4519,43 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
 
        kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED);
        /*
+        * The real PSW might have changed due to a RESTART interpreted by the
+        * ultravisor. We block all interrupts and let the next sie exit
+        * refresh our view.
+        */
+       if (kvm_s390_pv_cpu_is_protected(vcpu))
+               vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK;
+       /*
         * Another VCPU might have used IBS while we were offline.
         * Let's play safe and flush the VCPU at startup.
         */
        kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
        spin_unlock(&vcpu->kvm->arch.start_stop_lock);
-       return;
+       return 0;
 }
 
-void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
+int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
 {
-       int i, online_vcpus, started_vcpus = 0;
+       int i, online_vcpus, r = 0, started_vcpus = 0;
        struct kvm_vcpu *started_vcpu = NULL;
 
        if (is_vcpu_stopped(vcpu))
-               return;
+               return 0;
 
        trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0);
        /* Only one cpu at a time may enter/leave the STOPPED state. */
        spin_lock(&vcpu->kvm->arch.start_stop_lock);
        online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
 
+       /* Let's tell the UV that we want to change into the stopped state */
+       if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+               r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_STP);
+               if (r) {
+                       spin_unlock(&vcpu->kvm->arch.start_stop_lock);
+                       return r;
+               }
+       }
+
        /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */
        kvm_s390_clear_stop_irq(vcpu);
 
@@ -4229,7 +4578,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
        }
 
        spin_unlock(&vcpu->kvm->arch.start_stop_lock);
-       return;
+       return 0;
 }
 
 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
@@ -4256,12 +4605,40 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
        return r;
 }
 
+/*
+ * Copy mop->size bytes between the userspace buffer mop->buf and the vCPU's
+ * SIDA (the area described by sida_origin()/sida_size() of its SIE block),
+ * starting at mop->sida_offset.
+ *
+ * KVM_S390_MEMOP_SIDA_READ copies from the SIDA to userspace,
+ * KVM_S390_MEMOP_SIDA_WRITE copies from userspace into the SIDA.
+ *
+ * Returns 0 on success, -EINVAL for non-zero flags, a zero size, a wrapping
+ * offset + size, a vCPU that is not protected or an unknown op, -E2BIG if
+ * the range does not fit into the SIDA and -EFAULT if the user copy fails.
+ */
+static long kvm_s390_guest_sida_op(struct kvm_vcpu *vcpu,
+                                  struct kvm_s390_mem_op *mop)
+{
+       void __user *uaddr = (void __user *)mop->buf;
+       void *sida_addr;
+       int r = 0;
+
+       if (mop->flags || !mop->size)
+               return -EINVAL;
+       /* offset + size must not wrap around */
+       if (mop->size + mop->sida_offset < mop->size)
+               return -EINVAL;
+       /*
+        * Only a protected vCPU has a SIDA. Without this check the origin
+        * and size taken from the SIE block of a normal vCPU would be used
+        * as a kernel address range for the user copy below.
+        */
+       if (!kvm_s390_pv_cpu_is_protected(vcpu))
+               return -EINVAL;
+       if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block))
+               return -E2BIG;
+
+       sida_addr = (void *)(sida_origin(vcpu->arch.sie_block) +
+                            mop->sida_offset);
+
+       switch (mop->op) {
+       case KVM_S390_MEMOP_SIDA_READ:
+               if (copy_to_user(uaddr, sida_addr, mop->size))
+                       r = -EFAULT;
+               break;
+       case KVM_S390_MEMOP_SIDA_WRITE:
+               if (copy_from_user(sida_addr, uaddr, mop->size))
+                       r = -EFAULT;
+               break;
+       default:
+               /* not reachable from the current caller; fail instead of 0 */
+               r = -EINVAL;
+       }
+       return r;
+}
 static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
                                  struct kvm_s390_mem_op *mop)
 {
        void __user *uaddr = (void __user *)mop->buf;
        void *tmpbuf = NULL;
-       int r, srcu_idx;
+       int r = 0;
        const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION
                                    | KVM_S390_MEMOP_F_CHECK_ONLY;
 
@@ -4271,14 +4648,15 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
        if (mop->size > MEM_OP_MAX_SIZE)
                return -E2BIG;
 
+       if (kvm_s390_pv_cpu_is_protected(vcpu))
+               return -EINVAL;
+
        if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
                tmpbuf = vmalloc(mop->size);
                if (!tmpbuf)
                        return -ENOMEM;
        }
 
-       srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-
        switch (mop->op) {
        case KVM_S390_MEMOP_LOGICAL_READ:
                if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
@@ -4304,12 +4682,8 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
                }
                r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
                break;
-       default:
-               r = -EINVAL;
        }
 
-       srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
-
        if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0)
                kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
 
@@ -4317,6 +4691,31 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
        return r;
 }
 
+/*
+ * Route a memory operation to the handler for its op type while holding
+ * kvm->srcu for reading: logical read/write go to kvm_s390_guest_mem_op(),
+ * SIDA read/write go to kvm_s390_guest_sida_op(). Any other op yields
+ * -EINVAL.
+ */
+static long kvm_s390_guest_memsida_op(struct kvm_vcpu *vcpu,
+                                     struct kvm_s390_mem_op *mop)
+{
+       int srcu_idx;
+       int r;
+
+       srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+       if (mop->op == KVM_S390_MEMOP_LOGICAL_READ ||
+           mop->op == KVM_S390_MEMOP_LOGICAL_WRITE) {
+               r = kvm_s390_guest_mem_op(vcpu, mop);
+       } else if (mop->op == KVM_S390_MEMOP_SIDA_READ ||
+                  mop->op == KVM_S390_MEMOP_SIDA_WRITE) {
+               /* we are locked against sida going away by the vcpu->mutex */
+               r = kvm_s390_guest_sida_op(vcpu, mop);
+       } else {
+               r = -EINVAL;
+       }
+
+       srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+       return r;
+}
+
 long kvm_arch_vcpu_async_ioctl(struct file *filp,
                               unsigned int ioctl, unsigned long arg)
 {
@@ -4352,6 +4751,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        void __user *argp = (void __user *)arg;
        int idx;
        long r;
+       u16 rc, rrc;
 
        vcpu_load(vcpu);
 
@@ -4373,18 +4773,40 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        case KVM_S390_CLEAR_RESET:
                r = 0;
                kvm_arch_vcpu_ioctl_clear_reset(vcpu);
+               if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+                       r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
+                                         UVC_CMD_CPU_RESET_CLEAR, &rc, &rrc);
+                       VCPU_EVENT(vcpu, 3, "PROTVIRT RESET CLEAR VCPU: rc %x rrc %x",
+                                  rc, rrc);
+               }
                break;
        case KVM_S390_INITIAL_RESET:
                r = 0;
                kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+               if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+                       r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
+                                         UVC_CMD_CPU_RESET_INITIAL,
+                                         &rc, &rrc);
+                       VCPU_EVENT(vcpu, 3, "PROTVIRT RESET INITIAL VCPU: rc %x rrc %x",
+                                  rc, rrc);
+               }
                break;
        case KVM_S390_NORMAL_RESET:
                r = 0;
                kvm_arch_vcpu_ioctl_normal_reset(vcpu);
+               if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+                       r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
+                                         UVC_CMD_CPU_RESET, &rc, &rrc);
+                       VCPU_EVENT(vcpu, 3, "PROTVIRT RESET NORMAL VCPU: rc %x rrc %x",
+                                  rc, rrc);
+               }
                break;
        case KVM_SET_ONE_REG:
        case KVM_GET_ONE_REG: {
                struct kvm_one_reg reg;
+               r = -EINVAL;
+               if (kvm_s390_pv_cpu_is_protected(vcpu))
+                       break;
                r = -EFAULT;
                if (copy_from_user(&reg, argp, sizeof(reg)))
                        break;
@@ -4447,7 +4869,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                struct kvm_s390_mem_op mem_op;
 
                if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0)
-                       r = kvm_s390_guest_mem_op(vcpu, &mem_op);
+                       r = kvm_s390_guest_memsida_op(vcpu, &mem_op);
                else
                        r = -EFAULT;
                break;
@@ -4507,12 +4929,6 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
        return VM_FAULT_SIGBUS;
 }
 
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
-                           unsigned long npages)
-{
-       return 0;
-}
-
 /* Section: memory related */
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot,
@@ -4533,12 +4949,15 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
        if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit)
                return -EINVAL;
 
+       /* When we are protected, we should not change the memory slots */
+       if (kvm_s390_pv_get_handle(kvm))
+               return -EINVAL;
        return 0;
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                                const struct kvm_userspace_memory_region *mem,
-                               const struct kvm_memory_slot *old,
+                               struct kvm_memory_slot *old,
                                const struct kvm_memory_slot *new,
                                enum kvm_mr_change change)
 {
@@ -4554,7 +4973,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                                        old->npages * PAGE_SIZE);
                if (rc)
                        break;
-               /* FALLTHROUGH */
+               fallthrough;
        case KVM_MR_CREATE:
                rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr,
                                      mem->guest_phys_addr, mem->memory_size);
index 6d9448d..79dcd64 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * definition for kvm on s390
  *
- * Copyright IBM Corp. 2008, 2009
+ * Copyright IBM Corp. 2008, 2020
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
@@ -15,6 +15,7 @@
 #include <linux/hrtimer.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
+#include <linux/lockdep.h>
 #include <asm/facility.h>
 #include <asm/processor.h>
 #include <asm/sclp.h>
 #define IS_ITDB_VALID(vcpu)    ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1))
 
 extern debug_info_t *kvm_s390_dbf;
+extern debug_info_t *kvm_s390_dbf_uv;
+
+#define KVM_UV_EVENT(d_kvm, d_loglevel, d_string, d_args...)\
+do { \
+       debug_sprintf_event((d_kvm)->arch.dbf, d_loglevel, d_string "\n", \
+         d_args); \
+       debug_sprintf_event(kvm_s390_dbf_uv, d_loglevel, \
+                           "%d: " d_string "\n", (d_kvm)->userspace_pid, \
+                           d_args); \
+} while (0)
+
 #define KVM_EVENT(d_loglevel, d_string, d_args...)\
 do { \
        debug_sprintf_event(kvm_s390_dbf, d_loglevel, d_string "\n", \
@@ -196,6 +208,39 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
        return kvm->arch.user_cpu_state_ctrl != 0;
 }
 
+/* implemented in pv.c */
+int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
+int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
+int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
+                             u16 *rrc);
+int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
+                      unsigned long tweak, u16 *rc, u16 *rrc);
+int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state);
+
+static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm)
+{
+       return kvm->arch.pv.handle;
+}
+
+static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.pv.handle;
+}
+
+static inline bool kvm_s390_pv_is_protected(struct kvm *kvm)
+{
+       lockdep_assert_held(&kvm->lock);
+       return !!kvm_s390_pv_get_handle(kvm);
+}
+
+static inline bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
+{
+       lockdep_assert_held(&vcpu->mutex);
+       return !!kvm_s390_pv_cpu_get_handle(vcpu);
+}
+
 /* implemented in interrupt.c */
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
@@ -286,8 +331,8 @@ void kvm_s390_set_tod_clock(struct kvm *kvm,
 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
 int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
-void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
-void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
+int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
+int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu);
 bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu);
index ed52ffa..69a824f 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * handling privileged instructions
  *
- * Copyright IBM Corp. 2008, 2018
+ * Copyright IBM Corp. 2008, 2020
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
@@ -872,7 +872,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
 
        operand2 = kvm_s390_get_base_disp_s(vcpu, &ar);
 
-       if (operand2 & 0xfff)
+       if (!kvm_s390_pv_cpu_is_protected(vcpu) && (operand2 & 0xfff))
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
        switch (fc) {
@@ -893,8 +893,13 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
                handle_stsi_3_2_2(vcpu, (void *) mem);
                break;
        }
-
-       rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE);
+       if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+               memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem,
+                      PAGE_SIZE);
+               rc = 0;
+       } else {
+               rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE);
+       }
        if (rc) {
                rc = kvm_s390_inject_prog_cond(vcpu, rc);
                goto out;
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
new file mode 100644 (file)
index 0000000..63e3301
--- /dev/null
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hosting Protected Virtual Machines
+ *
+ * Copyright IBM Corp. 2019, 2020
+ *    Author(s): Janosch Frank <frankja@linux.ibm.com>
+ */
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/pagemap.h>
+#include <linux/sched/signal.h>
+#include <asm/pgalloc.h>
+#include <asm/gmap.h>
+#include <asm/uv.h>
+#include <asm/mman.h>
+#include "kvm-s390.h"
+
+/* Destroy the vcpu's secure CPU via the UV; returns 0, or -EIO if the UV call failed. */
+int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
+{
+       int cc = 0;
+
+       if (kvm_s390_pv_cpu_get_handle(vcpu)) {
+               cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
+                                  UVC_CMD_DESTROY_SEC_CPU, rc, rrc);
+               KVM_UV_EVENT(vcpu->kvm, 3,
+                            "PROTVIRT DESTROY VCPU %d: rc %x rrc %x",
+                            vcpu->vcpu_id, *rc, *rrc);
+               WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x",
+                         *rc, *rrc);
+       }
+       /* Intended leak: stor_base is only freed when the destroy did not fail. */
+       if (!cc)
+               free_pages(vcpu->arch.pv.stor_base,
+                          get_order(uv_info.guest_cpu_stor_len));
+
+       free_page(sida_origin(vcpu->arch.sie_block));
+       vcpu->arch.sie_block->pv_handle_cpu = 0;
+       vcpu->arch.sie_block->pv_handle_config = 0;
+       memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv));
+       vcpu->arch.sie_block->sdf = 0;
+       /*
+        * The sidad field (for sdf == 2) is now the gbea field (for sdf == 0).
+        * Use the reset value of gbea to avoid leaking the kernel pointer of
+        * the just freed sida.
+        */
+       vcpu->arch.sie_block->gbea = 1;
+       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+       /* Report failure as a negative errno, like kvm_s390_pv_deinit_vm(). */
+       return cc ? -EIO : 0;
+}
+
+int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
+{
+       struct uv_cb_csc uvcb = {
+               .header.cmd = UVC_CMD_CREATE_SEC_CPU,
+               .header.len = sizeof(uvcb),
+       };
+       int cc;
+
+       if (kvm_s390_pv_cpu_get_handle(vcpu))
+               return -EINVAL;
+
+       vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL,
+                                                  get_order(uv_info.guest_cpu_stor_len));
+       if (!vcpu->arch.pv.stor_base)
+               return -ENOMEM;
+
+       /* Input */
+       uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm);
+       uvcb.num = vcpu->arch.sie_block->icpua;
+       uvcb.state_origin = (u64)vcpu->arch.sie_block;
+       uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base;
+
+       /* Alloc Secure Instruction Data Area Designation */
+       vcpu->arch.sie_block->sidad = __get_free_page(GFP_KERNEL | __GFP_ZERO);
+       if (!vcpu->arch.sie_block->sidad) {
+               free_pages(vcpu->arch.pv.stor_base,
+                          get_order(uv_info.guest_cpu_stor_len));
+               return -ENOMEM;
+       }
+
+       cc = uv_call(0, (u64)&uvcb);
+       *rc = uvcb.header.rc;
+       *rrc = uvcb.header.rrc;
+       KVM_UV_EVENT(vcpu->kvm, 3,
+                    "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x",
+                    vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc,
+                    uvcb.header.rrc);
+
+       if (cc) {
+               u16 dummy;
+
+               kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy);
+               return -EIO;
+       }
+
+       /* Output */
+       vcpu->arch.pv.handle = uvcb.cpu_handle;
+       vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle;
+       vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm);
+       vcpu->arch.sie_block->sdf = 2;
+       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+       return 0;
+}
+
+/* only free resources when the destroy was successful */
+static void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
+{
+       vfree(kvm->arch.pv.stor_var);
+       free_pages(kvm->arch.pv.stor_base,
+                  get_order(uv_info.guest_base_stor_len));
+       memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv));
+}
+
+static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
+{
+       unsigned long base = uv_info.guest_base_stor_len;
+       unsigned long virt = uv_info.guest_virt_var_stor_len;
+       unsigned long npages = 0, vlen = 0;
+       struct kvm_memory_slot *memslot;
+
+       kvm->arch.pv.stor_var = NULL;
+       kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base));
+       if (!kvm->arch.pv.stor_base)
+               return -ENOMEM;
+
+       /*
+        * Calculate current guest storage for allocation of the
+        * variable storage, which is based on the length in MB.
+        *
+        * Slots are sorted by GFN
+        */
+       mutex_lock(&kvm->slots_lock);
+       memslot = kvm_memslots(kvm)->memslots;
+       npages = memslot->base_gfn + memslot->npages;
+       mutex_unlock(&kvm->slots_lock);
+
+       kvm->arch.pv.guest_len = npages * PAGE_SIZE;
+
+       /* Allocate variable storage */
+       vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
+       vlen += uv_info.guest_virt_base_stor_len;
+       kvm->arch.pv.stor_var = vzalloc(vlen);
+       if (!kvm->arch.pv.stor_var)
+               goto out_err;
+       return 0;
+
+out_err:
+       kvm_s390_pv_dealloc_vm(kvm);
+       return -ENOMEM;
+}
+
+/* this should not fail, but if it does, we must not free the donated memory */
+int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+       int cc;
+
+       /* make all pages accessible before destroying the guest */
+       s390_reset_acc(kvm->mm);
+
+       cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+                          UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
+       WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
+       atomic_set(&kvm->mm->context.is_protected, 0);
+       KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
+       WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
+       /* Intended memory leak on "impossible" error */
+       if (!cc)
+               kvm_s390_pv_dealloc_vm(kvm);
+       return cc ? -EIO : 0;
+}
+
+int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+       struct uv_cb_cgc uvcb = {
+               .header.cmd = UVC_CMD_CREATE_SEC_CONF,
+               .header.len = sizeof(uvcb)
+       };
+       int cc, ret;
+       u16 dummy;
+
+       ret = kvm_s390_pv_alloc_vm(kvm);
+       if (ret)
+               return ret;
+
+       /* Inputs */
+       uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
+       uvcb.guest_stor_len = kvm->arch.pv.guest_len;
+       uvcb.guest_asce = kvm->arch.gmap->asce;
+       uvcb.guest_sca = (unsigned long)kvm->arch.sca;
+       uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
+       uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
+
+       cc = uv_call(0, (u64)&uvcb);
+       *rc = uvcb.header.rc;
+       *rrc = uvcb.header.rrc;
+       KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
+                    uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
+
+       /* Outputs */
+       kvm->arch.pv.handle = uvcb.guest_handle;
+
+       if (cc) {
+               if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
+                       kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
+               else
+                       kvm_s390_pv_dealloc_vm(kvm);
+               return -EIO;
+       }
+       kvm->arch.gmap->guest_handle = uvcb.guest_handle;
+       atomic_set(&kvm->mm->context.is_protected, 1);
+       return 0;
+}
+
+int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
+                             u16 *rrc)
+{
+       struct uv_cb_ssc uvcb = {
+               .header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS,
+               .header.len = sizeof(uvcb),
+               .sec_header_origin = (u64)hdr,
+               .sec_header_len = length,
+               .guest_handle = kvm_s390_pv_get_handle(kvm),
+       };
+       int cc = uv_call(0, (u64)&uvcb);
+
+       *rc = uvcb.header.rc;
+       *rrc = uvcb.header.rrc;
+       KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x",
+                    *rc, *rrc);
+       return cc ? -EINVAL : 0;
+}
+
+static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
+                     u64 offset, u16 *rc, u16 *rrc)
+{
+       struct uv_cb_unp uvcb = {
+               .header.cmd = UVC_CMD_UNPACK_IMG,
+               .header.len = sizeof(uvcb),
+               .guest_handle = kvm_s390_pv_get_handle(kvm),
+               .gaddr = addr,
+               .tweak[0] = tweak,
+               .tweak[1] = offset,
+       };
+       int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb);
+
+       *rc = uvcb.header.rc;
+       *rrc = uvcb.header.rrc;
+
+       if (ret && ret != -EAGAIN)
+               KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x",
+                            uvcb.gaddr, *rc, *rrc);
+       return ret;
+}
+
+int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
+                      unsigned long tweak, u16 *rc, u16 *rrc)
+{
+       u64 offset = 0;
+       int ret = 0;
+
+       if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK)
+               return -EINVAL;
+
+       KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx",
+                    addr, size);
+
+       while (offset < size) {
+               ret = unpack_one(kvm, addr, tweak, offset, rc, rrc);
+               if (ret == -EAGAIN) {
+                       cond_resched();
+                       if (fatal_signal_pending(current))
+                               break;
+                       continue;
+               }
+               if (ret)
+                       break;
+               addr += PAGE_SIZE;
+               offset += PAGE_SIZE;
+       }
+       if (!ret)
+               KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful");
+       return ret;
+}
+
+int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state)
+{
+       struct uv_cb_cpu_set_state uvcb = {
+               .header.cmd     = UVC_CMD_CPU_SET_STATE,
+               .header.len     = sizeof(uvcb),
+               .cpu_handle     = kvm_s390_pv_cpu_get_handle(vcpu),
+               .state          = state,
+       };
+       int cc;
+
+       cc = uv_call(0, (u64)&uvcb);
+       KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT SET CPU %d STATE %d rc %x rrc %x",
+                    vcpu->vcpu_id, state, uvcb.header.rc, uvcb.header.rrc);
+       if (cc)
+               return -EINVAL;
+       return 0;
+}
index 7b0bb47..7bd86eb 100644 (file)
@@ -38,6 +38,7 @@
 #include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/facility.h>
+#include <asm/uv.h>
 #include "../kernel/entry.h"
 
 #define __FAIL_ADDR_MASK -4096L
@@ -816,3 +817,80 @@ out_extint:
 early_initcall(pfault_irq_init);
 
 #endif /* CONFIG_PFAULT */
+
+#if IS_ENABLED(CONFIG_PGSTE)
+void do_secure_storage_access(struct pt_regs *regs)
+{
+       unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
+       struct vm_area_struct *vma;
+       struct mm_struct *mm;
+       struct page *page;
+       int rc;
+
+       switch (get_fault_type(regs)) {
+       case USER_FAULT:
+               mm = current->mm;
+               down_read(&mm->mmap_sem);
+               vma = find_vma(mm, addr);
+               if (!vma) {
+                       up_read(&mm->mmap_sem);
+                       do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+                       break;
+               }
+               page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
+               if (IS_ERR_OR_NULL(page)) {
+                       up_read(&mm->mmap_sem);
+                       break;
+               }
+               if (arch_make_page_accessible(page))
+                       send_sig(SIGSEGV, current, 0);
+               put_page(page);
+               up_read(&mm->mmap_sem);
+               break;
+       case KERNEL_FAULT:
+               page = phys_to_page(addr);
+               if (unlikely(!try_get_page(page)))
+                       break;
+               rc = arch_make_page_accessible(page);
+               put_page(page);
+               if (rc)
+                       BUG();
+               break;
+       case VDSO_FAULT:
+               /* fallthrough */
+       case GMAP_FAULT:
+               /* fallthrough */
+       default:
+               do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+               WARN_ON_ONCE(1);
+       }
+}
+NOKPROBE_SYMBOL(do_secure_storage_access);
+
+void do_non_secure_storage_access(struct pt_regs *regs)
+{
+       unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+       struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+
+       if (get_fault_type(regs) != GMAP_FAULT) {
+               do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+               WARN_ON_ONCE(1);
+               return;
+       }
+
+       if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
+               send_sig(SIGSEGV, current, 0);
+}
+NOKPROBE_SYMBOL(do_non_secure_storage_access);
+
+#else
+void do_secure_storage_access(struct pt_regs *regs)
+{
+       default_trap_handler(regs);
+}
+
+void do_non_secure_storage_access(struct pt_regs *regs)
+{
+       default_trap_handler(regs);
+}
+#endif
index edcdca9..2fbece4 100644 (file)
@@ -804,7 +804,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-               /* Fallthrough */
+               fallthrough;
        case _ASCE_TYPE_REGION2:
                table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
                if (level == 3)
@@ -812,7 +812,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-               /* Fallthrough */
+               fallthrough;
        case _ASCE_TYPE_REGION3:
                table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
                if (level == 2)
@@ -820,7 +820,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-               /* Fallthrough */
+               fallthrough;
        case _ASCE_TYPE_SEGMENT:
                table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
                if (level == 1)
@@ -2548,6 +2548,23 @@ int s390_enable_sie(void)
 }
 EXPORT_SYMBOL_GPL(s390_enable_sie);
 
+int gmap_mark_unmergeable(void)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       int ret;
+
+       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+               ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+                                 MADV_UNMERGEABLE, &vma->vm_flags);
+               if (ret)
+                       return ret;
+       }
+       mm->def_flags &= ~VM_MERGEABLE;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
+
 /*
  * Enable storage key handling from now on and initialize the storage
  * keys with the default key.
@@ -2593,7 +2610,6 @@ static const struct mm_walk_ops enable_skey_walk_ops = {
 int s390_enable_skey(void)
 {
        struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
        int rc = 0;
 
        down_write(&mm->mmap_sem);
@@ -2601,16 +2617,11 @@ int s390_enable_skey(void)
                goto out_up;
 
        mm->context.uses_skeys = 1;
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
-                               MADV_UNMERGEABLE, &vma->vm_flags)) {
-                       mm->context.uses_skeys = 0;
-                       rc = -ENOMEM;
-                       goto out_up;
-               }
+       rc = gmap_mark_unmergeable();
+       if (rc) {
+               mm->context.uses_skeys = 0;
+               goto out_up;
        }
-       mm->def_flags &= ~VM_MERGEABLE;
-
        walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
 
 out_up:
@@ -2640,3 +2651,38 @@ void s390_reset_cmma(struct mm_struct *mm)
        up_write(&mm->mmap_sem);
 }
 EXPORT_SYMBOL_GPL(s390_reset_cmma);
+
+/*
+ * make inaccessible pages accessible again
+ */
+static int __s390_reset_acc(pte_t *ptep, unsigned long addr,
+                           unsigned long next, struct mm_walk *walk)
+{
+       pte_t pte = READ_ONCE(*ptep);
+
+       if (pte_present(pte))
+               WARN_ON_ONCE(uv_convert_from_secure(pte_val(pte) & PAGE_MASK));
+       return 0;
+}
+
+static const struct mm_walk_ops reset_acc_walk_ops = {
+       .pte_entry              = __s390_reset_acc,
+};
+
+#include <linux/sched/mm.h>
+void s390_reset_acc(struct mm_struct *mm)
+{
+       /*
+        * we might be called during
+        * reset:                             we walk the pages and clear
+        * close of all kvm file descriptors: we walk the pages and clear
+        * exit of process on fd closure:     vma already gone, do nothing
+        */
+       if (!mmget_not_zero(mm))
+               return;
+       down_read(&mm->mmap_sem);
+       walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL);
+       up_read(&mm->mmap_sem);
+       mmput(mm);
+}
+EXPORT_SYMBOL_GPL(s390_reset_acc);
index 98959e8..9a183e9 100644 (file)
 
 #define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
 
+#define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
+                                       KVM_DIRTY_LOG_INITIALLY_SET)
+
 /* x86-specific vcpu->requests bit members */
 #define KVM_REQ_MIGRATE_TIMER          KVM_ARCH_REQ(0)
 #define KVM_REQ_REPORT_TPR_ACCESS      KVM_ARCH_REQ(1)
 #define KVM_REQ_TRIPLE_FAULT           KVM_ARCH_REQ(2)
 #define KVM_REQ_MMU_SYNC               KVM_ARCH_REQ(3)
 #define KVM_REQ_CLOCK_UPDATE           KVM_ARCH_REQ(4)
-#define KVM_REQ_LOAD_CR3               KVM_ARCH_REQ(5)
+#define KVM_REQ_LOAD_MMU_PGD           KVM_ARCH_REQ(5)
 #define KVM_REQ_EVENT                  KVM_ARCH_REQ(6)
 #define KVM_REQ_APF_HALT               KVM_ARCH_REQ(7)
 #define KVM_REQ_STEAL_UPDATE           KVM_ARCH_REQ(8)
@@ -182,7 +185,10 @@ enum exit_fastpath_completion {
        EXIT_FASTPATH_SKIP_EMUL_INS,
 };
 
-#include <asm/kvm_emulate.h>
+struct x86_emulate_ctxt;
+struct x86_exception;
+enum x86_intercept;
+enum x86_intercept_stage;
 
 #define KVM_NR_MEM_OBJS 40
 
@@ -297,7 +303,6 @@ union kvm_mmu_extended_role {
                unsigned int cr4_pke:1;
                unsigned int cr4_smap:1;
                unsigned int cr4_smep:1;
-               unsigned int cr4_la57:1;
                unsigned int maxphyaddr:6;
        };
 };
@@ -382,8 +387,7 @@ struct kvm_mmu_root_info {
  * current mmu mode.
  */
 struct kvm_mmu {
-       void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
-       unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
+       unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu);
        u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
        int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err,
                          bool prefault);
@@ -678,7 +682,7 @@ struct kvm_vcpu_arch {
 
        /* emulate context */
 
-       struct x86_emulate_ctxt emulate_ctxt;
+       struct x86_emulate_ctxt *emulate_ctxt;
        bool emulate_regs_need_sync_to_vcpu;
        bool emulate_regs_need_sync_from_vcpu;
        int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
@@ -808,10 +812,6 @@ struct kvm_vcpu_arch {
        int pending_ioapic_eoi;
        int pending_external_vector;
 
-       /* GPA available */
-       bool gpa_available;
-       gpa_t gpa_val;
-
        /* be preempted when it's in kernel-mode(cpl=0) */
        bool preempted_in_kernel;
 
@@ -890,6 +890,7 @@ enum kvm_irqchip_mode {
 #define APICV_INHIBIT_REASON_NESTED     2
 #define APICV_INHIBIT_REASON_IRQWIN     3
 #define APICV_INHIBIT_REASON_PIT_REINJ  4
+#define APICV_INHIBIT_REASON_X2APIC    5
 
 struct kvm_arch {
        unsigned long n_used_mmu_pages;
@@ -920,6 +921,7 @@ struct kvm_arch {
        atomic_t vapics_in_nmi_mode;
        struct mutex apic_map_lock;
        struct kvm_apic_map *apic_map;
+       bool apic_map_dirty;
 
        bool apic_access_page_done;
        unsigned long apicv_inhibit_reasons;
@@ -1063,8 +1065,7 @@ struct kvm_x86_ops {
        bool (*has_emulated_msr)(int index);
        void (*cpuid_update)(struct kvm_vcpu *vcpu);
 
-       struct kvm *(*vm_alloc)(void);
-       void (*vm_free)(struct kvm *);
+       unsigned int vm_size;
        int (*vm_init)(struct kvm *kvm);
        void (*vm_destroy)(struct kvm *kvm);
 
@@ -1090,7 +1091,6 @@ struct kvm_x86_ops {
        void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
        void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
        void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
-       void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
        int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
        void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
        void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
@@ -1153,13 +1153,8 @@ struct kvm_x86_ops {
        int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
        int (*get_tdp_level)(struct kvm_vcpu *vcpu);
        u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
-       int (*get_lpage_level)(void);
-       bool (*rdtscp_supported)(void);
-       bool (*invpcid_supported)(void);
-
-       void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
-       void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+       void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
        bool (*has_wbinvd_exit)(void);
 
@@ -1171,16 +1166,12 @@ struct kvm_x86_ops {
 
        int (*check_intercept)(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
-                              enum x86_intercept_stage stage);
+                              enum x86_intercept_stage stage,
+                              struct x86_exception *exception);
        void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu,
                enum exit_fastpath_completion *exit_fastpath);
-       bool (*mpx_supported)(void);
-       bool (*xsaves_supported)(void);
-       bool (*umip_emulated)(void);
-       bool (*pt_supported)(void);
-       bool (*pku_supported)(void);
 
-       int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
+       int (*check_nested_events)(struct kvm_vcpu *vcpu);
        void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
 
        void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
@@ -1276,19 +1267,18 @@ struct kvm_arch_async_pf {
        bool direct_map;
 };
 
+extern u64 __read_mostly host_efer;
+
 extern struct kvm_x86_ops *kvm_x86_ops;
 extern struct kmem_cache *x86_fpu_cache;
 
 #define __KVM_HAVE_ARCH_VM_ALLOC
 static inline struct kvm *kvm_arch_alloc_vm(void)
 {
-       return kvm_x86_ops->vm_alloc();
-}
-
-static inline void kvm_arch_free_vm(struct kvm *kvm)
-{
-       return kvm_x86_ops->vm_free(kvm);
+       return __vmalloc(kvm_x86_ops->vm_size,
+                        GFP_KERNEL_ACCOUNT | __GFP_ZERO, PAGE_KERNEL);
 }
+void kvm_arch_free_vm(struct kvm *kvm);
 
 #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
 static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
@@ -1313,7 +1303,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
-                                     struct kvm_memory_slot *memslot);
+                                     struct kvm_memory_slot *memslot,
+                                     int start_level);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
@@ -1381,8 +1372,9 @@ extern u64 kvm_mce_cap_supported;
  *                decode the instruction length.  For use *only* by
  *                kvm_x86_ops->skip_emulated_instruction() implementations.
  *
- * EMULTYPE_ALLOW_RETRY - Set when the emulator should resume the guest to
- *                       retry native execution under certain conditions.
+ * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to
+ *                          retry native execution under certain conditions.
+ *                          Can only be set in conjunction with EMULTYPE_PF.
  *
  * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was
  *                          triggered by KVM's magic "force emulation" prefix,
@@ -1395,13 +1387,18 @@ extern u64 kvm_mce_cap_supported;
  *                     backdoor emulation, which is opt in via module param.
  *                     VMware backoor emulation handles select instructions
  *                     and reinjects the #GP for all other cases.
+ *
+ * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
+ *              case the CR2/GPA value passed on the stack is valid.
  */
 #define EMULTYPE_NO_DECODE         (1 << 0)
 #define EMULTYPE_TRAP_UD           (1 << 1)
 #define EMULTYPE_SKIP              (1 << 2)
-#define EMULTYPE_ALLOW_RETRY       (1 << 3)
+#define EMULTYPE_ALLOW_RETRY_PF            (1 << 3)
 #define EMULTYPE_TRAP_UD_FORCED            (1 << 4)
 #define EMULTYPE_VMWARE_GP         (1 << 5)
+#define EMULTYPE_PF                (1 << 6)
+
 int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
 int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
                                        void *insn, int insn_len);
@@ -1414,8 +1411,6 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
 int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
 
-struct x86_emulate_ctxt;
-
 int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
 int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
@@ -1512,8 +1507,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush);
 
-void kvm_enable_tdp(void);
-void kvm_disable_tdp(void);
+void kvm_configure_mmu(bool enable_tdp, int tdp_page_level);
 
 static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
                                  struct x86_exception *exception)
index 172f974..87bd602 100644 (file)
@@ -49,8 +49,7 @@ struct kvm_page_track_notifier_node {
 void kvm_page_track_init(struct kvm *kvm);
 void kvm_page_track_cleanup(struct kvm *kvm);
 
-void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
-                                struct kvm_memory_slot *dont);
+void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
 int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
                                  unsigned long npages);
 
index 8521af3..5e090d1 100644 (file)
@@ -500,6 +500,18 @@ enum vmcs_field {
                                                 VMX_EPT_EXECUTABLE_MASK)
 #define VMX_EPT_MT_MASK                                (7ull << VMX_EPT_MT_EPTE_SHIFT)
 
+static inline u8 vmx_eptp_page_walk_level(u64 eptp)
+{
+       u64 encoded_level = eptp & VMX_EPTP_PWL_MASK;
+
+       if (encoded_level == VMX_EPTP_PWL_5)
+               return 5;
+
+       /* @eptp must be pre-validated by the caller. */
+       WARN_ON_ONCE(encoded_level != VMX_EPTP_PWL_4);
+       return 4;
+}
+
 /* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
 #define VMX_EPT_MISCONFIG_WX_VALUE             (VMX_EPT_WRITABLE_MASK |       \
                                                 VMX_EPT_EXECUTABLE_MASK)
index b1c4694..60ae93b 100644 (file)
 #include "trace.h"
 #include "pmu.h"
 
+/*
+ * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
+ * aligned to sizeof(unsigned long) because it's not accessed via bitops.
+ */
+u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_cpu_caps);
+
 static u32 xstate_required_size(u64 xstate_bv, bool compacted)
 {
        int feature_bit = 0;
@@ -45,23 +52,6 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
        return ret;
 }
 
-bool kvm_mpx_supported(void)
-{
-       return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
-                && kvm_x86_ops->mpx_supported());
-}
-EXPORT_SYMBOL_GPL(kvm_mpx_supported);
-
-u64 kvm_supported_xcr0(void)
-{
-       u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
-
-       if (!kvm_mpx_supported())
-               xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
-
-       return xcr0;
-}
-
 #define F feature_bit
 
 int kvm_update_cpuid(struct kvm_vcpu *vcpu)
@@ -74,32 +64,24 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
                return 0;
 
        /* Update OSXSAVE bit */
-       if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) {
-               best->ecx &= ~F(OSXSAVE);
-               if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
-                       best->ecx |= F(OSXSAVE);
-       }
+       if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1)
+               cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
+                                  kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE));
 
-       best->edx &= ~F(APIC);
-       if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)
-               best->edx |= F(APIC);
+       cpuid_entry_change(best, X86_FEATURE_APIC,
+                          vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
 
        if (apic) {
-               if (best->ecx & F(TSC_DEADLINE_TIMER))
+               if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
                        apic->lapic_timer.timer_mode_mask = 3 << 17;
                else
                        apic->lapic_timer.timer_mode_mask = 1 << 17;
        }
 
        best = kvm_find_cpuid_entry(vcpu, 7, 0);
-       if (best) {
-               /* Update OSPKE bit */
-               if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) {
-                       best->ecx &= ~F(OSPKE);
-                       if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE))
-                               best->ecx |= F(OSPKE);
-               }
-       }
+       if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
+               cpuid_entry_change(best, X86_FEATURE_OSPKE,
+                                  kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
 
        best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
        if (!best) {
@@ -107,14 +89,14 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
                vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
        } else {
                vcpu->arch.guest_supported_xcr0 =
-                       (best->eax | ((u64)best->edx << 32)) &
-                       kvm_supported_xcr0();
+                       (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
                vcpu->arch.guest_xstate_size = best->ebx =
                        xstate_required_size(vcpu->arch.xcr0, false);
        }
 
        best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
-       if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
+       if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
+                    cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
                best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
        /*
@@ -136,12 +118,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 
        if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
                best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
-               if (best) {
-                       if (vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT)
-                               best->ecx |= F(MWAIT);
-                       else
-                               best->ecx &= ~F(MWAIT);
-               }
+               if (best)
+                       cpuid_entry_change(best, X86_FEATURE_MWAIT,
+                                          vcpu->arch.ia32_misc_enable_msr &
+                                          MSR_IA32_MISC_ENABLE_MWAIT);
        }
 
        /* Update physical-address width */
@@ -154,10 +134,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 
 static int is_efer_nx(void)
 {
-       unsigned long long efer = 0;
-
-       rdmsrl_safe(MSR_EFER, &efer);
-       return efer & EFER_NX;
+       return host_efer & EFER_NX;
 }
 
 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
@@ -173,8 +150,8 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
                        break;
                }
        }
-       if (entry && (entry->edx & F(NX)) && !is_efer_nx()) {
-               entry->edx &= ~F(NX);
+       if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) {
+               cpuid_entry_clear(entry, X86_FEATURE_NX);
                printk(KERN_INFO "kvm: guest NX capability removed\n");
        }
 }
@@ -281,15 +258,189 @@ out:
        return r;
 }
 
-static __always_inline void cpuid_mask(u32 *word, int wordnum)
+static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
 {
-       reverse_cpuid_check(wordnum);
-       *word &= boot_cpu_data.x86_capability[wordnum];
+       const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
+       struct kvm_cpuid_entry2 entry;
+
+       reverse_cpuid_check(leaf);
+       kvm_cpu_caps[leaf] &= mask;
+
+       cpuid_count(cpuid.function, cpuid.index,
+                   &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
+
+       kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, &cpuid);
+}
+
+void kvm_set_cpu_caps(void)
+{
+       unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
+#ifdef CONFIG_X86_64
+       unsigned int f_gbpages = F(GBPAGES);
+       unsigned int f_lm = F(LM);
+#else
+       unsigned int f_gbpages = 0;
+       unsigned int f_lm = 0;
+#endif
+
+       BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
+                    sizeof(boot_cpu_data.x86_capability));
+
+       memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
+              sizeof(kvm_cpu_caps));
+
+       kvm_cpu_cap_mask(CPUID_1_ECX,
+               /*
+                * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not*
+                * advertised to guests via CPUID!
+                */
+               F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+               0 /* DS-CPL, VMX, SMX, EST */ |
+               0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+               F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
+               F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
+               F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
+               0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+               F(F16C) | F(RDRAND)
+       );
+       /* KVM emulates x2apic in software irrespective of host support. */
+       kvm_cpu_cap_set(X86_FEATURE_X2APIC);
+
+       kvm_cpu_cap_mask(CPUID_1_EDX,
+               F(FPU) | F(VME) | F(DE) | F(PSE) |
+               F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+               F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+               F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+               F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
+               0 /* Reserved, DS, ACPI */ | F(MMX) |
+               F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+               0 /* HTT, TM, Reserved, PBE */
+       );
+
+       kvm_cpu_cap_mask(CPUID_7_0_EBX,
+               F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+               F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
+               F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
+               F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+               F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
+       );
+
+       kvm_cpu_cap_mask(CPUID_7_ECX,
+               F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) |
+               F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
+               F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
+               F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
+       );
+       /* Set LA57 based on hardware capability. */
+       if (cpuid_ecx(7) & F(LA57))
+               kvm_cpu_cap_set(X86_FEATURE_LA57);
+
+       kvm_cpu_cap_mask(CPUID_7_EDX,
+               F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+               F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
+               F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM)
+       );
+
+       /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
+       kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST);
+       kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES);
+
+       if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS))
+               kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL);
+       if (boot_cpu_has(X86_FEATURE_STIBP))
+               kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
+       if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
+               kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
+
+       kvm_cpu_cap_mask(CPUID_7_1_EAX,
+               F(AVX512_BF16)
+       );
+
+       kvm_cpu_cap_mask(CPUID_D_1_EAX,
+               F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
+       );
+
+       kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
+               F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
+               F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+               F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
+               0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
+               F(TOPOEXT) | F(PERFCTR_CORE)
+       );
+
+       kvm_cpu_cap_mask(CPUID_8000_0001_EDX,
+               F(FPU) | F(VME) | F(DE) | F(PSE) |
+               F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+               F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+               F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+               F(PAT) | F(PSE36) | 0 /* Reserved */ |
+               f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+               F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) |
+               0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW)
+       );
+
+       if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
+               kvm_cpu_cap_set(X86_FEATURE_GBPAGES);
+
+       kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
+               F(CLZERO) | F(XSAVEERPTR) |
+               F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
+               F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON)
+       );
+
+       /*
+        * AMD has separate bits for each SPEC_CTRL bit.
+        * arch/x86/kernel/cpu/bugs.c is kind enough to
+        * record that in cpufeatures so use them.
+        */
+       if (boot_cpu_has(X86_FEATURE_IBPB))
+               kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB);
+       if (boot_cpu_has(X86_FEATURE_IBRS))
+               kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS);
+       if (boot_cpu_has(X86_FEATURE_STIBP))
+               kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP);
+       if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
+               kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD);
+       if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+               kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO);
+       /*
+        * The preference is to use SPEC CTRL MSR instead of the
+        * VIRT_SPEC MSR.
+        */
+       if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+           !boot_cpu_has(X86_FEATURE_AMD_SSBD))
+               kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+       /*
+        * Hide all SVM features by default, SVM will set the cap bits for
+        * features it emulates and/or exposes for L1.
+        */
+       kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
+
+       kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
+               F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
+               F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
+               F(PMM) | F(PMM_EN)
+       );
 }
+EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
 
-static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function,
-                          u32 index)
+struct kvm_cpuid_array {
+       struct kvm_cpuid_entry2 *entries;
+       const int maxnent;
+       int nent;
+};
+
+static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
+                                             u32 function, u32 index)
 {
+       struct kvm_cpuid_entry2 *entry;
+
+       if (array->nent >= array->maxnent)
+               return NULL;
+
+       entry = &array->entries[array->nent++];
+
        entry->function = function;
        entry->index = index;
        entry->flags = 0;
@@ -298,9 +449,6 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function,
                    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
 
        switch (function) {
-       case 2:
-               entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
-               break;
        case 4:
        case 7:
        case 0xb:
@@ -316,11 +464,18 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                break;
        }
+
+       return entry;
 }
 
-static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry,
-                                   u32 func, int *nent, int maxnent)
+static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
 {
+       struct kvm_cpuid_entry2 *entry;
+
+       if (array->nent >= array->maxnent)
+               return -E2BIG;
+
+       entry = &array->entries[array->nent];
        entry->function = func;
        entry->index = 0;
        entry->flags = 0;
@@ -328,17 +483,17 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry,
        switch (func) {
        case 0:
                entry->eax = 7;
-               ++*nent;
+               ++array->nent;
                break;
        case 1:
                entry->ecx = F(MOVBE);
-               ++*nent;
+               ++array->nent;
                break;
        case 7:
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                entry->eax = 0;
                entry->ecx = F(RDPID);
-               ++*nent;
+               ++array->nent;
        default:
                break;
        }
@@ -346,223 +501,60 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry,
        return 0;
 }
 
-static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
+static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 {
-       unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
-       unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
-       unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
-       unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
-       unsigned f_la57;
-       unsigned f_pku = kvm_x86_ops->pku_supported() ? F(PKU) : 0;
-
-       /* cpuid 7.0.ebx */
-       const u32 kvm_cpuid_7_0_ebx_x86_features =
-               F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
-               F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
-               F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
-               F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
-               F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt;
-
-       /* cpuid 7.0.ecx*/
-       const u32 kvm_cpuid_7_0_ecx_x86_features =
-               F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) |
-               F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
-               F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-               F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
-
-       /* cpuid 7.0.edx*/
-       const u32 kvm_cpuid_7_0_edx_x86_features =
-               F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
-               F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
-               F(MD_CLEAR);
-
-       /* cpuid 7.1.eax */
-       const u32 kvm_cpuid_7_1_eax_x86_features =
-               F(AVX512_BF16);
-
-       switch (index) {
-       case 0:
-               entry->eax = min(entry->eax, 1u);
-               entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
-               cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
-               /* TSC_ADJUST is emulated */
-               entry->ebx |= F(TSC_ADJUST);
-
-               entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
-               f_la57 = entry->ecx & F(LA57);
-               cpuid_mask(&entry->ecx, CPUID_7_ECX);
-               /* Set LA57 based on hardware capability. */
-               entry->ecx |= f_la57;
-               entry->ecx |= f_umip;
-               entry->ecx |= f_pku;
-               /* PKU is not yet implemented for shadow paging. */
-               if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
-                       entry->ecx &= ~F(PKU);
-
-               entry->edx &= kvm_cpuid_7_0_edx_x86_features;
-               cpuid_mask(&entry->edx, CPUID_7_EDX);
-               if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS))
-                       entry->edx |= F(SPEC_CTRL);
-               if (boot_cpu_has(X86_FEATURE_STIBP))
-                       entry->edx |= F(INTEL_STIBP);
-               if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
-                   boot_cpu_has(X86_FEATURE_AMD_SSBD))
-                       entry->edx |= F(SPEC_CTRL_SSBD);
-               /*
-                * We emulate ARCH_CAPABILITIES in software even
-                * if the host doesn't support it.
-                */
-               entry->edx |= F(ARCH_CAPABILITIES);
-               break;
-       case 1:
-               entry->eax &= kvm_cpuid_7_1_eax_x86_features;
-               entry->ebx = 0;
-               entry->ecx = 0;
-               entry->edx = 0;
-               break;
-       default:
-               WARN_ON_ONCE(1);
-               entry->eax = 0;
-               entry->ebx = 0;
-               entry->ecx = 0;
-               entry->edx = 0;
-               break;
-       }
-}
-
-static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
-                                 int *nent, int maxnent)
-{
-       int r;
-       unsigned f_nx = is_efer_nx() ? F(NX) : 0;
-#ifdef CONFIG_X86_64
-       unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
-                               ? F(GBPAGES) : 0;
-       unsigned f_lm = F(LM);
-#else
-       unsigned f_gbpages = 0;
-       unsigned f_lm = 0;
-#endif
-       unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
-       unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
-       unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
-
-       /* cpuid 1.edx */
-       const u32 kvm_cpuid_1_edx_x86_features =
-               F(FPU) | F(VME) | F(DE) | F(PSE) |
-               F(TSC) | F(MSR) | F(PAE) | F(MCE) |
-               F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
-               F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
-               F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
-               0 /* Reserved, DS, ACPI */ | F(MMX) |
-               F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
-               0 /* HTT, TM, Reserved, PBE */;
-       /* cpuid 0x80000001.edx */
-       const u32 kvm_cpuid_8000_0001_edx_x86_features =
-               F(FPU) | F(VME) | F(DE) | F(PSE) |
-               F(TSC) | F(MSR) | F(PAE) | F(MCE) |
-               F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
-               F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
-               F(PAT) | F(PSE36) | 0 /* Reserved */ |
-               f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
-               F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
-               0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
-       /* cpuid 1.ecx */
-       const u32 kvm_cpuid_1_ecx_x86_features =
-               /* NOTE: MONITOR (and MWAIT) are emulated as NOP,
-                * but *not* advertised to guests via CPUID ! */
-               F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
-               0 /* DS-CPL, VMX, SMX, EST */ |
-               0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
-               F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
-               F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
-               F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-               0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
-               F(F16C) | F(RDRAND);
-       /* cpuid 0x80000001.ecx */
-       const u32 kvm_cpuid_8000_0001_ecx_x86_features =
-               F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
-               F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-               F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
-               0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
-               F(TOPOEXT) | F(PERFCTR_CORE);
-
-       /* cpuid 0x80000008.ebx */
-       const u32 kvm_cpuid_8000_0008_ebx_x86_features =
-               F(CLZERO) | F(XSAVEERPTR) |
-               F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
-               F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON);
-
-       /* cpuid 0xC0000001.edx */
-       const u32 kvm_cpuid_C000_0001_edx_x86_features =
-               F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
-               F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
-               F(PMM) | F(PMM_EN);
-
-       /* cpuid 0xD.1.eax */
-       const u32 kvm_cpuid_D_1_eax_x86_features =
-               F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
+       struct kvm_cpuid_entry2 *entry;
+       int r, i, max_idx;
 
        /* all calls to cpuid_count() should be made on the same cpu */
        get_cpu();
 
        r = -E2BIG;
 
-       if (WARN_ON(*nent >= maxnent))
+       entry = do_host_cpuid(array, function, 0);
+       if (!entry)
                goto out;
 
-       do_host_cpuid(entry, function, 0);
-       ++*nent;
-
        switch (function) {
        case 0:
                /* Limited to the highest leaf implemented in KVM. */
                entry->eax = min(entry->eax, 0x1fU);
                break;
        case 1:
-               entry->edx &= kvm_cpuid_1_edx_x86_features;
-               cpuid_mask(&entry->edx, CPUID_1_EDX);
-               entry->ecx &= kvm_cpuid_1_ecx_x86_features;
-               cpuid_mask(&entry->ecx, CPUID_1_ECX);
-               /* we support x2apic emulation even if host does not support
-                * it since we emulate x2apic in software */
-               entry->ecx |= F(X2APIC);
+               cpuid_entry_override(entry, CPUID_1_EDX);
+               cpuid_entry_override(entry, CPUID_1_ECX);
                break;
-       /* function 2 entries are STATEFUL. That is, repeated cpuid commands
-        * may return different values. This forces us to get_cpu() before
-        * issuing the first command, and also to emulate this annoying behavior
-        * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
-       case 2: {
-               int t, times = entry->eax & 0xff;
-
-               entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
-               for (t = 1; t < times; ++t) {
-                       if (*nent >= maxnent)
-                               goto out;
-
-                       do_host_cpuid(&entry[t], function, 0);
-                       ++*nent;
-               }
+       case 2:
+               /*
+                * On ancient CPUs, function 2 entries are STATEFUL.  That is,
+                * CPUID(function=2, index=0) may return different results each
+                * time, with the least-significant byte in EAX enumerating the
+                * number of times software should do CPUID(2, 0).
+                *
+                * Modern CPUs, i.e. every CPU KVM has *ever* run on, are less
+                * idiotic.  Intel's SDM states that EAX & 0xff "will always
+                * return 01H. Software should ignore this value and not
+                * interpret it as an informational descriptor", while AMD's
+                * APM states that CPUID(2) is reserved.
+                *
+                * WARN if a frankenstein CPU that supports virtualization and
+                * a stateful CPUID.0x2 is encountered.
+                */
+               WARN_ON_ONCE((entry->eax & 0xff) > 1);
                break;
-       }
        /* functions 4 and 0x8000001d have additional index. */
        case 4:
-       case 0x8000001d: {
-               int i, cache_type;
-
-               /* read more entries until cache_type is zero */
-               for (i = 1; ; ++i) {
-                       if (*nent >= maxnent)
+       case 0x8000001d:
+               /*
+                * Read entries until the cache type in the previous entry is
+                * zero, i.e. indicates an invalid entry.
+                */
+               for (i = 1; entry->eax & 0x1f; ++i) {
+                       entry = do_host_cpuid(array, function, i);
+                       if (!entry)
                                goto out;
-
-                       cache_type = entry[i - 1].eax & 0x1f;
-                       if (!cache_type)
-                               break;
-                       do_host_cpuid(&entry[i], function, i);
-                       ++*nent;
                }
                break;
-       }
        case 6: /* Thermal management */
                entry->eax = 0x4; /* allow ARAT */
                entry->ebx = 0;
@@ -570,22 +562,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->edx = 0;
                break;
        /* function 7 has additional index. */
-       case 7: {
-               int i;
-
-               for (i = 0; ; ) {
-                       do_cpuid_7_mask(&entry[i], i);
-                       if (i == entry->eax)
-                               break;
-                       if (*nent >= maxnent)
+       case 7:
+               entry->eax = min(entry->eax, 1u);
+               cpuid_entry_override(entry, CPUID_7_0_EBX);
+               cpuid_entry_override(entry, CPUID_7_ECX);
+               cpuid_entry_override(entry, CPUID_7_EDX);
+
+               /* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */
+               if (entry->eax == 1) {
+                       entry = do_host_cpuid(array, function, 1);
+                       if (!entry)
                                goto out;
 
-                       ++i;
-                       do_host_cpuid(&entry[i], function, i);
-                       ++*nent;
+                       cpuid_entry_override(entry, CPUID_7_1_EAX);
+                       entry->ebx = 0;
+                       entry->ecx = 0;
+                       entry->edx = 0;
                }
                break;
-       }
        case 9:
                break;
        case 0xa: { /* Architectural Performance Monitoring */
@@ -622,79 +616,81 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
         * thus they can be handled by common code.
         */
        case 0x1f:
-       case 0xb: {
-               int i;
-
+       case 0xb:
                /*
-                * We filled in entry[0] for CPUID(EAX=<function>,
-                * ECX=00H) above.  If its level type (ECX[15:8]) is
-                * zero, then the leaf is unimplemented, and we're
-                * done.  Otherwise, continue to populate entries
-                * until the level type (ECX[15:8]) of the previously
-                * added entry is zero.
+                * Populate entries until the level type (ECX[15:8]) of the
+                * previous entry is zero.  Note, CPUID EAX.{0x1f,0xb}.0 is
+                * the starting entry, filled by the primary do_host_cpuid().
                 */
-               for (i = 1; entry[i - 1].ecx & 0xff00; ++i) {
-                       if (*nent >= maxnent)
+               for (i = 1; entry->ecx & 0xff00; ++i) {
+                       entry = do_host_cpuid(array, function, i);
+                       if (!entry)
                                goto out;
-
-                       do_host_cpuid(&entry[i], function, i);
-                       ++*nent;
                }
                break;
-       }
-       case 0xd: {
-               int idx, i;
-               u64 supported = kvm_supported_xcr0();
-
-               entry->eax &= supported;
-               entry->ebx = xstate_required_size(supported, false);
+       case 0xd:
+               entry->eax &= supported_xcr0;
+               entry->ebx = xstate_required_size(supported_xcr0, false);
                entry->ecx = entry->ebx;
-               entry->edx &= supported >> 32;
-               if (!supported)
+               entry->edx &= supported_xcr0 >> 32;
+               if (!supported_xcr0)
                        break;
 
-               for (idx = 1, i = 1; idx < 64; ++idx) {
-                       u64 mask = ((u64)1 << idx);
-                       if (*nent >= maxnent)
+               entry = do_host_cpuid(array, function, 1);
+               if (!entry)
+                       goto out;
+
+               cpuid_entry_override(entry, CPUID_D_1_EAX);
+               if (entry->eax & (F(XSAVES)|F(XSAVEC)))
+                       entry->ebx = xstate_required_size(supported_xcr0 | supported_xss,
+                                                         true);
+               else {
+                       WARN_ON_ONCE(supported_xss != 0);
+                       entry->ebx = 0;
+               }
+               entry->ecx &= supported_xss;
+               entry->edx &= supported_xss >> 32;
+
+               for (i = 2; i < 64; ++i) {
+                       bool s_state;
+                       if (supported_xcr0 & BIT_ULL(i))
+                               s_state = false;
+                       else if (supported_xss & BIT_ULL(i))
+                               s_state = true;
+                       else
+                               continue;
+
+                       entry = do_host_cpuid(array, function, i);
+                       if (!entry)
                                goto out;
 
-                       do_host_cpuid(&entry[i], function, idx);
-                       if (idx == 1) {
-                               entry[i].eax &= kvm_cpuid_D_1_eax_x86_features;
-                               cpuid_mask(&entry[i].eax, CPUID_D_1_EAX);
-                               entry[i].ebx = 0;
-                               if (entry[i].eax & (F(XSAVES)|F(XSAVEC)))
-                                       entry[i].ebx =
-                                               xstate_required_size(supported,
-                                                                    true);
-                       } else {
-                               if (entry[i].eax == 0 || !(supported & mask))
-                                       continue;
-                               if (WARN_ON_ONCE(entry[i].ecx & 1))
-                                       continue;
+                       /*
+                        * The supported check above should have filtered out
+                        * invalid sub-leafs.  Only valid sub-leafs should
+                        * reach this point, and they should have a non-zero
+                        * save state size.  Furthermore, check whether the
+                        * processor agrees with supported_xcr0/supported_xss
+                        * on whether this is an XCR0- or IA32_XSS-managed area.
+                        */
+                       if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
+                               --array->nent;
+                               continue;
                        }
-                       entry[i].ecx = 0;
-                       entry[i].edx = 0;
-                       ++*nent;
-                       ++i;
+                       entry->edx = 0;
                }
                break;
-       }
        /* Intel PT */
-       case 0x14: {
-               int t, times = entry->eax;
-
-               if (!f_intel_pt)
+       case 0x14:
+               if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
+                       entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
                        break;
+               }
 
-               for (t = 1; t <= times; ++t) {
-                       if (*nent >= maxnent)
+               for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
+                       if (!do_host_cpuid(array, function, i))
                                goto out;
-                       do_host_cpuid(&entry[t], function, t);
-                       ++*nent;
                }
                break;
-       }
        case KVM_CPUID_SIGNATURE: {
                static const char signature[12] = "KVMKVMKVM\0\0";
                const u32 *sigptr = (const u32 *)signature;
@@ -729,10 +725,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->eax = min(entry->eax, 0x8000001f);
                break;
        case 0x80000001:
-               entry->edx &= kvm_cpuid_8000_0001_edx_x86_features;
-               cpuid_mask(&entry->edx, CPUID_8000_0001_EDX);
-               entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features;
-               cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX);
+               cpuid_entry_override(entry, CPUID_8000_0001_EDX);
+               cpuid_entry_override(entry, CPUID_8000_0001_ECX);
                break;
        case 0x80000007: /* Advanced power management */
                /* invariant TSC is CPUID.80000007H:EDX[8] */
@@ -750,33 +744,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
                        g_phys_as = phys_as;
                entry->eax = g_phys_as | (virt_as << 8);
                entry->edx = 0;
-               entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
-               cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
-               /*
-                * AMD has separate bits for each SPEC_CTRL bit.
-                * arch/x86/kernel/cpu/bugs.c is kind enough to
-                * record that in cpufeatures so use them.
-                */
-               if (boot_cpu_has(X86_FEATURE_IBPB))
-                       entry->ebx |= F(AMD_IBPB);
-               if (boot_cpu_has(X86_FEATURE_IBRS))
-                       entry->ebx |= F(AMD_IBRS);
-               if (boot_cpu_has(X86_FEATURE_STIBP))
-                       entry->ebx |= F(AMD_STIBP);
-               if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
-                   boot_cpu_has(X86_FEATURE_AMD_SSBD))
-                       entry->ebx |= F(AMD_SSBD);
-               if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
-                       entry->ebx |= F(AMD_SSB_NO);
-               /*
-                * The preference is to use SPEC CTRL MSR instead of the
-                * VIRT_SPEC MSR.
-                */
-               if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
-                   !boot_cpu_has(X86_FEATURE_AMD_SSBD))
-                       entry->ebx |= F(VIRT_SSBD);
+               cpuid_entry_override(entry, CPUID_8000_0008_EBX);
                break;
        }
+       case 0x8000000A:
+               if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) {
+                       entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+                       break;
+               }
+               entry->eax = 1; /* SVM revision 1 */
+               entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
+                                  ASID emulation to nested SVM */
+               entry->ecx = 0; /* Reserved */
+               cpuid_entry_override(entry, CPUID_8000_000A_EDX);
+               break;
        case 0x80000019:
                entry->ecx = entry->edx = 0;
                break;
@@ -794,8 +775,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->eax = min(entry->eax, 0xC0000004);
                break;
        case 0xC0000001:
-               entry->edx &= kvm_cpuid_C000_0001_edx_x86_features;
-               cpuid_mask(&entry->edx, CPUID_C000_0001_EDX);
+               cpuid_entry_override(entry, CPUID_C000_0001_EDX);
                break;
        case 3: /* Processor serial number */
        case 5: /* MONITOR/MWAIT */
@@ -807,8 +787,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
                break;
        }
 
-       kvm_x86_ops->set_supported_cpuid(function, entry);
-
        r = 0;
 
 out:
@@ -817,26 +795,39 @@ out:
        return r;
 }
 
-static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func,
-                        int *nent, int maxnent, unsigned int type)
+static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func,
+                        unsigned int type)
 {
-       if (*nent >= maxnent)
-               return -E2BIG;
-
        if (type == KVM_GET_EMULATED_CPUID)
-               return __do_cpuid_func_emulated(entry, func, nent, maxnent);
+               return __do_cpuid_func_emulated(array, func);
 
-       return __do_cpuid_func(entry, func, nent, maxnent);
+       return __do_cpuid_func(array, func);
 }
 
-struct kvm_cpuid_param {
-       u32 func;
-       bool (*qualifier)(const struct kvm_cpuid_param *param);
-};
+#define CENTAUR_CPUID_SIGNATURE 0xC0000000
 
-static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
+static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func,
+                         unsigned int type)
 {
-       return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
+       u32 limit;
+       int r;
+
+       if (func == CENTAUR_CPUID_SIGNATURE &&
+           boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR)
+               return 0;
+
+       r = do_cpuid_func(array, func, type);
+       if (r)
+               return r;
+
+       limit = array->entries[array->nent - 1].eax;
+       for (func = func + 1; func <= limit; ++func) {
+               r = do_cpuid_func(array, func, type);
+               if (r)
+                       break;
+       }
+
+       return r;
 }
 
 static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
@@ -870,157 +861,145 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
                            struct kvm_cpuid_entry2 __user *entries,
                            unsigned int type)
 {
-       struct kvm_cpuid_entry2 *cpuid_entries;
-       int limit, nent = 0, r = -E2BIG, i;
-       u32 func;
-       static const struct kvm_cpuid_param param[] = {
-               { .func = 0 },
-               { .func = 0x80000000 },
-               { .func = 0xC0000000, .qualifier = is_centaur_cpu },
-               { .func = KVM_CPUID_SIGNATURE },
+       static const u32 funcs[] = {
+               0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE,
+       };
+
+       struct kvm_cpuid_array array = {
+               .nent = 0,
+               .maxnent = cpuid->nent,
        };
+       int r, i;
 
        if (cpuid->nent < 1)
-               goto out;
+               return -E2BIG;
        if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
                cpuid->nent = KVM_MAX_CPUID_ENTRIES;
 
        if (sanity_check_entries(entries, cpuid->nent, type))
                return -EINVAL;
 
-       r = -ENOMEM;
-       cpuid_entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2),
+       array.entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2),
                                           cpuid->nent));
-       if (!cpuid_entries)
-               goto out;
-
-       r = 0;
-       for (i = 0; i < ARRAY_SIZE(param); i++) {
-               const struct kvm_cpuid_param *ent = &param[i];
-
-               if (ent->qualifier && !ent->qualifier(ent))
-                       continue;
-
-               r = do_cpuid_func(&cpuid_entries[nent], ent->func,
-                                 &nent, cpuid->nent, type);
-
-               if (r)
-                       goto out_free;
-
-               limit = cpuid_entries[nent - 1].eax;
-               for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
-                       r = do_cpuid_func(&cpuid_entries[nent], func,
-                                         &nent, cpuid->nent, type);
+       if (!array.entries)
+               return -ENOMEM;
 
+       for (i = 0; i < ARRAY_SIZE(funcs); i++) {
+               r = get_cpuid_func(&array, funcs[i], type);
                if (r)
                        goto out_free;
        }
+       cpuid->nent = array.nent;
 
-       r = -EFAULT;
-       if (copy_to_user(entries, cpuid_entries,
-                        nent * sizeof(struct kvm_cpuid_entry2)))
-               goto out_free;
-       cpuid->nent = nent;
-       r = 0;
+       if (copy_to_user(entries, array.entries,
+                        array.nent * sizeof(struct kvm_cpuid_entry2)))
+               r = -EFAULT;
 
 out_free:
-       vfree(cpuid_entries);
-out:
+       vfree(array.entries);
        return r;
 }
 
-static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
-{
-       struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
-       struct kvm_cpuid_entry2 *ej;
-       int j = i;
-       int nent = vcpu->arch.cpuid_nent;
-
-       e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
-       /* when no next entry is found, the current entry[i] is reselected */
-       do {
-               j = (j + 1) % nent;
-               ej = &vcpu->arch.cpuid_entries[j];
-       } while (ej->function != e->function);
-
-       ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
-
-       return j;
-}
-
-/* find an entry with matching function, matching index (if needed), and that
- * should be read next (if it's stateful) */
-static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
-       u32 function, u32 index)
-{
-       if (e->function != function)
-               return 0;
-       if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
-               return 0;
-       if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
-           !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
-               return 0;
-       return 1;
-}
-
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
                                              u32 function, u32 index)
 {
+       struct kvm_cpuid_entry2 *e;
        int i;
-       struct kvm_cpuid_entry2 *best = NULL;
 
        for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
-               struct kvm_cpuid_entry2 *e;
-
                e = &vcpu->arch.cpuid_entries[i];
-               if (is_matching_cpuid_entry(e, function, index)) {
-                       if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
-                               move_to_next_stateful_cpuid_entry(vcpu, i);
-                       best = e;
-                       break;
-               }
+
+               if (e->function == function && (e->index == index ||
+                   !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
+                       return e;
        }
-       return best;
+       return NULL;
 }
 EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
 
 /*
- * If the basic or extended CPUID leaf requested is higher than the
- * maximum supported basic or extended leaf, respectively, then it is
- * out of range.
+ * Intel CPUID semantics treats any query for an out-of-range leaf as if the
+ * highest basic leaf (i.e. CPUID.0H:EAX) were requested.  AMD CPUID semantics
+ * returns all zeroes for any undefined leaf, whether or not the leaf is in
+ * range.  Centaur/VIA follows Intel semantics.
+ *
+ * A leaf is considered out-of-range if its function is higher than the maximum
+ * supported leaf of its associated class or if its associated class does not
+ * exist.
+ *
+ * There are four primary classes to be considered, with their respective
+ * ranges described as "<base> - <top>[,<base2> - <top2>]" inclusive.  A primary
+ * class exists if a guest CPUID entry for its <base> leaf exists.  For a given
+ * class, CPUID.<base>.EAX contains the max supported leaf for the class.
+ *
+ *  - Basic:      0x00000000 - 0x3fffffff, 0x50000000 - 0x7fffffff
+ *  - Hypervisor: 0x40000000 - 0x4fffffff
+ *  - Extended:   0x80000000 - 0xbfffffff
+ *  - Centaur:    0xc0000000 - 0xcfffffff
+ *
+ * The Hypervisor class is further subdivided into sub-classes that each act as
+ * their own independent class associated with a 0x100 byte range.  E.g. if Qemu
+ * is advertising support for both HyperV and KVM, the resulting Hypervisor
+ * CPUID sub-classes are:
+ *
+ *  - HyperV:     0x40000000 - 0x400000ff
+ *  - KVM:        0x40000100 - 0x400001ff
  */
-static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function)
+static struct kvm_cpuid_entry2 *
+get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
 {
-       struct kvm_cpuid_entry2 *max;
+       struct kvm_cpuid_entry2 *basic, *class;
+       u32 function = *fn_ptr;
+
+       basic = kvm_find_cpuid_entry(vcpu, 0, 0);
+       if (!basic)
+               return NULL;
+
+       if (is_guest_vendor_amd(basic->ebx, basic->ecx, basic->edx) ||
+           is_guest_vendor_hygon(basic->ebx, basic->ecx, basic->edx))
+               return NULL;
 
-       max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
-       return max && function <= max->eax;
+       if (function >= 0x40000000 && function <= 0x4fffffff)
+               class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0);
+       else if (function >= 0xc0000000)
+               class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0);
+       else
+               class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
+
+       if (class && function <= class->eax)
+               return NULL;
+
+       /*
+        * Leaf specific adjustments are also applied when redirecting to the
+        * max basic entry, e.g. if the max basic leaf is 0xb but there is no
+        * entry for CPUID.0xb.index (see below), then the output value for EDX
+        * needs to be pulled from CPUID.0xb.1.
+        */
+       *fn_ptr = basic->eax;
+
+       /*
+        * The class does not exist or the requested function is out of range;
+        * the effective CPUID entry is the max basic leaf.  Note, the index of
+        * the original requested leaf is observed!
+        */
+       return kvm_find_cpuid_entry(vcpu, basic->eax, index);
 }
 
 bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
-              u32 *ecx, u32 *edx, bool check_limit)
+              u32 *ecx, u32 *edx, bool exact_only)
 {
-       u32 function = *eax, index = *ecx;
+       u32 orig_function = *eax, function = *eax, index = *ecx;
        struct kvm_cpuid_entry2 *entry;
-       struct kvm_cpuid_entry2 *max;
-       bool found;
+       bool exact, used_max_basic = false;
 
        entry = kvm_find_cpuid_entry(vcpu, function, index);
-       found = entry;
-       /*
-        * Intel CPUID semantics treats any query for an out-of-range
-        * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were
-        * requested. AMD CPUID semantics returns all zeroes for any
-        * undefined leaf, whether or not the leaf is in range.
-        */
-       if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) &&
-           !cpuid_function_in_range(vcpu, function)) {
-               max = kvm_find_cpuid_entry(vcpu, 0, 0);
-               if (max) {
-                       function = max->eax;
-                       entry = kvm_find_cpuid_entry(vcpu, function, index);
-               }
+       exact = !!entry;
+
+       if (!entry && !exact_only) {
+               entry = get_out_of_range_cpuid_entry(vcpu, &function, index);
+               used_max_basic = !!entry;
        }
+
        if (entry) {
                *eax = entry->eax;
                *ebx = entry->ebx;
@@ -1049,8 +1028,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
                        }
                }
        }
-       trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found);
-       return found;
+       trace_kvm_cpuid(orig_function, index, *eax, *ebx, *ecx, *edx, exact,
+                       used_max_basic);
+       return exact;
 }
 EXPORT_SYMBOL_GPL(kvm_cpuid);
 
@@ -1063,7 +1043,7 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 
        eax = kvm_rax_read(vcpu);
        ecx = kvm_rcx_read(vcpu);
-       kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true);
+       kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
        kvm_rax_write(vcpu, eax);
        kvm_rbx_write(vcpu, ebx);
        kvm_rcx_write(vcpu, ecx);
index 7366c61..23b4cd1 100644 (file)
@@ -6,8 +6,10 @@
 #include <asm/cpu.h>
 #include <asm/processor.h>
 
+extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+void kvm_set_cpu_caps(void);
+
 int kvm_update_cpuid(struct kvm_vcpu *vcpu);
-bool kvm_mpx_supported(void);
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
                                              u32 function, u32 index);
 int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
@@ -23,7 +25,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
                              struct kvm_cpuid2 *cpuid,
                              struct kvm_cpuid_entry2 __user *entries);
 bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
-              u32 *ecx, u32 *edx, bool check_limit);
+              u32 *ecx, u32 *edx, bool exact_only);
 
 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
 
@@ -64,7 +66,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
  * and can't be used by KVM to query/control guest capabilities.  And obviously
  * the leaf being queried must have an entry in the lookup table.
  */
-static __always_inline void reverse_cpuid_check(unsigned x86_leaf)
+static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
 {
        BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
        BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
@@ -88,24 +90,18 @@ static __always_inline u32 __feature_bit(int x86_feature)
 
 #define feature_bit(name)  __feature_bit(X86_FEATURE_##name)
 
-static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
+static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
 {
-       unsigned x86_leaf = x86_feature / 32;
+       unsigned int x86_leaf = x86_feature / 32;
 
        reverse_cpuid_check(x86_leaf);
        return reverse_cpuid[x86_leaf];
 }
 
-static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned x86_feature)
+static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+                                                 const struct cpuid_reg *cpuid)
 {
-       struct kvm_cpuid_entry2 *entry;
-       const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
-
-       entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index);
-       if (!entry)
-               return NULL;
-
-       switch (cpuid.reg) {
+       switch (cpuid->reg) {
        case CPUID_EAX:
                return &entry->eax;
        case CPUID_EBX:
@@ -120,9 +116,86 @@ static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsi
        }
 }
 
-static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_feature)
+static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+                                               unsigned int x86_feature)
+{
+       const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+
+       return __cpuid_entry_get_reg(entry, &cpuid);
+}
+
+static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry,
+                                          unsigned int x86_feature)
+{
+       u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+       return *reg & __feature_bit(x86_feature);
+}
+
+static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry,
+                                           unsigned int x86_feature)
+{
+       return cpuid_entry_get(entry, x86_feature);
+}
+
+static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry,
+                                             unsigned int x86_feature)
+{
+       u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+       *reg &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry,
+                                           unsigned int x86_feature)
+{
+       u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+       *reg |= __feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry,
+                                              unsigned int x86_feature,
+                                              bool set)
+{
+       u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+       /*
+        * Open coded instead of using cpuid_entry_{clear,set}() to coerce the
+        * compiler into using CMOV instead of Jcc when possible.
+        */
+       if (set)
+               *reg |= __feature_bit(x86_feature);
+       else
+               *reg &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry,
+                                                enum cpuid_leafs leaf)
+{
+       u32 *reg = cpuid_entry_get_reg(entry, leaf * 32);
+
+       BUILD_BUG_ON(leaf >= ARRAY_SIZE(kvm_cpu_caps));
+       *reg = kvm_cpu_caps[leaf];
+}
+
+static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu,
+                                                    unsigned int x86_feature)
 {
-       int *reg;
+       const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+       struct kvm_cpuid_entry2 *entry;
+
+       entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index);
+       if (!entry)
+               return NULL;
+
+       return __cpuid_entry_get_reg(entry, &cpuid);
+}
+
+static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu,
+                                           unsigned int x86_feature)
+{
+       u32 *reg;
 
        reg = guest_cpuid_get_register(vcpu, x86_feature);
        if (!reg)
@@ -131,21 +204,24 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_
        return *reg & __feature_bit(x86_feature);
 }
 
-static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature)
+static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu,
+                                             unsigned int x86_feature)
 {
-       int *reg;
+       u32 *reg;
 
        reg = guest_cpuid_get_register(vcpu, x86_feature);
        if (reg)
                *reg &= ~__feature_bit(x86_feature);
 }
 
-static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
+static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
 
        best = kvm_find_cpuid_entry(vcpu, 0, 0);
-       return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx;
+       return best &&
+              (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) ||
+               is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
 }
 
 static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
@@ -192,4 +268,39 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
                  MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
 }
 
+static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
+{
+       unsigned int x86_leaf = x86_feature / 32;
+
+       reverse_cpuid_check(x86_leaf);
+       kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
+{
+       unsigned int x86_leaf = x86_feature / 32;
+
+       reverse_cpuid_check(x86_leaf);
+       kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
+}
+
+static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
+{
+       unsigned int x86_leaf = x86_feature / 32;
+
+       reverse_cpuid_check(x86_leaf);
+       return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
+}
+
+static __always_inline bool kvm_cpu_cap_has(unsigned int x86_feature)
+{
+       return !!kvm_cpu_cap_get(x86_feature);
+}
+
+static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature)
+{
+       if (boot_cpu_has(x86_feature))
+               kvm_cpu_cap_set(x86_feature);
+}
+
 #endif
index dd19fb3..fefa32d 100644 (file)
@@ -20,7 +20,7 @@
 
 #include <linux/kvm_host.h>
 #include "kvm_cache_regs.h"
-#include <asm/kvm_emulate.h>
+#include "kvm_emulate.h"
 #include <linux/stringify.h>
 #include <asm/fpu/api.h>
 #include <asm/debugreg.h>
@@ -665,6 +665,17 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
        ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
 }
 
+static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
+{
+       return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline bool emul_is_noncanonical_address(u64 la,
+                                               struct x86_emulate_ctxt *ctxt)
+{
+       return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
+}
+
 /*
  * x86 defines three classes of vector instructions: explicitly
  * aligned, explicitly unaligned, and the rest, which change behaviour
@@ -2711,10 +2722,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
        u32 eax, ebx, ecx, edx;
 
        eax = ecx = 0;
-       ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
-       return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
-               && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
-               && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+       ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
+       return is_guest_vendor_intel(ebx, ecx, edx);
 }
 
 static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
@@ -2731,36 +2740,18 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 
        eax = 0x00000000;
        ecx = 0x00000000;
-       ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+       ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
        /*
-        * Intel ("GenuineIntel")
-        * remark: Intel CPUs only support "syscall" in 64bit
-        * longmode. Also an 64bit guest with a
-        * 32bit compat-app running will #UD !! While this
-        * behaviour can be fixed (by emulating) into AMD
-        * response - CPUs of AMD can't behave like Intel.
+        * remark: Intel CPUs only support "syscall" in 64bit longmode, so even
+        * a 32bit compat-app running in a 64bit guest will #UD !! While this
+        * behaviour could be emulated to match the AMD response, AMD CPUs
+        * can't be made to behave like Intel.
         */
-       if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
-           ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
-           edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
+       if (is_guest_vendor_intel(ebx, ecx, edx))
                return false;
 
-       /* AMD ("AuthenticAMD") */
-       if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
-           ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
-           edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
-               return true;
-
-       /* AMD ("AMDisbetter!") */
-       if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
-           ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
-           edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
-               return true;
-
-       /* Hygon ("HygonGenuine") */
-       if (ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx &&
-           ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx &&
-           edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx)
+       if (is_guest_vendor_amd(ebx, ecx, edx) ||
+           is_guest_vendor_hygon(ebx, ecx, edx))
                return true;
 
        /*
@@ -3980,7 +3971,7 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt)
 
        eax = reg_read(ctxt, VCPU_REGS_RAX);
        ecx = reg_read(ctxt, VCPU_REGS_RCX);
-       ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
+       ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
        *reg_write(ctxt, VCPU_REGS_RAX) = eax;
        *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
        *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
@@ -4250,7 +4241,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
                        eax = 0x80000008;
                        ecx = 0;
                        if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx,
-                                                &edx, false))
+                                                &edx, true))
                                maxphyaddr = eax & 0xff;
                        else
                                maxphyaddr = 36;
index b24c606..febca33 100644 (file)
@@ -367,7 +367,7 @@ static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
 {
        struct kvm_kpit_state *ps = &pit->pit_state;
 
-       pr_debug("load_count val is %d, channel is %d\n", val, channel);
+       pr_debug("load_count val is %u, channel is %d\n", val, channel);
 
        /*
         * The largest possible initial count is 0; this is equivalent
similarity index 93%
rename from arch/x86/include/asm/kvm_emulate.h
rename to arch/x86/kvm/kvm_emulate.h
index 2a8f2bd..4688b26 100644 (file)
@@ -221,7 +221,7 @@ struct x86_emulate_ops {
                         enum x86_intercept_stage stage);
 
        bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
-                         u32 *ecx, u32 *edx, bool check_limit);
+                         u32 *ecx, u32 *edx, bool exact_only);
        bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt);
        bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
        bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
@@ -301,6 +301,7 @@ struct fastop;
 typedef void (*fastop_t)(struct fastop *);
 
 struct x86_emulate_ctxt {
+       void *vcpu;
        const struct x86_emulate_ops *ops;
 
        /* Register state before/after emulation. */
@@ -319,6 +320,10 @@ struct x86_emulate_ctxt {
        bool have_exception;
        struct x86_exception exception;
 
+       /* GPA available */
+       bool gpa_available;
+       gpa_t gpa_val;
+
        /*
         * decode cache
         */
@@ -329,9 +334,6 @@ struct x86_emulate_ctxt {
        u8 intercept;
        u8 op_bytes;
        u8 ad_bytes;
-       struct operand src;
-       struct operand src2;
-       struct operand dst;
        union {
                int (*execute)(struct x86_emulate_ctxt *ctxt);
                fastop_t fop;
@@ -359,6 +361,11 @@ struct x86_emulate_ctxt {
        u8 seg_override;
        u64 d;
        unsigned long _eip;
+
+       /* Here begins the usercopy section. */
+       struct operand src;
+       struct operand src2;
+       struct operand dst;
        struct operand memop;
        /* Fields above regs are cleared together. */
        unsigned long _regs[NR_VCPU_REGS];
@@ -389,6 +396,34 @@ struct x86_emulate_ctxt {
 #define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e
 #define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69
 
+#define X86EMUL_CPUID_VENDOR_CentaurHauls_ebx 0x746e6543
+#define X86EMUL_CPUID_VENDOR_CentaurHauls_ecx 0x736c7561
+#define X86EMUL_CPUID_VENDOR_CentaurHauls_edx 0x48727561
+
+static inline bool is_guest_vendor_intel(u32 ebx, u32 ecx, u32 edx)
+{
+       return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
+              ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
+              edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
+static inline bool is_guest_vendor_amd(u32 ebx, u32 ecx, u32 edx)
+{
+       return (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
+               ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
+               edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) ||
+              (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
+               ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
+               edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx);
+}
+
+static inline bool is_guest_vendor_hygon(u32 ebx, u32 ecx, u32 edx)
+{
+       return ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx &&
+              ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx &&
+              edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx;
+}
+
 enum x86_intercept_stage {
        X86_ICTP_NONE = 0,   /* Allow zero-init to not match anything */
        X86_ICPT_PRE_EXCEPT,
index e3099c6..b754e49 100644 (file)
@@ -164,14 +164,28 @@ static void kvm_apic_map_free(struct rcu_head *rcu)
        kvfree(map);
 }
 
-static void recalculate_apic_map(struct kvm *kvm)
+void kvm_recalculate_apic_map(struct kvm *kvm)
 {
        struct kvm_apic_map *new, *old = NULL;
        struct kvm_vcpu *vcpu;
        int i;
        u32 max_id = 255; /* enough space for any xAPIC ID */
 
+       if (!kvm->arch.apic_map_dirty) {
+               /*
+                * Read kvm->arch.apic_map_dirty before
+                * kvm->arch.apic_map
+                */
+               smp_rmb();
+               return;
+       }
+
        mutex_lock(&kvm->arch.apic_map_lock);
+       if (!kvm->arch.apic_map_dirty) {
+               /* Someone else has updated the map. */
+               mutex_unlock(&kvm->arch.apic_map_lock);
+               return;
+       }
 
        kvm_for_each_vcpu(i, vcpu, kvm)
                if (kvm_apic_present(vcpu))
@@ -236,6 +250,12 @@ out:
        old = rcu_dereference_protected(kvm->arch.apic_map,
                        lockdep_is_held(&kvm->arch.apic_map_lock));
        rcu_assign_pointer(kvm->arch.apic_map, new);
+       /*
+        * Write kvm->arch.apic_map before
+        * clearing kvm->arch.apic_map_dirty
+        */
+       smp_wmb();
+       kvm->arch.apic_map_dirty = false;
        mutex_unlock(&kvm->arch.apic_map_lock);
 
        if (old)
@@ -257,20 +277,20 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
                else
                        static_key_slow_inc(&apic_sw_disabled.key);
 
-               recalculate_apic_map(apic->vcpu->kvm);
+               apic->vcpu->kvm->arch.apic_map_dirty = true;
        }
 }
 
 static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
 {
        kvm_lapic_set_reg(apic, APIC_ID, id << 24);
-       recalculate_apic_map(apic->vcpu->kvm);
+       apic->vcpu->kvm->arch.apic_map_dirty = true;
 }
 
 static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
 {
        kvm_lapic_set_reg(apic, APIC_LDR, id);
-       recalculate_apic_map(apic->vcpu->kvm);
+       apic->vcpu->kvm->arch.apic_map_dirty = true;
 }
 
 static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
@@ -286,7 +306,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
 
        kvm_lapic_set_reg(apic, APIC_ID, id);
        kvm_lapic_set_reg(apic, APIC_LDR, ldr);
-       recalculate_apic_map(apic->vcpu->kvm);
+       apic->vcpu->kvm->arch.apic_map_dirty = true;
 }
 
 static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
@@ -294,11 +314,6 @@ static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
        return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
 }
 
-static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
-{
-       return kvm_lapic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
-}
-
 static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
 {
        return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
@@ -1226,7 +1241,7 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
 }
 EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
 
-static void apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
+void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
 {
        struct kvm_lapic_irq irq;
 
@@ -1911,7 +1926,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
        case APIC_DFR:
                if (!apic_x2apic_mode(apic)) {
                        kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
-                       recalculate_apic_map(apic->vcpu->kvm);
+                       apic->vcpu->kvm->arch.apic_map_dirty = true;
                } else
                        ret = 1;
                break;
@@ -1940,7 +1955,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
        case APIC_ICR:
                /* No delay here, so we always clear the pending bit */
                val &= ~(1 << 12);
-               apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
+               kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
                kvm_lapic_set_reg(apic, APIC_ICR, val);
                break;
 
@@ -2017,6 +2032,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
                break;
        }
 
+       kvm_recalculate_apic_map(apic->vcpu->kvm);
+
        return ret;
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_reg_write);
@@ -2165,7 +2182,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
                        static_key_slow_dec_deferred(&apic_hw_disabled);
                } else {
                        static_key_slow_inc(&apic_hw_disabled.key);
-                       recalculate_apic_map(vcpu->kvm);
+                       vcpu->kvm->arch.apic_map_dirty = true;
                }
        }
 
@@ -2206,6 +2223,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
        if (!apic)
                return;
 
+       vcpu->kvm->arch.apic_map_dirty = false;
        /* Stop the timer in case it's a reset to an active apic */
        hrtimer_cancel(&apic->lapic_timer.timer);
 
@@ -2257,6 +2275,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 
        vcpu->arch.apic_arb_prio = 0;
        vcpu->arch.apic_attention = 0;
+
+       kvm_recalculate_apic_map(vcpu->kvm);
 }
 
 /*
@@ -2478,17 +2498,18 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
        struct kvm_lapic *apic = vcpu->arch.apic;
        int r;
 
-
        kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
        /* set SPIV separately to get count of SW disabled APICs right */
        apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
 
        r = kvm_apic_state_fixup(vcpu, s, true);
-       if (r)
+       if (r) {
+               kvm_recalculate_apic_map(vcpu->kvm);
                return r;
+       }
        memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
 
-       recalculate_apic_map(vcpu->kvm);
+       kvm_recalculate_apic_map(vcpu->kvm);
        kvm_apic_set_version(vcpu);
 
        apic_update_ppr(apic);
index ec6fbfe..40ed6ed 100644 (file)
@@ -78,6 +78,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_recalculate_apic_map(struct kvm *kvm);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
 int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
@@ -95,6 +96,7 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu);
 
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
                struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
+void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
index a647601..e6bfe79 100644 (file)
@@ -95,11 +95,11 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
        return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu));
 }
 
-static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu)
+static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
 {
        if (VALID_PAGE(vcpu->arch.mmu->root_hpa))
-               vcpu->arch.mmu->set_cr3(vcpu, vcpu->arch.mmu->root_hpa |
-                                             kvm_get_active_pcid(vcpu));
+               kvm_x86_ops->load_mmu_pgd(vcpu, vcpu->arch.mmu->root_hpa |
+                                               kvm_get_active_pcid(vcpu));
 }
 
 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
index 87e9ba2..560e85e 100644 (file)
@@ -19,6 +19,7 @@
 #include "mmu.h"
 #include "x86.h"
 #include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
 #include "cpuid.h"
 
 #include <linux/kvm_host.h>
@@ -86,6 +87,8 @@ __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
  */
 bool tdp_enabled = false;
 
+static int max_page_level __read_mostly;
+
 enum {
        AUDIT_PRE_PAGE_FAULT,
        AUDIT_POST_PAGE_FAULT,
@@ -215,17 +218,6 @@ struct kvm_shadow_walk_iterator {
        unsigned index;
 };
 
-static const union kvm_mmu_page_role mmu_base_role_mask = {
-       .cr0_wp = 1,
-       .gpte_is_8_bytes = 1,
-       .nxe = 1,
-       .smep_andnot_wp = 1,
-       .smap_andnot_wp = 1,
-       .smm = 1,
-       .guest_mode = 1,
-       .ad_disabled = 1,
-};
-
 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
        for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
                                         (_root), (_addr));                \
@@ -3292,7 +3284,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
        if (!slot)
                return PT_PAGE_TABLE_LEVEL;
 
-       max_level = min(max_level, kvm_x86_ops->get_lpage_level());
+       max_level = min(max_level, max_page_level);
        for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) {
                linfo = lpage_info_slot(gfn, slot, max_level);
                if (!linfo->disallow_lpage)
@@ -3568,8 +3560,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                 * write-protected for dirty-logging or access tracking.
                 */
                if ((error_code & PFERR_WRITE_MASK) &&
-                   spte_can_locklessly_be_made_writable(spte))
-               {
+                   spte_can_locklessly_be_made_writable(spte)) {
                        new_spte |= PT_WRITABLE_MASK;
 
                        /*
@@ -3731,7 +3722,9 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
                vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
        } else
                BUG();
-       vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
+
+       /* root_cr3 is ignored for direct MMUs. */
+       vcpu->arch.mmu->root_cr3 = 0;
 
        return 0;
 }
@@ -3743,7 +3736,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
        gfn_t root_gfn, root_cr3;
        int i;
 
-       root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
+       root_cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
        root_gfn = root_cr3 >> PAGE_SHIFT;
 
        if (mmu_check_root(vcpu, root_gfn))
@@ -4080,7 +4073,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
        arch.gfn = gfn;
        arch.direct_map = vcpu->arch.mmu->direct_map;
-       arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
+       arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
 
        return kvm_setup_async_pf(vcpu, cr2_or_gpa,
                                  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
@@ -4252,6 +4245,14 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
        context->nx = false;
 }
 
+static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t cr3,
+                                 union kvm_mmu_page_role role)
+{
+       return (role.direct || cr3 == root->cr3) &&
+              VALID_PAGE(root->hpa) && page_header(root->hpa) &&
+              role.word == page_header(root->hpa)->role.word;
+}
+
 /*
  * Find out if a previously cached root matching the new CR3/role is available.
  * The current root is also inserted into the cache.
@@ -4270,12 +4271,13 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
        root.cr3 = mmu->root_cr3;
        root.hpa = mmu->root_hpa;
 
+       if (is_root_usable(&root, new_cr3, new_role))
+               return true;
+
        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
                swap(root, mmu->prev_roots[i]);
 
-               if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
-                   page_header(root.hpa) != NULL &&
-                   new_role.word == page_header(root.hpa)->role.word)
+               if (is_root_usable(&root, new_cr3, new_role))
                        break;
        }
 
@@ -4309,7 +4311,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
                         * accompanied by KVM_REQ_MMU_RELOAD, which will free
                         * the root set here and allocate a new one.
                         */
-                       kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
+                       kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
                        if (!skip_tlb_flush) {
                                kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
                                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
@@ -4508,7 +4510,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
                                cpuid_maxphyaddr(vcpu), context->root_level,
                                context->nx,
                                guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
-                               is_pse(vcpu), guest_cpuid_is_amd(vcpu));
+                               is_pse(vcpu),
+                               guest_cpuid_is_amd_or_hygon(vcpu));
 }
 
 static void
@@ -4874,7 +4877,6 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
        ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
        ext.cr4_pse = !!is_pse(vcpu);
        ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
-       ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
        ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
 
        ext.valid = 1;
@@ -4920,7 +4922,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        union kvm_mmu_role new_role =
                kvm_calc_tdp_mmu_root_page_role(vcpu, false);
 
-       new_role.base.word &= mmu_base_role_mask.word;
        if (new_role.as_u64 == context->mmu_role.as_u64)
                return;
 
@@ -4931,8 +4932,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        context->update_pte = nonpaging_update_pte;
        context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
        context->direct_map = true;
-       context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
-       context->get_cr3 = get_cr3;
+       context->get_guest_pgd = get_cr3;
        context->get_pdptr = kvm_pdptr_read;
        context->inject_page_fault = kvm_inject_page_fault;
 
@@ -4992,7 +4992,6 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
        union kvm_mmu_role new_role =
                kvm_calc_shadow_mmu_root_page_role(vcpu, false);
 
-       new_role.base.word &= mmu_base_role_mask.word;
        if (new_role.as_u64 == context->mmu_role.as_u64)
                return;
 
@@ -5012,14 +5011,14 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
 static union kvm_mmu_role
 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
-                                  bool execonly)
+                                  bool execonly, u8 level)
 {
        union kvm_mmu_role role = {0};
 
        /* SMM flag is inherited from root_mmu */
        role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
 
-       role.base.level = PT64_ROOT_4LEVEL;
+       role.base.level = level;
        role.base.gpte_is_8_bytes = true;
        role.base.direct = false;
        role.base.ad_disabled = !accessed_dirty;
@@ -5043,17 +5042,17 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
                             bool accessed_dirty, gpa_t new_eptp)
 {
        struct kvm_mmu *context = vcpu->arch.mmu;
+       u8 level = vmx_eptp_page_walk_level(new_eptp);
        union kvm_mmu_role new_role =
                kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
-                                                  execonly);
+                                                  execonly, level);
 
        __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
 
-       new_role.base.word &= mmu_base_role_mask.word;
        if (new_role.as_u64 == context->mmu_role.as_u64)
                return;
 
-       context->shadow_root_level = PT64_ROOT_4LEVEL;
+       context->shadow_root_level = level;
 
        context->nx = true;
        context->ept_ad = accessed_dirty;
@@ -5062,7 +5061,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
        context->sync_page = ept_sync_page;
        context->invlpg = ept_invlpg;
        context->update_pte = ept_update_pte;
-       context->root_level = PT64_ROOT_4LEVEL;
+       context->root_level = level;
        context->direct_map = false;
        context->mmu_role.as_u64 = new_role.as_u64;
 
@@ -5079,8 +5078,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
        struct kvm_mmu *context = vcpu->arch.mmu;
 
        kvm_init_shadow_mmu(vcpu);
-       context->set_cr3           = kvm_x86_ops->set_cr3;
-       context->get_cr3           = get_cr3;
+       context->get_guest_pgd     = get_cr3;
        context->get_pdptr         = kvm_pdptr_read;
        context->inject_page_fault = kvm_inject_page_fault;
 }
@@ -5090,12 +5088,11 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
        union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
        struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 
-       new_role.base.word &= mmu_base_role_mask.word;
        if (new_role.as_u64 == g_context->mmu_role.as_u64)
                return;
 
        g_context->mmu_role.as_u64 = new_role.as_u64;
-       g_context->get_cr3           = get_cr3;
+       g_context->get_guest_pgd     = get_cr3;
        g_context->get_pdptr         = kvm_pdptr_read;
        g_context->inject_page_fault = kvm_inject_page_fault;
 
@@ -5185,7 +5182,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
        kvm_mmu_sync_roots(vcpu);
        if (r)
                goto out;
-       kvm_mmu_load_cr3(vcpu);
+       kvm_mmu_load_pgd(vcpu);
        kvm_x86_ops->tlb_flush(vcpu, true);
 out:
        return r;
@@ -5329,6 +5326,22 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
        return spte;
 }
 
+/*
+ * Ignore various flags when determining if a SPTE can be immediately
+ * overwritten for the current MMU.
+ *  - level: explicitly checked in mmu_pte_write_new_pte(), and will never
+ *    match the current MMU role, as MMU's level tracks the root level.
+ *  - access: updated based on the new guest PTE
+ *  - quadrant: handled by get_written_sptes()
+ *  - invalid: always false (loop only walks valid shadow pages)
+ */
+static const union kvm_mmu_page_role role_ign = {
+       .level = 0xf,
+       .access = 0x7,
+       .quadrant = 0x3,
+       .invalid = 0x1,
+};
+
 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                              const u8 *new, int bytes,
                              struct kvm_page_track_notifier_node *node)
@@ -5384,8 +5397,8 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                        entry = *spte;
                        mmu_page_zap_pte(vcpu->kvm, sp, spte);
                        if (gentry &&
-                             !((sp->role.word ^ base_role)
-                             & mmu_base_role_mask.word) && rmap_can_add(vcpu))
+                           !((sp->role.word ^ base_role) & ~role_ign.word) &&
+                           rmap_can_add(vcpu))
                                mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
                        if (need_remote_flush(entry, *spte))
                                remote_flush = true;
@@ -5416,18 +5429,12 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
                       void *insn, int insn_len)
 {
-       int r, emulation_type = 0;
+       int r, emulation_type = EMULTYPE_PF;
        bool direct = vcpu->arch.mmu->direct_map;
 
        if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;
 
-       /* With shadow page tables, fault_address contains a GVA or nGPA.  */
-       if (vcpu->arch.mmu->direct_map) {
-               vcpu->arch.gpa_available = true;
-               vcpu->arch.gpa_val = cr2_or_gpa;
-       }
-
        r = RET_PF_INVALID;
        if (unlikely(error_code & PFERR_RSVD_MASK)) {
                r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
@@ -5471,7 +5478,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
         * for L1 isn't going to magically fix whatever issue cause L2 to fail.
         */
        if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
-               emulation_type = EMULTYPE_ALLOW_RETRY;
+               emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
 emulate:
        /*
         * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
@@ -5553,18 +5560,25 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
 
-void kvm_enable_tdp(void)
+void kvm_configure_mmu(bool enable_tdp, int tdp_page_level)
 {
-       tdp_enabled = true;
-}
-EXPORT_SYMBOL_GPL(kvm_enable_tdp);
+       tdp_enabled = enable_tdp;
 
-void kvm_disable_tdp(void)
-{
-       tdp_enabled = false;
+       /*
+        * max_page_level reflects the capabilities of KVM's MMU irrespective
+        * of kernel support, e.g. KVM may be capable of using 1GB pages when
+        * the kernel is not.  But, KVM never creates a page size greater than
+        * what is used by the kernel for any given HVA, i.e. the kernel's
+        * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
+        */
+       if (tdp_enabled)
+               max_page_level = tdp_page_level;
+       else if (boot_cpu_has(X86_FEATURE_GBPAGES))
+               max_page_level = PT_PDPE_LEVEL;
+       else
+               max_page_level = PT_DIRECTORY_LEVEL;
 }
-EXPORT_SYMBOL_GPL(kvm_disable_tdp);
-
+EXPORT_SYMBOL_GPL(kvm_configure_mmu);
 
 /* The return value indicates if tlb flush on all vcpus is needed. */
 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
@@ -5860,23 +5874,17 @@ static bool slot_rmap_write_protect(struct kvm *kvm,
 }
 
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
-                                     struct kvm_memory_slot *memslot)
+                                     struct kvm_memory_slot *memslot,
+                                     int start_level)
 {
        bool flush;
 
        spin_lock(&kvm->mmu_lock);
-       flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
-                                     false);
+       flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
+                               start_level, PT_MAX_HUGEPAGE_LEVEL, false);
        spin_unlock(&kvm->mmu_lock);
 
        /*
-        * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
-        * which do tlb flush out of mmu-lock should be serialized by
-        * kvm->slots_lock otherwise tlb flush would be missed.
-        */
-       lockdep_assert_held(&kvm->slots_lock);
-
-       /*
         * We can flush all the TLBs out of the mmu lock without TLB
         * corruption since we just change the spte from writable to
         * readonly so that we only need to care the case of changing
@@ -5888,8 +5896,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
         * on PT_WRITABLE_MASK anymore.
         */
        if (flush)
-               kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
-                       memslot->npages);
+               kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
 
 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
@@ -5941,6 +5948,21 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
        spin_unlock(&kvm->mmu_lock);
 }
 
+void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
+                                       struct kvm_memory_slot *memslot)
+{
+       /*
+        * All current use cases for flushing the TLBs for a specific memslot
+        * are related to dirty logging, and do the TLB flush out of mmu_lock.
+        * The interaction between the various operations on memslot must be
+        * serialized by slots_lock to ensure the TLB flush from one operation
+        * is observed by any other operation on the same memslot.
+        */
+       lockdep_assert_held(&kvm->slots_lock);
+       kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
+                                          memslot->npages);
+}
+
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot)
 {
@@ -5950,8 +5972,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
        flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
        spin_unlock(&kvm->mmu_lock);
 
-       lockdep_assert_held(&kvm->slots_lock);
-
        /*
         * It's also safe to flush TLBs out of mmu lock here as currently this
         * function is only used for dirty logging, in which case flushing TLB
@@ -5959,8 +5979,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
         * dirty_bitmap.
         */
        if (flush)
-               kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
-                               memslot->npages);
+               kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
 
@@ -5974,12 +5993,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
                                        false);
        spin_unlock(&kvm->mmu_lock);
 
-       /* see kvm_mmu_slot_remove_write_access */
-       lockdep_assert_held(&kvm->slots_lock);
-
        if (flush)
-               kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
-                               memslot->npages);
+               kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
 
@@ -5992,12 +6007,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
        flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
        spin_unlock(&kvm->mmu_lock);
 
-       lockdep_assert_held(&kvm->slots_lock);
-
-       /* see kvm_mmu_slot_leaf_clear_dirty */
        if (flush)
-               kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
-                               memslot->npages);
+               kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
index 3521e2d..ddc1ec3 100644 (file)
 #include <linux/kvm_host.h>
 #include <linux/rculist.h>
 
-#include <asm/kvm_host.h>
 #include <asm/kvm_page_track.h>
 
 #include "mmu.h"
 
-void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
-                                struct kvm_memory_slot *dont)
+void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
 {
        int i;
 
-       for (i = 0; i < KVM_PAGE_TRACK_MAX; i++)
-               if (!dont || free->arch.gfn_track[i] !=
-                     dont->arch.gfn_track[i]) {
-                       kvfree(free->arch.gfn_track[i]);
-                       free->arch.gfn_track[i] = NULL;
-               }
+       for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+               kvfree(slot->arch.gfn_track[i]);
+               slot->arch.gfn_track[i] = NULL;
+       }
 }
 
 int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
@@ -48,7 +44,7 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
        return 0;
 
 track_free:
-       kvm_page_track_free_memslot(slot, NULL);
+       kvm_page_track_free_memslot(slot);
        return -ENOMEM;
 }
 
index e4c8a4c..1ddbfff 100644 (file)
@@ -66,7 +66,7 @@
        #define PT_GUEST_ACCESSED_SHIFT 8
        #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
        #define CMPXCHG cmpxchg64
-       #define PT_MAX_FULL_LEVELS 4
+       #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
 #else
        #error Invalid PTTYPE value
 #endif
@@ -333,7 +333,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
        trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
        walker->level = mmu->root_level;
-       pte           = mmu->get_cr3(vcpu);
+       pte           = mmu->get_guest_pgd(vcpu);
        have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);
 
 #if PTTYPE == 64
index bcc6a73..d1f8ca5 100644 (file)
@@ -111,7 +111,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
                .config = config,
        };
 
-       attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
+       attr.sample_period = get_sample_period(pmc, pmc->counter);
 
        if (in_tx)
                attr.config |= HSW_IN_TX;
@@ -158,7 +158,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
 
        /* recalibrate sample period and check if it's accepted by perf core */
        if (perf_event_period(pmc->perf_event,
-                       (-pmc->counter) & pmc_bitmask(pmc)))
+                             get_sample_period(pmc, pmc->counter)))
                return false;
 
        /* reuse perf_event to serve as pmc_reprogram_counter() does*/
index 1333298..d7da2b9 100644 (file)
@@ -129,6 +129,15 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
        return NULL;
 }
 
+static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
+{
+       u64 sample_period = (-counter_value) & pmc_bitmask(pmc);
+
+       if (!sample_period)
+               sample_period = pmc_bitmask(pmc) + 1;
+       return sample_period;
+}
+
 void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
 void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
 void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
index 24c0b2b..05cb45b 100644 (file)
@@ -521,10 +521,31 @@ static void recalc_intercepts(struct vcpu_svm *svm)
        h = &svm->nested.hsave->control;
        g = &svm->nested;
 
-       c->intercept_cr = h->intercept_cr | g->intercept_cr;
-       c->intercept_dr = h->intercept_dr | g->intercept_dr;
-       c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
-       c->intercept = h->intercept | g->intercept;
+       c->intercept_cr = h->intercept_cr;
+       c->intercept_dr = h->intercept_dr;
+       c->intercept_exceptions = h->intercept_exceptions;
+       c->intercept = h->intercept;
+
+       if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
+               /* We only want the cr8 intercept bits of L1 */
+               c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
+               c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
+
+               /*
+                * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
+                * affect any interrupt we may want to inject; therefore,
+                * interrupt window vmexits are irrelevant to L0.
+                */
+               c->intercept &= ~(1ULL << INTERCEPT_VINTR);
+       }
+
+       /* We don't want to see VMMCALLs from a nested guest */
+       c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);
+
+       c->intercept_cr |= g->intercept_cr;
+       c->intercept_dr |= g->intercept_dr;
+       c->intercept_exceptions |= g->intercept_exceptions;
+       c->intercept |= g->intercept;
 }
 
 static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
@@ -629,6 +650,11 @@ static inline void clr_intercept(struct vcpu_svm *svm, int bit)
        recalc_intercepts(svm);
 }
 
+static inline bool is_intercept(struct vcpu_svm *svm, int bit)
+{
+       return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
+}
+
 static inline bool vgif_enabled(struct vcpu_svm *svm)
 {
        return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
@@ -1208,6 +1234,7 @@ static int avic_ga_log_notifier(u32 ga_tag)
        u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
 
        pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+       trace_kvm_avic_ga_log(vm_id, vcpu_id);
 
        spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
        hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
@@ -1369,6 +1396,29 @@ static void svm_hardware_teardown(void)
        iopm_base = 0;
 }
 
+static __init void svm_set_cpu_caps(void)
+{
+       kvm_set_cpu_caps();
+
+       supported_xss = 0;
+
+       /* CPUID 0x80000001 and 0x8000000A (SVM features) */
+       if (nested) {
+               kvm_cpu_cap_set(X86_FEATURE_SVM);
+
+               if (nrips)
+                       kvm_cpu_cap_set(X86_FEATURE_NRIPS);
+
+               if (npt_enabled)
+                       kvm_cpu_cap_set(X86_FEATURE_NPT);
+       }
+
+       /* CPUID 0x80000008 */
+       if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
+           boot_cpu_has(X86_FEATURE_AMD_SSBD))
+               kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+}
+
 static __init int svm_hardware_setup(void)
 {
        int cpu;
@@ -1387,6 +1437,8 @@ static __init int svm_hardware_setup(void)
 
        init_msrpm_offsets();
 
+       supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+
        if (boot_cpu_has(X86_FEATURE_NX))
                kvm_enable_efer_bits(EFER_NX);
 
@@ -1434,16 +1486,11 @@ static __init int svm_hardware_setup(void)
        if (!boot_cpu_has(X86_FEATURE_NPT))
                npt_enabled = false;
 
-       if (npt_enabled && !npt) {
-               printk(KERN_INFO "kvm: Nested Paging disabled\n");
+       if (npt_enabled && !npt)
                npt_enabled = false;
-       }
 
-       if (npt_enabled) {
-               printk(KERN_INFO "kvm: Nested Paging enabled\n");
-               kvm_enable_tdp();
-       } else
-               kvm_disable_tdp();
+       kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL);
+       pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
 
        if (nrips) {
                if (!boot_cpu_has(X86_FEATURE_NRIPS))
@@ -1479,6 +1526,8 @@ static __init int svm_hardware_setup(void)
                        pr_info("Virtual GIF supported\n");
        }
 
+       svm_set_cpu_caps();
+
        return 0;
 
 err:
@@ -1946,19 +1995,6 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
        kfree(region);
 }
 
-static struct kvm *svm_vm_alloc(void)
-{
-       struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm),
-                                           GFP_KERNEL_ACCOUNT | __GFP_ZERO,
-                                           PAGE_KERNEL);
-       return &kvm_svm->kvm;
-}
-
-static void svm_vm_free(struct kvm *kvm)
-{
-       vfree(to_kvm_svm(kvm));
-}
-
 static void sev_vm_destroy(struct kvm *kvm)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -2186,7 +2222,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        }
        init_vmcb(svm);
 
-       kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
+       kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
        kvm_rdx_write(vcpu, eax);
 
        if (kvm_vcpu_apicv_active(vcpu) && !init_event)
@@ -2420,14 +2456,38 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
        }
 }
 
+static inline void svm_enable_vintr(struct vcpu_svm *svm)
+{
+       struct vmcb_control_area *control;
+
+       /* The following fields are ignored when AVIC is enabled */
+       WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
+
+       /*
+        * This is just a dummy VINTR to actually cause a vmexit to happen.
+        * Actual injection of virtual interrupts happens through EVENTINJ.
+        */
+       control = &svm->vmcb->control;
+       control->int_vector = 0x0;
+       control->int_ctl &= ~V_INTR_PRIO_MASK;
+       control->int_ctl |= V_IRQ_MASK |
+               ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+       mark_dirty(svm->vmcb, VMCB_INTR);
+}
+
 static void svm_set_vintr(struct vcpu_svm *svm)
 {
        set_intercept(svm, INTERCEPT_VINTR);
+       if (is_intercept(svm, INTERCEPT_VINTR))
+               svm_enable_vintr(svm);
 }
 
 static void svm_clear_vintr(struct vcpu_svm *svm)
 {
        clr_intercept(svm, INTERCEPT_VINTR);
+
+       svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+       mark_dirty(svm->vmcb, VMCB_INTR);
 }
 
 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -2983,15 +3043,6 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
        return pdpte;
 }
 
-static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
-                                  unsigned long root)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       svm->vmcb->control.nested_cr3 = __sme_set(root);
-       mark_dirty(svm->vmcb, VMCB_NPT);
-}
-
 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
                                       struct x86_exception *fault)
 {
@@ -3027,8 +3078,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 
        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
        kvm_init_shadow_mmu(vcpu);
-       vcpu->arch.mmu->set_cr3           = nested_svm_set_tdp_cr3;
-       vcpu->arch.mmu->get_cr3           = nested_svm_get_tdp_cr3;
+       vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
        vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
        vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
        vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu);
@@ -3089,43 +3139,36 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
        return vmexit;
 }
 
-/* This function returns true if it is save to enable the irq window */
-static inline bool nested_svm_intr(struct vcpu_svm *svm)
+static void nested_svm_intr(struct vcpu_svm *svm)
 {
-       if (!is_guest_mode(&svm->vcpu))
-               return true;
-
-       if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
-               return true;
-
-       if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
-               return false;
-
-       /*
-        * if vmexit was already requested (by intercepted exception
-        * for instance) do not overwrite it with "external interrupt"
-        * vmexit.
-        */
-       if (svm->nested.exit_required)
-               return false;
-
        svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
        svm->vmcb->control.exit_info_1 = 0;
        svm->vmcb->control.exit_info_2 = 0;
 
-       if (svm->nested.intercept & 1ULL) {
-               /*
-                * The #vmexit can't be emulated here directly because this
-                * code path runs with irqs and preemption disabled. A
-                * #vmexit emulation might sleep. Only signal request for
-                * the #vmexit here.
-                */
-               svm->nested.exit_required = true;
-               trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-               return false;
+       /* nested_svm_vmexit gets called afterwards from handle_exit */
+       svm->nested.exit_required = true;
+       trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+}
+
+static bool nested_exit_on_intr(struct vcpu_svm *svm)
+{
+       return (svm->nested.intercept & 1ULL);
+}
+
+static int svm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       bool block_nested_events =
+               kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required;
+
+       if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) {
+               if (block_nested_events)
+                       return -EBUSY;
+               nested_svm_intr(svm);
+               return 0;
        }
 
-       return true;
+       return 0;
 }
 
 /* This function returns true if it is save to enable the nmi window */
@@ -3244,9 +3287,6 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
        return NESTED_EXIT_CONTINUE;
 }
 
-/*
- * If this function returns true, this #vmexit was already handled
- */
 static int nested_svm_intercept(struct vcpu_svm *svm)
 {
        u32 exit_code = svm->vmcb->control.exit_code;
@@ -3521,6 +3561,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 
 static bool nested_vmcb_checks(struct vmcb *vmcb)
 {
+       if ((vmcb->save.efer & EFER_SVME) == 0)
+               return false;
+
        if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
                return false;
 
@@ -3537,6 +3580,10 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
                                 struct vmcb *nested_vmcb, struct kvm_host_map *map)
 {
+       bool evaluate_pending_interrupts =
+               is_intercept(svm, INTERCEPT_VINTR) ||
+               is_intercept(svm, INTERCEPT_IRET);
+
        if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
                svm->vcpu.arch.hflags |= HF_HIF_MASK;
        else
@@ -3596,15 +3643,6 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
        else
                svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
 
-       if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
-               /* We only want the cr8 intercept bits of the guest */
-               clr_cr_intercept(svm, INTERCEPT_CR8_READ);
-               clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
-       }
-
-       /* We don't want to see VMMCALLs from a nested guest */
-       clr_intercept(svm, INTERCEPT_VMMCALL);
-
        svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
        svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
 
@@ -3632,7 +3670,21 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 
        svm->nested.vmcb = vmcb_gpa;
 
+       /*
+        * If L1 had a pending IRQ/NMI before executing VMRUN,
+        * which wasn't delivered because it was disallowed (e.g.
+        * interrupts disabled), L0 needs to evaluate if this pending
+        * event should cause an exit from L2 to L1 or be delivered
+        * directly to L2.
+        *
+        * Usually this would be handled by the processor noticing an
+        * IRQ/NMI window request.  However, VMRUN can unblock interrupts
+        * by implicitly setting GIF, so force L0 to perform pending event
+        * evaluation by requesting a KVM_REQ_EVENT.
+        */
        enable_gif(svm);
+       if (unlikely(evaluate_pending_interrupts))
+               kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 
        mark_all_dirty(svm->vmcb);
 }
@@ -3834,11 +3886,8 @@ static int clgi_interception(struct vcpu_svm *svm)
        disable_gif(svm);
 
        /* After a CLGI no interrupts should come */
-       if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
+       if (!kvm_vcpu_apicv_active(&svm->vcpu))
                svm_clear_vintr(svm);
-               svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
-               mark_dirty(svm->vmcb, VMCB_INTR);
-       }
 
        return ret;
 }
@@ -5124,19 +5173,6 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
        ++vcpu->stat.nmi_injections;
 }
 
-static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
-{
-       struct vmcb_control_area *control;
-
-       /* The following fields are ignored when AVIC is enabled */
-       control = &svm->vmcb->control;
-       control->int_vector = irq;
-       control->int_ctl &= ~V_INTR_PRIO_MASK;
-       control->int_ctl |= V_IRQ_MASK |
-               ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
-       mark_dirty(svm->vmcb, VMCB_INTR);
-}
-
 static void svm_set_irq(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -5525,18 +5561,15 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb = svm->vmcb;
-       int ret;
 
        if (!gif_set(svm) ||
             (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
                return 0;
 
-       ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
-
-       if (is_guest_mode(vcpu))
-               return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
-
-       return ret;
+       if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK))
+               return !!(svm->vcpu.arch.hflags & HF_HIF_MASK);
+       else
+               return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
 }
 
 static void enable_irq_window(struct kvm_vcpu *vcpu)
@@ -5551,7 +5584,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
         * enabled, the STGI interception will not occur. Enable the irq
         * window under the assumption that the hardware will set the GIF.
         */
-       if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
+       if (vgif_enabled(svm) || gif_set(svm)) {
                /*
                 * IRQ window is not needed when AVIC is enabled,
                 * unless we have pending ExtINT since it cannot be injected
@@ -5560,7 +5593,6 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
                 */
                svm_toggle_avic_for_irq_window(vcpu, false);
                svm_set_vintr(svm);
-               svm_inject_irq(svm, 0x0);
        }
 }
 
@@ -5946,24 +5978,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 }
 STACK_FRAME_NON_STANDARD(svm_vcpu_run);
 
-static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       bool update_guest_cr3 = true;
+       unsigned long cr3;
 
-       svm->vmcb->save.cr3 = __sme_set(root);
-       mark_dirty(svm->vmcb, VMCB_CR);
-}
-
-static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
+       cr3 = __sme_set(root);
+       if (npt_enabled) {
+               svm->vmcb->control.nested_cr3 = cr3;
+               mark_dirty(svm->vmcb, VMCB_NPT);
 
-       svm->vmcb->control.nested_cr3 = __sme_set(root);
-       mark_dirty(svm->vmcb, VMCB_NPT);
+               /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
+               if (is_guest_mode(vcpu))
+                       update_guest_cr3 = false;
+               else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+                       cr3 = vcpu->arch.cr3;
+               else /* CR3 is already up-to-date.  */
+                       update_guest_cr3 = false;
+       }
 
-       /* Also sync guest cr3 here in case we live migrate */
-       svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
-       mark_dirty(svm->vmcb, VMCB_CR);
+       if (update_guest_cr3) {
+               svm->vmcb->save.cr3 = cr3;
+               mark_dirty(svm->vmcb, VMCB_CR);
+       }
 }
 
 static int is_disabled(void)
@@ -6025,12 +6063,19 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
                                    boot_cpu_has(X86_FEATURE_XSAVES);
 
        /* Update nrips enabled cache */
-       svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
+       svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
+                            guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
 
        if (!kvm_vcpu_apicv_active(vcpu))
                return;
 
-       guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
+       /*
+        * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
+        * is exposed to the guest, disable AVIC.
+        */
+       if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
+               kvm_request_apicv_update(vcpu->kvm, false,
+                                        APICV_INHIBIT_REASON_X2APIC);
 
        /*
         * Currently, AVIC does not work with nested virtualization.
@@ -6041,88 +6086,11 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
                                         APICV_INHIBIT_REASON_NESTED);
 }
 
-#define F feature_bit
-
-static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
-{
-       switch (func) {
-       case 0x1:
-               if (avic)
-                       entry->ecx &= ~F(X2APIC);
-               break;
-       case 0x80000001:
-               if (nested)
-                       entry->ecx |= (1 << 2); /* Set SVM bit */
-               break;
-       case 0x80000008:
-               if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
-                    boot_cpu_has(X86_FEATURE_AMD_SSBD))
-                       entry->ebx |= F(VIRT_SSBD);
-               break;
-       case 0x8000000A:
-               entry->eax = 1; /* SVM revision 1 */
-               entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
-                                  ASID emulation to nested SVM */
-               entry->ecx = 0; /* Reserved */
-               entry->edx = 0; /* Per default do not support any
-                                  additional features */
-
-               /* Support next_rip if host supports it */
-               if (boot_cpu_has(X86_FEATURE_NRIPS))
-                       entry->edx |= F(NRIPS);
-
-               /* Support NPT for the guest if enabled */
-               if (npt_enabled)
-                       entry->edx |= F(NPT);
-
-       }
-}
-
-static int svm_get_lpage_level(void)
-{
-       return PT_PDPE_LEVEL;
-}
-
-static bool svm_rdtscp_supported(void)
-{
-       return boot_cpu_has(X86_FEATURE_RDTSCP);
-}
-
-static bool svm_invpcid_supported(void)
-{
-       return false;
-}
-
-static bool svm_mpx_supported(void)
-{
-       return false;
-}
-
-static bool svm_xsaves_supported(void)
-{
-       return boot_cpu_has(X86_FEATURE_XSAVES);
-}
-
-static bool svm_umip_emulated(void)
-{
-       return false;
-}
-
-static bool svm_pt_supported(void)
-{
-       return false;
-}
-
 static bool svm_has_wbinvd_exit(void)
 {
        return true;
 }
 
-static bool svm_pku_supported(void)
-{
-       return false;
-}
-
 #define PRE_EX(exit)  { .exit_code = (exit), \
                        .stage = X86_ICPT_PRE_EXCEPT, }
 #define POST_EX(exit) { .exit_code = (exit), \
@@ -6189,7 +6157,8 @@ static const struct __x86_intercept {
 
 static int svm_check_intercept(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
-                              enum x86_intercept_stage stage)
+                              enum x86_intercept_stage stage,
+                              struct x86_exception *exception)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        int vmexit, ret = X86EMUL_CONTINUE;
@@ -7373,7 +7342,8 @@ static bool svm_check_apicv_inhibit_reasons(ulong bit)
                          BIT(APICV_INHIBIT_REASON_HYPERV) |
                          BIT(APICV_INHIBIT_REASON_NESTED) |
                          BIT(APICV_INHIBIT_REASON_IRQWIN) |
-                         BIT(APICV_INHIBIT_REASON_PIT_REINJ);
+                         BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
+                         BIT(APICV_INHIBIT_REASON_X2APIC);
 
        return supported & BIT(bit);
 }
@@ -7398,8 +7368,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .vcpu_free = svm_free_vcpu,
        .vcpu_reset = svm_vcpu_reset,
 
-       .vm_alloc = svm_vm_alloc,
-       .vm_free = svm_vm_free,
+       .vm_size = sizeof(struct kvm_svm),
        .vm_init = svm_vm_init,
        .vm_destroy = svm_vm_destroy,
 
@@ -7421,7 +7390,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
        .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
        .set_cr0 = svm_set_cr0,
-       .set_cr3 = svm_set_cr3,
        .set_cr4 = svm_set_cr4,
        .set_efer = svm_set_efer,
        .get_idt = svm_get_idt,
@@ -7474,26 +7442,14 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
        .get_exit_info = svm_get_exit_info,
 
-       .get_lpage_level = svm_get_lpage_level,
-
        .cpuid_update = svm_cpuid_update,
 
-       .rdtscp_supported = svm_rdtscp_supported,
-       .invpcid_supported = svm_invpcid_supported,
-       .mpx_supported = svm_mpx_supported,
-       .xsaves_supported = svm_xsaves_supported,
-       .umip_emulated = svm_umip_emulated,
-       .pt_supported = svm_pt_supported,
-       .pku_supported = svm_pku_supported,
-
-       .set_supported_cpuid = svm_set_supported_cpuid,
-
        .has_wbinvd_exit = svm_has_wbinvd_exit,
 
        .read_l1_tsc_offset = svm_read_l1_tsc_offset,
        .write_l1_tsc_offset = svm_write_l1_tsc_offset,
 
-       .set_tdp_cr3 = set_tdp_cr3,
+       .load_mmu_pgd = svm_load_mmu_pgd,
 
        .check_intercept = svm_check_intercept,
        .handle_exit_irqoff = svm_handle_exit_irqoff,
@@ -7523,6 +7479,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
 
        .apic_init_signal_blocked = svm_apic_init_signal_blocked,
+
+       .check_nested_events = svm_check_nested_events,
 };
 
 static int __init svm_init(void)
index f194dd0..c3d1e9f 100644 (file)
@@ -151,32 +151,38 @@ TRACE_EVENT(kvm_fast_mmio,
  * Tracepoint for cpuid.
  */
 TRACE_EVENT(kvm_cpuid,
-       TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
-                unsigned long rcx, unsigned long rdx, bool found),
-       TP_ARGS(function, rax, rbx, rcx, rdx, found),
+       TP_PROTO(unsigned int function, unsigned int index, unsigned long rax,
+                unsigned long rbx, unsigned long rcx, unsigned long rdx,
+                bool found, bool used_max_basic),
+       TP_ARGS(function, index, rax, rbx, rcx, rdx, found, used_max_basic),
 
        TP_STRUCT__entry(
                __field(        unsigned int,   function        )
+               __field(        unsigned int,   index           )
                __field(        unsigned long,  rax             )
                __field(        unsigned long,  rbx             )
                __field(        unsigned long,  rcx             )
                __field(        unsigned long,  rdx             )
                __field(        bool,           found           )
+               __field(        bool,           used_max_basic  )
        ),
 
        TP_fast_assign(
                __entry->function       = function;
+               __entry->index          = index;
                __entry->rax            = rax;
                __entry->rbx            = rbx;
                __entry->rcx            = rcx;
                __entry->rdx            = rdx;
                __entry->found          = found;
+               __entry->used_max_basic = used_max_basic;
        ),
 
-       TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s",
-                 __entry->function, __entry->rax,
+       TP_printk("func %x idx %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s%s",
+                 __entry->function, __entry->index, __entry->rax,
                  __entry->rbx, __entry->rcx, __entry->rdx,
-                 __entry->found ? "found" : "not found")
+                 __entry->found ? "found" : "not found",
+                 __entry->used_max_basic ? ", used max basic" : "")
 );
 
 #define AREG(x) { APIC_##x, "APIC_" #x }
@@ -745,13 +751,13 @@ TRACE_EVENT(kvm_emulate_insn,
 
        TP_fast_assign(
                __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
-               __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr
-                              - vcpu->arch.emulate_ctxt.fetch.data;
-               __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len;
+               __entry->len = vcpu->arch.emulate_ctxt->fetch.ptr
+                              - vcpu->arch.emulate_ctxt->fetch.data;
+               __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len;
                memcpy(__entry->insn,
-                      vcpu->arch.emulate_ctxt.fetch.data,
+                      vcpu->arch.emulate_ctxt->fetch.data,
                       15);
-               __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
+               __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt->mode);
                __entry->failed = failed;
                ),
 
@@ -1367,6 +1373,24 @@ TRACE_EVENT(kvm_avic_unaccelerated_access,
                  __entry->vec)
 );
 
+TRACE_EVENT(kvm_avic_ga_log,
+           TP_PROTO(u32 vmid, u32 vcpuid),
+           TP_ARGS(vmid, vcpuid),
+
+       TP_STRUCT__entry(
+               __field(u32, vmid)
+               __field(u32, vcpuid)
+       ),
+
+       TP_fast_assign(
+               __entry->vmid = vmid;
+               __entry->vcpuid = vcpuid;
+       ),
+
+       TP_printk("vmid=%u, vcpuid=%u",
+                 __entry->vmid, __entry->vcpuid)
+);
+
 TRACE_EVENT(kvm_hv_timer_state,
                TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
                TP_ARGS(vcpu_id, hv_timer_in_use),
index f486e26..8903475 100644 (file)
@@ -101,7 +101,7 @@ static inline bool cpu_has_load_perf_global_ctrl(void)
               (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
 }
 
-static inline bool vmx_mpx_supported(void)
+static inline bool cpu_has_vmx_mpx(void)
 {
        return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
                (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
@@ -146,11 +146,6 @@ static inline bool vmx_umip_emulated(void)
                SECONDARY_EXEC_DESC;
 }
 
-static inline bool vmx_pku_supported(void)
-{
-       return boot_cpu_has(X86_FEATURE_PKU);
-}
-
 static inline bool cpu_has_vmx_rdtscp(void)
 {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -354,4 +349,22 @@ static inline bool cpu_has_vmx_intel_pt(void)
                (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
 }
 
+/*
+ * Processor Trace can operate in one of three modes:
+ *  a. system-wide: trace both host/guest and output to host buffer
+ *  b. host-only:   only trace host and output to host buffer
+ *  c. host-guest:  trace host and guest simultaneously and output to their
+ *                  respective buffers
+ *
+ * KVM currently only supports (a) and (c).
+ */
+static inline bool vmx_pt_mode_is_system(void)
+{
+       return pt_mode == PT_MODE_SYSTEM;
+}
+static inline bool vmx_pt_mode_is_host_guest(void)
+{
+       return pt_mode == PT_MODE_HOST_GUEST;
+}
+
 #endif /* __KVM_X86_VMX_CAPS_H */
index 6de47f2..e5f7a7e 100644 (file)
@@ -198,6 +198,13 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
 static inline void evmcs_touch_msr_bitmap(void) {}
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
+enum nested_evmptrld_status {
+       EVMPTRLD_DISABLED,
+       EVMPTRLD_SUCCEEDED,
+       EVMPTRLD_VMFAIL,
+       EVMPTRLD_ERROR,
+};
+
 bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
 uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
 int nested_enable_evmcs(struct kvm_vcpu *vcpu,
index e920d78..4ff859c 100644 (file)
@@ -224,7 +224,7 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
                return;
 
        kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
-       vmx->nested.hv_evmcs_vmptr = -1ull;
+       vmx->nested.hv_evmcs_vmptr = 0;
        vmx->nested.hv_evmcs = NULL;
 }
 
@@ -353,9 +353,8 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
                        to_vmx(vcpu)->nested.msrs.ept_caps &
                        VMX_EPT_EXECUTE_ONLY_BIT,
                        nested_ept_ad_enabled(vcpu),
-                       nested_ept_get_cr3(vcpu));
-       vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
-       vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
+                       nested_ept_get_eptp(vcpu));
+       vcpu->arch.mmu->get_guest_pgd     = nested_ept_get_eptp;
        vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
        vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
 
@@ -1910,20 +1909,21 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
  * This is an equivalent of the nested hypervisor executing the vmptrld
  * instruction.
  */
-static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
-                                                bool from_launch)
+static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
+       struct kvm_vcpu *vcpu, bool from_launch)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool evmcs_gpa_changed = false;
        u64 evmcs_gpa;
 
        if (likely(!vmx->nested.enlightened_vmcs_enabled))
-               return 1;
+               return EVMPTRLD_DISABLED;
 
        if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
-               return 1;
+               return EVMPTRLD_DISABLED;
 
-       if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
+       if (unlikely(!vmx->nested.hv_evmcs ||
+                    evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
                if (!vmx->nested.hv_evmcs)
                        vmx->nested.current_vmptr = -1ull;
 
@@ -1931,7 +1931,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
 
                if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
                                 &vmx->nested.hv_evmcs_map))
-                       return 0;
+                       return EVMPTRLD_ERROR;
 
                vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
 
@@ -1960,7 +1960,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
                if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
                    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
                        nested_release_evmcs(vcpu);
-                       return 0;
+                       return EVMPTRLD_VMFAIL;
                }
 
                vmx->nested.dirty_vmcs12 = true;
@@ -1989,21 +1989,13 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
                vmx->nested.hv_evmcs->hv_clean_fields &=
                        ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
 
-       return 1;
+       return EVMPTRLD_SUCCEEDED;
 }
 
 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       /*
-        * hv_evmcs may end up being not mapped after migration (when
-        * L2 was running), map it here to make sure vmcs12 changes are
-        * properly reflected.
-        */
-       if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
-               nested_vmx_handle_enlightened_vmptrld(vcpu, false);
-
        if (vmx->nested.hv_evmcs) {
                copy_vmcs12_to_enlightened(vmx);
                /* All fields are clean */
@@ -2474,9 +2466,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                         * If L1 use EPT, then L0 needs to execute INVEPT on
                         * EPTP02 instead of EPTP01. Therefore, delay TLB
                         * flush until vmcs02->eptp is fully updated by
-                        * KVM_REQ_LOAD_CR3. Note that this assumes
+                        * KVM_REQ_LOAD_MMU_PGD. Note that this assumes
                         * KVM_REQ_TLB_FLUSH is evaluated after
-                        * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
+                        * KVM_REQ_LOAD_MMU_PGD in vcpu_enter_guest().
                         */
                        kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
                }
@@ -2521,7 +2513,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        /*
         * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
         * on nested VM-Exit, which can occur without actually running L2 and
-        * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
+        * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
         * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
         * transition to HLT instead of running L2.
         */
@@ -2563,13 +2555,13 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
        return 0;
 }
 
-static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
+static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int maxphyaddr = cpuid_maxphyaddr(vcpu);
 
        /* Check for memory type validity */
-       switch (address & VMX_EPTP_MT_MASK) {
+       switch (new_eptp & VMX_EPTP_MT_MASK) {
        case VMX_EPTP_MT_UC:
                if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
                        return false;
@@ -2582,16 +2574,26 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
                return false;
        }
 
-       /* only 4 levels page-walk length are valid */
-       if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4))
+       /* Page-walk levels validity. */
+       switch (new_eptp & VMX_EPTP_PWL_MASK) {
+       case VMX_EPTP_PWL_5:
+               if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
+                       return false;
+               break;
+       case VMX_EPTP_PWL_4:
+               if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
+                       return false;
+               break;
+       default:
                return false;
+       }
 
        /* Reserved bits should not be set */
-       if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f)))
+       if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f)))
                return false;
 
        /* AD, if set, should be supported */
-       if (address & VMX_EPTP_AD_ENABLE_BIT) {
+       if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
                if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
                        return false;
        }
@@ -2640,7 +2642,7 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
                return -EINVAL;
 
        if (nested_cpu_has_ept(vmcs12) &&
-           CC(!valid_ept_address(vcpu, vmcs12->ept_pointer)))
+           CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
                return -EINVAL;
 
        if (nested_cpu_has_vmfunc(vmcs12)) {
@@ -2960,7 +2962,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
        /*
         * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
         * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
-        * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e.
+        * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
         * there is no need to preserve other bits or save/restore the field.
         */
        vmcs_writel(GUEST_RFLAGS, 0);
@@ -3052,6 +3054,27 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
        struct page *page;
        u64 hpa;
 
+       /*
+        * hv_evmcs may end up being not mapped after migration (when
+        * L2 was running), map it here to make sure vmcs12 changes are
+        * properly reflected.
+        */
+       if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) {
+               enum nested_evmptrld_status evmptrld_status =
+                       nested_vmx_handle_enlightened_vmptrld(vcpu, false);
+
+               if (evmptrld_status == EVMPTRLD_VMFAIL ||
+                   evmptrld_status == EVMPTRLD_ERROR) {
+                       pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
+                                            __func__);
+                       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                       vcpu->run->internal.suberror =
+                               KVM_INTERNAL_ERROR_EMULATION;
+                       vcpu->run->internal.ndata = 0;
+                       return false;
+               }
+       }
+
        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
                /*
                 * Translate L1 physical address to host physical
@@ -3315,12 +3338,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        enum nvmx_vmentry_status status;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
+       enum nested_evmptrld_status evmptrld_status;
 
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
+       evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
+       if (evmptrld_status == EVMPTRLD_ERROR) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
+       } else if (evmptrld_status == EVMPTRLD_VMFAIL) {
+               return nested_vmx_failInvalid(vcpu);
+       }
 
        if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
                return nested_vmx_failInvalid(vcpu);
@@ -3498,7 +3527,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
 }
 
 
-static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        gfn_t gfn;
@@ -3603,7 +3632,7 @@ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
                            vcpu->arch.exception.payload);
 }
 
-static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long exit_qual;
@@ -3679,8 +3708,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
                return 0;
        }
 
-       if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
-           nested_exit_on_intr(vcpu)) {
+       if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) {
                if (block_nested_events)
                        return -EBUSY;
                nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
@@ -4023,7 +4051,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
         *
         * If vmcs12 uses EPT, we need to execute this flush on EPTP01
         * and therefore we request the TLB flush to happen only after VMCS EPTP
-        * has been set by KVM_REQ_LOAD_CR3.
+        * has been set by KVM_REQ_LOAD_MMU_PGD.
         */
        if (enable_vpid &&
            (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
@@ -4328,17 +4356,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
        if (likely(!vmx->fail)) {
-               /*
-                * TODO: SDM says that with acknowledge interrupt on
-                * exit, bit 31 of the VM-exit interrupt information
-                * (valid interrupt) is always set to 1 on
-                * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
-                * need kvm_cpu_has_interrupt().  See the commit
-                * message for details.
-                */
-               if (nested_exit_intr_ack_set(vcpu) &&
-                   exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
-                   kvm_cpu_has_interrupt(vcpu)) {
+               if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+                   nested_exit_intr_ack_set(vcpu)) {
                        int irq = kvm_cpu_get_interrupt(vcpu);
                        WARN_ON(irq < 0);
                        vmcs12->vm_exit_intr_info = irq |
@@ -4382,7 +4401,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
  * Decode the memory-address operand of a vmx instruction, as recorded on an
  * exit caused by such an instruction (run by a guest hypervisor).
  * On success, returns 0. When the operand is invalid, returns 1 and throws
- * #UD or #GP.
+ * #UD, #GP, or #SS.
  */
 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
                        u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
@@ -4423,7 +4442,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
        if (base_is_valid)
                off += kvm_register_read(vcpu, base_reg);
        if (index_is_valid)
-               off += kvm_register_read(vcpu, index_reg)<<scaling;
+               off += kvm_register_read(vcpu, index_reg) << scaling;
        vmx_get_segment(vcpu, &s, seg_reg);
 
        /*
@@ -4602,7 +4621,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
        vmx->nested.vmcs02_initialized = false;
        vmx->nested.vmxon = true;
 
-       if (pt_mode == PT_MODE_HOST_GUEST) {
+       if (vmx_pt_mode_is_host_guest()) {
                vmx->pt_desc.guest.ctl = 0;
                pt_update_intercept_for_msr(vmx);
        }
@@ -5234,7 +5253,7 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
                                     struct vmcs12 *vmcs12)
 {
        u32 index = kvm_rcx_read(vcpu);
-       u64 address;
+       u64 new_eptp;
        bool accessed_dirty;
        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
 
@@ -5247,23 +5266,23 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
 
 
        if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
-                                    &address, index * 8, 8))
+                                    &new_eptp, index * 8, 8))
                return 1;
 
-       accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
+       accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT);
 
        /*
         * If the (L2) guest does a vmfunc to the currently
         * active ept pointer, we don't have to do anything else
         */
-       if (vmcs12->ept_pointer != address) {
-               if (!valid_ept_address(vcpu, address))
+       if (vmcs12->ept_pointer != new_eptp) {
+               if (!nested_vmx_check_eptp(vcpu, new_eptp))
                        return 1;
 
                kvm_mmu_unload(vcpu);
                mmu->ept_ad = accessed_dirty;
                mmu->mmu_role.base.ad_disabled = !accessed_dirty;
-               vmcs12->ept_pointer = address;
+               vmcs12->ept_pointer = new_eptp;
                /*
                 * TODO: Check what's the correct approach in case
                 * mmu reload fails. Currently, we just let the next
@@ -5524,8 +5543,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-       if (vmx->nested.nested_run_pending)
-               return false;
+       WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
        if (unlikely(vmx->fail)) {
                trace_kvm_nested_vmenter_failed(
@@ -5534,19 +5552,6 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
                return true;
        }
 
-       /*
-        * The host physical addresses of some pages of guest memory
-        * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
-        * Page). The CPU may write to these pages via their host
-        * physical address while L2 is running, bypassing any
-        * address-translation-based dirty tracking (e.g. EPT write
-        * protection).
-        *
-        * Mark them dirty on every exit from L2 to prevent them from
-        * getting out of sync with dirty tracking.
-        */
-       nested_mark_vmcs12_pages_dirty(vcpu);
-
        trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
                                vmcs_readl(EXIT_QUALIFICATION),
                                vmx->idt_vectoring_info,
@@ -5627,7 +5632,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
        case EXIT_REASON_MWAIT_INSTRUCTION:
                return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
        case EXIT_REASON_MONITOR_TRAP_FLAG:
-               return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
+               return nested_cpu_has_mtf(vmcs12);
        case EXIT_REASON_MONITOR_INSTRUCTION:
                return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
        case EXIT_REASON_PAUSE_INSTRUCTION:
@@ -5904,10 +5909,12 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
                set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
        } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
                /*
-                * Sync eVMCS upon entry as we may not have
-                * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
+                * nested_vmx_handle_enlightened_vmptrld() cannot be called
+                * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
+                * restored yet. EVMCS will be mapped from
+                * nested_get_vmcs12_pages().
                 */
-               vmx->nested.need_vmcs12_to_shadow_sync = true;
+               kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
        } else {
                return -EINVAL;
        }
@@ -6129,11 +6136,13 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
                /* nested EPT: emulate EPT also to L1 */
                msrs->secondary_ctls_high |=
                        SECONDARY_EXEC_ENABLE_EPT;
-               msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
-                        VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
-               if (cpu_has_vmx_ept_execute_only())
-                       msrs->ept_caps |=
-                               VMX_EPT_EXECUTE_ONLY_BIT;
+               msrs->ept_caps =
+                       VMX_EPT_PAGE_WALK_4_BIT |
+                       VMX_EPT_PAGE_WALK_5_BIT |
+                       VMX_EPTP_WB_BIT |
+                       VMX_EPT_INVEPT_BIT |
+                       VMX_EPT_EXECUTE_ONLY_BIT;
+
                msrs->ept_caps &= ept_caps;
                msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
                        VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
index 9aeda46..f70968b 100644 (file)
@@ -33,6 +33,7 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
                        u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
+void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
                                 int size);
 
@@ -60,7 +61,7 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
                vmx->nested.hv_evmcs;
 }
 
-static inline unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu)
 {
        /* return the page table to be shadowed - in our case, EPT12 */
        return get_vmcs12(vcpu)->ept_pointer;
@@ -68,7 +69,7 @@ static inline unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
 
 static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
 {
-       return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
+       return nested_ept_get_eptp(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
 }
 
 /*
index fd21cdb..7c85773 100644 (file)
@@ -263,9 +263,15 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        if (!msr_info->host_initiated)
                                data = (s64)(s32)data;
                        pmc->counter += data - pmc_read_counter(pmc);
+                       if (pmc->perf_event)
+                               perf_event_period(pmc->perf_event,
+                                                 get_sample_period(pmc, data));
                        return 0;
                } else if ((pmc = get_fixed_pmc(pmu, msr))) {
                        pmc->counter += data - pmc_read_counter(pmc);
+                       if (pmc->perf_event)
+                               perf_event_period(pmc->perf_event,
+                                                 get_sample_period(pmc, data));
                        return 0;
                } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
                        if (data == pmc->eventsel)
@@ -329,7 +335,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
        pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
                        & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
                            MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
-       if (kvm_x86_ops->pt_supported())
+       if (vmx_pt_mode_is_host_guest())
                pmu->global_ovf_ctrl_mask &=
                                ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
 
index 81ada2c..ca20651 100644 (file)
@@ -135,12 +135,12 @@ SYM_FUNC_START(__vmx_vcpu_run)
        cmpb $0, %bl
 
        /* Load guest registers.  Don't clobber flags. */
-       mov VCPU_RBX(%_ASM_AX), %_ASM_BX
        mov VCPU_RCX(%_ASM_AX), %_ASM_CX
        mov VCPU_RDX(%_ASM_AX), %_ASM_DX
+       mov VCPU_RBX(%_ASM_AX), %_ASM_BX
+       mov VCPU_RBP(%_ASM_AX), %_ASM_BP
        mov VCPU_RSI(%_ASM_AX), %_ASM_SI
        mov VCPU_RDI(%_ASM_AX), %_ASM_DI
-       mov VCPU_RBP(%_ASM_AX), %_ASM_BP
 #ifdef CONFIG_X86_64
        mov VCPU_R8 (%_ASM_AX),  %r8
        mov VCPU_R9 (%_ASM_AX),  %r9
@@ -168,12 +168,12 @@ SYM_FUNC_START(__vmx_vcpu_run)
 
        /* Save all guest registers, including RAX from the stack */
        __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX)
-       mov %_ASM_BX,   VCPU_RBX(%_ASM_AX)
        mov %_ASM_CX,   VCPU_RCX(%_ASM_AX)
        mov %_ASM_DX,   VCPU_RDX(%_ASM_AX)
+       mov %_ASM_BX,   VCPU_RBX(%_ASM_AX)
+       mov %_ASM_BP,   VCPU_RBP(%_ASM_AX)
        mov %_ASM_SI,   VCPU_RSI(%_ASM_AX)
        mov %_ASM_DI,   VCPU_RDI(%_ASM_AX)
-       mov %_ASM_BP,   VCPU_RBP(%_ASM_AX)
 #ifdef CONFIG_X86_64
        mov %r8,  VCPU_R8 (%_ASM_AX)
        mov %r9,  VCPU_R9 (%_ASM_AX)
@@ -197,12 +197,12 @@ SYM_FUNC_START(__vmx_vcpu_run)
         * free.  RSP and RAX are exempt as RSP is restored by hardware during
         * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail.
         */
-1:     xor %ebx, %ebx
-       xor %ecx, %ecx
+1:     xor %ecx, %ecx
        xor %edx, %edx
+       xor %ebx, %ebx
+       xor %ebp, %ebp
        xor %esi, %esi
        xor %edi, %edi
-       xor %ebp, %ebp
 #ifdef CONFIG_X86_64
        xor %r8d,  %r8d
        xor %r9d,  %r9d
index 40b1e61..a7dd678 100644 (file)
@@ -435,7 +435,6 @@ static const struct kvm_vmx_segment_field {
        VMX_SEGMENT_FIELD(LDTR),
 };
 
-u64 host_efer;
 static unsigned long host_idt_base;
 
 /*
@@ -656,53 +655,16 @@ static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr,
        return ret;
 }
 
-void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
-{
-       vmcs_clear(loaded_vmcs->vmcs);
-       if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
-               vmcs_clear(loaded_vmcs->shadow_vmcs);
-       loaded_vmcs->cpu = -1;
-       loaded_vmcs->launched = 0;
-}
-
 #ifdef CONFIG_KEXEC_CORE
-/*
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
- */
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-
-static inline void crash_enable_local_vmclear(int cpu)
-{
-       cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline void crash_disable_local_vmclear(int cpu)
-{
-       cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline int crash_local_vmclear_enabled(int cpu)
-{
-       return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
 static void crash_vmclear_local_loaded_vmcss(void)
 {
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v;
 
-       if (!crash_local_vmclear_enabled(cpu))
-               return;
-
        list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
                            loaded_vmcss_on_cpu_link)
                vmcs_clear(v->vmcs);
 }
-#else
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
 #endif /* CONFIG_KEXEC_CORE */
 
 static void __loaded_vmcs_clear(void *arg)
@@ -714,19 +676,24 @@ static void __loaded_vmcs_clear(void *arg)
                return; /* vcpu migration can race with cpu offline */
        if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;
-       crash_disable_local_vmclear(cpu);
+
+       vmcs_clear(loaded_vmcs->vmcs);
+       if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+               vmcs_clear(loaded_vmcs->shadow_vmcs);
+
        list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
 
        /*
-        * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
-        * is before setting loaded_vmcs->vcpu to -1 which is done in
-        * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
-        * then adds the vmcs into percpu list before it is deleted.
+        * Ensure all writes to loaded_vmcs, including deleting it from its
+        * current percpu list, complete before setting loaded_vmcs->cpu to
+        * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
+        * and add loaded_vmcs to its percpu list before it's deleted from this
+        * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
         */
        smp_wmb();
 
-       loaded_vmcs_init(loaded_vmcs);
-       crash_enable_local_vmclear(cpu);
+       loaded_vmcs->cpu = -1;
+       loaded_vmcs->launched = 0;
 }
 
 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -810,7 +777,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
        if (to_vmx(vcpu)->rmode.vm86_active)
                eb = ~0;
        if (enable_ept)
-               eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+               eb &= ~(1u << PF_VECTOR);
 
        /* When we are running a nested L2 guest and L1 specified for it a
         * certain exception bitmap, we must trap the same exceptions and pass
@@ -1061,7 +1028,7 @@ static unsigned long segment_base(u16 selector)
 
 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
 {
-       return (pt_mode == PT_MODE_HOST_GUEST) &&
+       return vmx_pt_mode_is_host_guest() &&
               !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
 }
 
@@ -1095,7 +1062,7 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
 
 static void pt_guest_enter(struct vcpu_vmx *vmx)
 {
-       if (pt_mode == PT_MODE_SYSTEM)
+       if (vmx_pt_mode_is_system())
                return;
 
        /*
@@ -1112,7 +1079,7 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
 
 static void pt_guest_exit(struct vcpu_vmx *vmx)
 {
-       if (pt_mode == PT_MODE_SYSTEM)
+       if (vmx_pt_mode_is_system())
                return;
 
        if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
@@ -1345,18 +1312,17 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
        if (!already_loaded) {
                loaded_vmcs_clear(vmx->loaded_vmcs);
                local_irq_disable();
-               crash_disable_local_vmclear(cpu);
 
                /*
-                * Read loaded_vmcs->cpu should be before fetching
-                * loaded_vmcs->loaded_vmcss_on_cpu_link.
-                * See the comments in __loaded_vmcs_clear().
+                * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+                * this cpu's percpu list, otherwise it may not yet be deleted
+                * from its previous cpu's percpu list.  Pairs with the
+                * smp_wmb() in __loaded_vmcs_clear().
                 */
                smp_rmb();
 
                list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
-               crash_enable_local_vmclear(cpu);
                local_irq_enable();
        }
 
@@ -1689,16 +1655,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
        vmx_clear_hlt(vcpu);
 }
 
-static bool vmx_rdtscp_supported(void)
-{
-       return cpu_has_vmx_rdtscp();
-}
-
-static bool vmx_invpcid_supported(void)
-{
-       return cpu_has_vmx_invpcid();
-}
-
 /*
  * Swap MSR entry in host/guest MSR entry array.
  */
@@ -1906,24 +1862,24 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                                                        &msr_info->data);
                break;
        case MSR_IA32_RTIT_CTL:
-               if (pt_mode != PT_MODE_HOST_GUEST)
+               if (!vmx_pt_mode_is_host_guest())
                        return 1;
                msr_info->data = vmx->pt_desc.guest.ctl;
                break;
        case MSR_IA32_RTIT_STATUS:
-               if (pt_mode != PT_MODE_HOST_GUEST)
+               if (!vmx_pt_mode_is_host_guest())
                        return 1;
                msr_info->data = vmx->pt_desc.guest.status;
                break;
        case MSR_IA32_RTIT_CR3_MATCH:
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        !intel_pt_validate_cap(vmx->pt_desc.caps,
                                                PT_CAP_cr3_filtering))
                        return 1;
                msr_info->data = vmx->pt_desc.guest.cr3_match;
                break;
        case MSR_IA32_RTIT_OUTPUT_BASE:
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_topa_output) &&
                         !intel_pt_validate_cap(vmx->pt_desc.caps,
@@ -1932,7 +1888,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = vmx->pt_desc.guest.output_base;
                break;
        case MSR_IA32_RTIT_OUTPUT_MASK:
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_topa_output) &&
                         !intel_pt_validate_cap(vmx->pt_desc.caps,
@@ -1942,7 +1898,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
                index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_num_address_ranges)))
                        return 1;
@@ -2148,7 +2104,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                return vmx_set_vmx_msr(vcpu, msr_index, data);
        case MSR_IA32_RTIT_CTL:
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        vmx_rtit_ctl_check(vcpu, data) ||
                        vmx->nested.vmxon)
                        return 1;
@@ -2264,18 +2220,33 @@ static __init int vmx_disabled_by_bios(void)
               !boot_cpu_has(X86_FEATURE_VMX);
 }
 
-static void kvm_cpu_vmxon(u64 addr)
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
 {
+       u64 msr;
+
        cr4_set_bits(X86_CR4_VMXE);
        intel_pt_handle_vmx(1);
 
-       asm volatile ("vmxon %0" : : "m"(addr));
+       asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+                         _ASM_EXTABLE(1b, %l[fault])
+                         : : [vmxon_pointer] "m"(vmxon_pointer)
+                         : : fault);
+       return 0;
+
+fault:
+       WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+                 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+       intel_pt_handle_vmx(0);
+       cr4_clear_bits(X86_CR4_VMXE);
+
+       return -EFAULT;
 }
 
 static int hardware_enable(void)
 {
        int cpu = raw_smp_processor_id();
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+       int r;
 
        if (cr4_read_shadow() & X86_CR4_VMXE)
                return -EBUSY;
@@ -2292,18 +2263,10 @@ static int hardware_enable(void)
        INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
        spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 
-       /*
-        * Now we can enable the vmclear operation in kdump
-        * since the loaded_vmcss_on_cpu list on this cpu
-        * has been initialized.
-        *
-        * Though the cpu is not in VMX operation now, there
-        * is no problem to enable the vmclear operation
-        * for the loaded_vmcss_on_cpu list is empty!
-        */
-       crash_enable_local_vmclear(cpu);
+       r = kvm_cpu_vmxon(phys_addr);
+       if (r)
+               return r;
 
-       kvm_cpu_vmxon(phys_addr);
        if (enable_ept)
                ept_sync_global();
 
@@ -2603,9 +2566,12 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
        if (!loaded_vmcs->vmcs)
                return -ENOMEM;
 
+       vmcs_clear(loaded_vmcs->vmcs);
+
        loaded_vmcs->shadow_vmcs = NULL;
        loaded_vmcs->hv_timer_soft_disabled = false;
-       loaded_vmcs_init(loaded_vmcs);
+       loaded_vmcs->cpu = -1;
+       loaded_vmcs->launched = 0;
 
        if (cpu_has_vmx_msr_bitmap()) {
                loaded_vmcs->msr_bitmap = (unsigned long *)
@@ -2987,9 +2953,8 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static int get_ept_level(struct kvm_vcpu *vcpu)
 {
-       /* Nested EPT currently only supports 4-level walks. */
        if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
-               return 4;
+               return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
        if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
                return 5;
        return 4;
@@ -3009,7 +2974,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
        return eptp;
 }
 
-void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
@@ -4026,7 +3991,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 
        u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
 
-       if (pt_mode == PT_MODE_SYSTEM)
+       if (vmx_pt_mode_is_system())
                exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
        if (!cpu_need_virtualize_apic_accesses(vcpu))
                exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
@@ -4081,7 +4046,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
                }
        }
 
-       if (vmx_rdtscp_supported()) {
+       if (cpu_has_vmx_rdtscp()) {
                bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
                if (!rdtscp_enabled)
                        exec_control &= ~SECONDARY_EXEC_RDTSCP;
@@ -4096,7 +4061,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
                }
        }
 
-       if (vmx_invpcid_supported()) {
+       if (cpu_has_vmx_invpcid()) {
                /* Exposing INVPCID only when PCID is exposed */
                bool invpcid_enabled =
                        guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
@@ -4267,7 +4232,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
        if (cpu_has_vmx_encls_vmexit())
                vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
 
-       if (pt_mode == PT_MODE_HOST_GUEST) {
+       if (vmx_pt_mode_is_host_guest()) {
                memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
                /* Bit[6~0] are forced to 1, writes are ignored. */
                vmx->pt_desc.guest.output_mask = 0x7F;
@@ -4495,8 +4460,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-       return (!to_vmx(vcpu)->nested.nested_run_pending &&
-               vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+       if (to_vmx(vcpu)->nested.nested_run_pending)
+               return false;
+
+       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+               return true;
+
+       return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
@@ -4552,7 +4522,6 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
        case GP_VECTOR:
        case MF_VECTOR:
                return true;
-       break;
        }
        return false;
 }
@@ -5329,7 +5298,6 @@ static void vmx_enable_tdp(void)
                VMX_EPT_RWX_MASK, 0ull);
 
        ept_set_mmio_spte_mask();
-       kvm_enable_tdp();
 }
 
 /*
@@ -5862,8 +5830,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
 
-       if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
-               return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+       if (is_guest_mode(vcpu)) {
+               /*
+                * The host physical addresses of some pages of guest memory
+                * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+                * Page). The CPU may write to these pages via their host
+                * physical address while L2 is running, bypassing any
+                * address-translation-based dirty tracking (e.g. EPT write
+                * protection).
+                *
+                * Mark them dirty on every exit from L2 to prevent them from
+                * getting out of sync with dirty tracking.
+                */
+               nested_mark_vmcs12_pages_dirty(vcpu);
+
+               if (nested_vmx_exit_reflected(vcpu, exit_reason))
+                       return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+       }
 
        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
                dump_vmcs();
@@ -6223,15 +6206,13 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
        vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
        /* if exit due to PF check for async PF */
-       if (is_page_fault(vmx->exit_intr_info))
+       if (is_page_fault(vmx->exit_intr_info)) {
                vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
-
        /* Handle machine checks before interrupts are enabled */
-       if (is_machine_check(vmx->exit_intr_info))
+       } else if (is_machine_check(vmx->exit_intr_info)) {
                kvm_machine_check();
-
        /* We need to handle NMIs before interrupts are enabled */
-       if (is_nmi(vmx->exit_intr_info)) {
+       } else if (is_nmi(vmx->exit_intr_info)) {
                kvm_before_interrupt(&vmx->vcpu);
                asm("int $2");
                kvm_after_interrupt(&vmx->vcpu);
@@ -6317,11 +6298,6 @@ static bool vmx_has_emulated_msr(int index)
        }
 }
 
-static bool vmx_pt_supported(void)
-{
-       return pt_mode == PT_MODE_HOST_GUEST;
-}
-
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -6567,7 +6543,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        pt_guest_enter(vmx);
 
-       atomic_switch_perf_msrs(vmx);
+       if (vcpu_to_pmu(vcpu)->version)
+               atomic_switch_perf_msrs(vmx);
        atomic_switch_umwait_control_msr(vmx);
 
        if (enable_preemption_timer)
@@ -6684,20 +6661,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
        vmx_complete_interrupts(vmx);
 }
 
-static struct kvm *vmx_vm_alloc(void)
-{
-       struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
-                                           GFP_KERNEL_ACCOUNT | __GFP_ZERO,
-                                           PAGE_KERNEL);
-       return &kvm_vmx->kvm;
-}
-
-static void vmx_vm_free(struct kvm *kvm)
-{
-       kfree(kvm->arch.hyperv.hv_pa_pg);
-       vfree(to_kvm_vmx(kvm));
-}
-
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6900,17 +6863,24 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
        u8 cache;
        u64 ipat = 0;
 
-       /* For VT-d and EPT combination
-        * 1. MMIO: always map as UC
-        * 2. EPT with VT-d:
-        *   a. VT-d without snooping control feature: can't guarantee the
-        *      result, try to trust guest.
-        *   b. VT-d with snooping control feature: snooping control feature of
-        *      VT-d engine can guarantee the cache correctness. Just set it
-        *      to WB to keep consistent with host. So the same as item 3.
-        * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
-        *    consistent with host MTRR
+       /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
+        * memory aliases with conflicting memory types and sometimes MCEs.
+        * We have to be careful as to what are honored and when.
+        *
+        * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
+        * UC.  The effective memory type is UC or WC depending on guest PAT.
+        * This was historically the source of MCEs and we want to be
+        * conservative.
+        *
+        * When there is no need to deal with noncoherent DMA (e.g., no VT-d
+        * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
+        * EPT memory type is set to WB.  The effective memory type is forced
+        * WB.
+        *
+        * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
+        * EPT memory type is used to emulate guest CD/MTRR.
         */
+
        if (is_mmio) {
                cache = MTRR_TYPE_UNCACHABLE;
                goto exit;
@@ -6937,15 +6907,6 @@ exit:
        return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
 }
 
-static int vmx_get_lpage_level(void)
-{
-       if (enable_ept && !cpu_has_vmx_ept_1g_page())
-               return PT_DIRECTORY_LEVEL;
-       else
-               /* For shadow and EPT supported 1GB page */
-               return PT_PDPE_LEVEL;
-}
-
 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
 {
        /*
@@ -7136,10 +7097,37 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
        }
 }
 
-static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+static __init void vmx_set_cpu_caps(void)
 {
-       if (func == 1 && nested)
-               entry->ecx |= feature_bit(VMX);
+       kvm_set_cpu_caps();
+
+       /* CPUID 0x1 */
+       if (nested)
+               kvm_cpu_cap_set(X86_FEATURE_VMX);
+
+       /* CPUID 0x7 */
+       if (kvm_mpx_supported())
+               kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
+       if (cpu_has_vmx_invpcid())
+               kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
+       if (vmx_pt_mode_is_host_guest())
+               kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+
+       /* PKU is not yet implemented for shadow paging. */
+       if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE))
+               kvm_cpu_cap_check_and_set(X86_FEATURE_PKU);
+
+       if (vmx_umip_emulated())
+               kvm_cpu_cap_set(X86_FEATURE_UMIP);
+
+       /* CPUID 0xD.1 */
+       supported_xss = 0;
+       if (!vmx_xsaves_supported())
+               kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
+
+       /* CPUID 0x80000001 */
+       if (!cpu_has_vmx_rdtscp())
+               kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
 }
 
 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -7183,10 +7171,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
 
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
-                              enum x86_intercept_stage stage)
+                              enum x86_intercept_stage stage,
+                              struct x86_exception *exception)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 
        switch (info->intercept) {
        /*
@@ -7195,8 +7183,8 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
         */
        case x86_intercept_rdtscp:
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
-                       ctxt->exception.vector = UD_VECTOR;
-                       ctxt->exception.error_code_valid = false;
+                       exception->vector = UD_VECTOR;
+                       exception->error_code_valid = false;
                        return X86EMUL_PROPAGATE_FAULT;
                }
                break;
@@ -7307,7 +7295,8 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_slot_enable_log_dirty(struct kvm *kvm,
                                     struct kvm_memory_slot *slot)
 {
-       kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
+       if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
+               kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
        kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
 }
 
@@ -7661,9 +7650,7 @@ static __init int hardware_setup(void)
 {
        unsigned long host_bndcfgs;
        struct desc_ptr dt;
-       int r, i;
-
-       rdmsrl_safe(MSR_EFER, &host_efer);
+       int r, i, ept_lpage_level;
 
        store_idt(&dt);
        host_idt_base = dt.address;
@@ -7682,6 +7669,10 @@ static __init int hardware_setup(void)
                WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
        }
 
+       if (!cpu_has_vmx_mpx())
+               supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+                                   XFEATURE_MASK_BNDCSR);
+
        if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
            !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
                enable_vpid = 0;
@@ -7715,9 +7706,6 @@ static __init int hardware_setup(void)
        if (!cpu_has_vmx_tpr_shadow())
                kvm_x86_ops->update_cr8_intercept = NULL;
 
-       if (enable_ept && !cpu_has_vmx_ept_2m_page())
-               kvm_disable_largepages();
-
 #if IS_ENABLED(CONFIG_HYPERV)
        if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
            && enable_ept) {
@@ -7750,8 +7738,16 @@ static __init int hardware_setup(void)
 
        if (enable_ept)
                vmx_enable_tdp();
+
+       if (!enable_ept)
+               ept_lpage_level = 0;
+       else if (cpu_has_vmx_ept_1g_page())
+               ept_lpage_level = PT_PDPE_LEVEL;
+       else if (cpu_has_vmx_ept_2m_page())
+               ept_lpage_level = PT_DIRECTORY_LEVEL;
        else
-               kvm_disable_tdp();
+               ept_lpage_level = PT_PAGE_TABLE_LEVEL;
+       kvm_configure_mmu(enable_ept, ept_lpage_level);
 
        /*
         * Only enable PML when hardware supports PML feature, and both EPT
@@ -7815,6 +7811,8 @@ static __init int hardware_setup(void)
                        return r;
        }
 
+       vmx_set_cpu_caps();
+
        r = alloc_kvm_area();
        if (r)
                nested_vmx_hardware_unsetup();
@@ -7848,9 +7846,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .cpu_has_accelerated_tpr = report_flexpriority,
        .has_emulated_msr = vmx_has_emulated_msr,
 
+       .vm_size = sizeof(struct kvm_vmx),
        .vm_init = vmx_vm_init,
-       .vm_alloc = vmx_vm_alloc,
-       .vm_free = vmx_vm_free,
 
        .vcpu_create = vmx_create_vcpu,
        .vcpu_free = vmx_free_vcpu,
@@ -7872,7 +7869,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
        .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
        .set_cr0 = vmx_set_cr0,
-       .set_cr3 = vmx_set_cr3,
        .set_cr4 = vmx_set_cr4,
        .set_efer = vmx_set_efer,
        .get_idt = vmx_get_idt,
@@ -7928,29 +7924,17 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
        .get_exit_info = vmx_get_exit_info,
 
-       .get_lpage_level = vmx_get_lpage_level,
-
        .cpuid_update = vmx_cpuid_update,
 
-       .rdtscp_supported = vmx_rdtscp_supported,
-       .invpcid_supported = vmx_invpcid_supported,
-
-       .set_supported_cpuid = vmx_set_supported_cpuid,
-
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
        .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
        .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
 
-       .set_tdp_cr3 = vmx_set_cr3,
+       .load_mmu_pgd = vmx_load_mmu_pgd,
 
        .check_intercept = vmx_check_intercept,
        .handle_exit_irqoff = vmx_handle_exit_irqoff,
-       .mpx_supported = vmx_mpx_supported,
-       .xsaves_supported = vmx_xsaves_supported,
-       .umip_emulated = vmx_umip_emulated,
-       .pt_supported = vmx_pt_supported,
-       .pku_supported = vmx_pku_supported,
 
        .request_immediate_exit = vmx_request_immediate_exit,
 
index e64da06..79d38f4 100644 (file)
@@ -12,7 +12,6 @@
 #include "vmcs.h"
 
 extern const u32 vmx_msr_index[];
-extern u64 host_efer;
 
 extern u32 get_umwait_control_msr(void);
 
@@ -335,9 +334,9 @@ u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
 void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3);
 void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -452,7 +451,7 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
 static inline u32 vmx_vmentry_ctrl(void)
 {
        u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
-       if (pt_mode == PT_MODE_SYSTEM)
+       if (vmx_pt_mode_is_system())
                vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
                                  VM_ENTRY_LOAD_IA32_RTIT_CTL);
        /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
@@ -463,7 +462,7 @@ static inline u32 vmx_vmentry_ctrl(void)
 static inline u32 vmx_vmexit_ctrl(void)
 {
        u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
-       if (pt_mode == PT_MODE_SYSTEM)
+       if (vmx_pt_mode_is_system())
                vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
                                 VM_EXIT_CLEAR_IA32_RTIT_CTL);
        /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
@@ -493,7 +492,6 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
 void free_vmcs(struct vmcs *vmcs);
 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
-void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs);
 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
 
 static inline struct vmcs *alloc_vmcs(bool shadow)
index 5de2006..1b6d9ac 100644 (file)
@@ -22,6 +22,7 @@
 #include "i8254.h"
 #include "tss.h"
 #include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
 #include "x86.h"
 #include "cpuid.h"
 #include "pmu.h"
@@ -81,7 +82,7 @@ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
 
 #define emul_to_vcpu(ctxt) \
-       container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+       ((struct kvm_vcpu *)(ctxt)->vcpu)
 
 /* EFER defaults:
  * - enable syscall per default because its emulated by KVM
@@ -180,7 +181,17 @@ struct kvm_shared_msrs {
 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
 static struct kvm_shared_msrs __percpu *shared_msrs;
 
+#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+                               | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+                               | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+                               | XFEATURE_MASK_PKRU)
+
+u64 __read_mostly host_efer;
+EXPORT_SYMBOL_GPL(host_efer);
+
 static u64 __read_mostly host_xss;
+u64 __read_mostly supported_xss;
+EXPORT_SYMBOL_GPL(supported_xss);
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -226,10 +237,25 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 };
 
 u64 __read_mostly host_xcr0;
+u64 __read_mostly supported_xcr0;
+EXPORT_SYMBOL_GPL(supported_xcr0);
 
 struct kmem_cache *x86_fpu_cache;
 EXPORT_SYMBOL_GPL(x86_fpu_cache);
 
+static struct kmem_cache *x86_emulator_cache;
+
+static struct kmem_cache *kvm_alloc_emulator_cache(void)
+{
+       unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
+       unsigned int size = sizeof(struct x86_emulate_ctxt);
+
+       return kmem_cache_create_usercopy("x86_emulator", size,
+                                         __alignof__(struct x86_emulate_ctxt),
+                                         SLAB_ACCOUNT, useroffset,
+                                         size - useroffset, NULL);
+}
+
 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
@@ -350,6 +376,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        }
 
        kvm_lapic_set_base(vcpu, msr_info->data);
+       kvm_recalculate_apic_map(vcpu->kvm);
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
@@ -903,10 +930,10 @@ static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
 {
        u64 reserved_bits = __cr4_reserved_bits(cpu_has, c);
 
-       if (cpuid_ecx(0x7) & feature_bit(LA57))
+       if (kvm_cpu_cap_has(X86_FEATURE_LA57))
                reserved_bits &= ~X86_CR4_LA57;
 
-       if (kvm_x86_ops->umip_emulated())
+       if (kvm_cpu_cap_has(X86_FEATURE_UMIP))
                reserved_bits &= ~X86_CR4_UMIP;
 
        return reserved_bits;
@@ -1558,8 +1585,12 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
                ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
                ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
 
+               data &= ~(1 << 12);
+               kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
                kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
-               return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
+               kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
+               trace_kvm_apic_write(APIC_ICR, (u32)data);
+               return 0;
        }
 
        return 1;
@@ -1568,11 +1599,12 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
 enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
 {
        u32 msr = kvm_rcx_read(vcpu);
-       u64 data = kvm_read_edx_eax(vcpu);
+       u64 data;
        int ret = 0;
 
        switch (msr) {
        case APIC_BASE_MSR + (APIC_ICR >> 4):
+               data = kvm_read_edx_eax(vcpu);
                ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
                break;
        default:
@@ -2523,7 +2555,7 @@ static void kvmclock_sync_fn(struct work_struct *work)
 static bool can_set_mci_status(struct kvm_vcpu *vcpu)
 {
        /* McStatusWrEn enabled? */
-       if (guest_cpuid_is_amd(vcpu))
+       if (guest_cpuid_is_amd_or_hygon(vcpu))
                return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
 
        return false;
@@ -2798,12 +2830,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
                        return 1;
                /*
-                * We do support PT if kvm_x86_ops->pt_supported(), but we do
-                * not support IA32_XSS[bit 8]. Guests will have to use
-                * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT
-                * MSRs.
+                * KVM supports exposing PT to the guest, but does not support
+                * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
+                * XSAVES/XRSTORS to save/restore PT MSRs.
                 */
-               if (data != 0)
+               if (data & ~supported_xss)
                        return 1;
                vcpu->arch.ia32_xss = data;
                break;
@@ -3077,7 +3108,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
                return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
-               break;
        case MSR_IA32_TSCDEADLINE:
                msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
                break;
@@ -3160,7 +3190,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                return kvm_hv_get_msr_common(vcpu,
                                             msr_info->index, &msr_info->data,
                                             msr_info->host_initiated);
-               break;
        case MSR_IA32_BBL_CR_CTL3:
                /* This legacy MSR exists but isn't fully documented in current
                 * silicon.  It is however accessed by winxp in very narrow
@@ -3464,7 +3493,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
                r = 0;
                break;
        }
-       case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+       case KVM_X86_GET_MCE_CAP_SUPPORTED:
                r = -EFAULT;
                if (copy_to_user(argp, &kvm_mce_cap_supported,
                                 sizeof(kvm_mce_cap_supported)))
@@ -3496,9 +3525,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
        case KVM_GET_MSRS:
                r = msr_io(NULL, argp, do_get_msr_feature, 1);
                break;
-       }
        default:
                r = -EINVAL;
+               break;
        }
 out:
        return r;
@@ -4101,8 +4130,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
                 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
                 * with old userspace.
                 */
-               if (xstate_bv & ~kvm_supported_xcr0() ||
-                       mxcsr & ~mxcsr_feature_mask)
+               if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
                        return -EINVAL;
                load_xsave(vcpu, (u8 *)guest_xsave->region);
        } else {
@@ -4761,77 +4789,13 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
        return 0;
 }
 
-/**
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
- * @kvm: kvm instance
- * @log: slot id and address to which we copy the log
- *
- * Steps 1-4 below provide general overview of dirty page logging. See
- * kvm_get_dirty_log_protect() function description for additional details.
- *
- * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
- * always flush the TLB (step 4) even if previous step failed  and the dirty
- * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
- * does not preclude user space subsequent dirty log read. Flushing TLB ensures
- * writes will be marked dirty for next log read.
- *
- *   1. Take a snapshot of the bit and clear it if needed.
- *   2. Write protect the corresponding page.
- *   3. Copy the snapshot to the userspace.
- *   4. Flush TLB's if needed.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
-       bool flush = false;
-       int r;
-
-       mutex_lock(&kvm->slots_lock);
-
        /*
         * Flush potentially hardware-cached dirty pages to dirty_bitmap.
         */
        if (kvm_x86_ops->flush_log_dirty)
                kvm_x86_ops->flush_log_dirty(kvm);
-
-       r = kvm_get_dirty_log_protect(kvm, log, &flush);
-
-       /*
-        * All the TLBs can be flushed out of mmu lock, see the comments in
-        * kvm_mmu_slot_remove_write_access().
-        */
-       lockdep_assert_held(&kvm->slots_lock);
-       if (flush)
-               kvm_flush_remote_tlbs(kvm);
-
-       mutex_unlock(&kvm->slots_lock);
-       return r;
-}
-
-int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
-{
-       bool flush = false;
-       int r;
-
-       mutex_lock(&kvm->slots_lock);
-
-       /*
-        * Flush potentially hardware-cached dirty pages to dirty_bitmap.
-        */
-       if (kvm_x86_ops->flush_log_dirty)
-               kvm_x86_ops->flush_log_dirty(kvm);
-
-       r = kvm_clear_dirty_log_protect(kvm, log, &flush);
-
-       /*
-        * All the TLBs can be flushed out of mmu lock, see the comments in
-        * kvm_mmu_slot_remove_write_access().
-        */
-       lockdep_assert_held(&kvm->slots_lock);
-       if (flush)
-               kvm_flush_remote_tlbs(kvm);
-
-       mutex_unlock(&kvm->slots_lock);
-       return r;
 }
 
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
@@ -5260,28 +5224,28 @@ static void kvm_init_msr_list(void)
                                continue;
                        break;
                case MSR_TSC_AUX:
-                       if (!kvm_x86_ops->rdtscp_supported())
+                       if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
                                continue;
                        break;
                case MSR_IA32_RTIT_CTL:
                case MSR_IA32_RTIT_STATUS:
-                       if (!kvm_x86_ops->pt_supported())
+                       if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
                                continue;
                        break;
                case MSR_IA32_RTIT_CR3_MATCH:
-                       if (!kvm_x86_ops->pt_supported() ||
+                       if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
                            !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
                                continue;
                        break;
                case MSR_IA32_RTIT_OUTPUT_BASE:
                case MSR_IA32_RTIT_OUTPUT_MASK:
-                       if (!kvm_x86_ops->pt_supported() ||
+                       if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
                                (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
                                 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
                                continue;
                        break;
                case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
-                       if (!kvm_x86_ops->pt_supported() ||
+                       if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
                                msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
                                intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
                                continue;
@@ -5738,7 +5702,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
        int handled, ret;
        bool write = ops->write;
        struct kvm_mmio_fragment *frag;
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
 
        /*
         * If the exit was due to a NPF we may already have a GPA.
@@ -5747,10 +5711,9 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
         * operation using rep will only have the initial GPA from the NPF
         * occurred.
         */
-       if (vcpu->arch.gpa_available &&
-           emulator_can_use_gpa(ctxt) &&
-           (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
-               gpa = vcpu->arch.gpa_val;
+       if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
+           (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
+               gpa = ctxt->gpa_val;
                ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
        } else {
                ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
@@ -5970,11 +5933,9 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
        return 0;
 }
 
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
-                                   int size, unsigned short port, void *val,
-                                   unsigned int count)
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+                          unsigned short port, void *val, unsigned int count)
 {
-       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        int ret;
 
        if (vcpu->arch.pio.count)
@@ -5994,17 +5955,30 @@ data_avail:
        return 0;
 }
 
-static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
-                                    int size, unsigned short port,
-                                    const void *val, unsigned int count)
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+                                   int size, unsigned short port, void *val,
+                                   unsigned int count)
 {
-       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+       return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
+
+}
 
+static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
+                           unsigned short port, const void *val,
+                           unsigned int count)
+{
        memcpy(vcpu->arch.pio_data, val, size * count);
        trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
        return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
 }
 
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+                                    int size, unsigned short port,
+                                    const void *val, unsigned int count)
+{
+       return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
+}
+
 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
        return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -6267,13 +6241,15 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
                              struct x86_instruction_info *info,
                              enum x86_intercept_stage stage)
 {
-       return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
+       return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage,
+                                           &ctxt->exception);
 }
 
 static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
-                       u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
+                             u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
+                             bool exact_only)
 {
-       return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
+       return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
 }
 
 static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
@@ -6400,7 +6376,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
 
 static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        if (ctxt->exception.vector == PF_VECTOR)
                return kvm_propagate_fault(vcpu, &ctxt->exception);
 
@@ -6412,13 +6388,31 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
        return false;
 }
 
+static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+       struct x86_emulate_ctxt *ctxt;
+
+       ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
+       if (!ctxt) {
+               pr_err("kvm: failed to allocate vcpu's emulator\n");
+               return NULL;
+       }
+
+       ctxt->vcpu = vcpu;
+       ctxt->ops = &emulate_ops;
+       vcpu->arch.emulate_ctxt = ctxt;
+
+       return ctxt;
+}
+
 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 {
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int cs_db, cs_l;
 
        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
+       ctxt->gpa_available = false;
        ctxt->eflags = kvm_get_rflags(vcpu);
        ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
 
@@ -6438,7 +6432,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 
 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 {
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int ret;
 
        init_emulate_ctxt(vcpu);
@@ -6494,10 +6488,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        gpa_t gpa = cr2_or_gpa;
        kvm_pfn_t pfn;
 
-       if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+       if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
                return false;
 
-       if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+       if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+           WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
                return false;
 
        if (!vcpu->arch.mmu->direct_map) {
@@ -6585,10 +6580,11 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
         */
        vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
 
-       if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+       if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
                return false;
 
-       if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+       if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+           WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
                return false;
 
        if (x86_page_table_writing_insn(ctxt))
@@ -6751,7 +6747,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                            int emulation_type, void *insn, int insn_len)
 {
        int r;
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        bool writeback = true;
        bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
@@ -6841,8 +6837,19 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        }
 
 restart:
-       /* Save the faulting GPA (cr2) in the address field */
-       ctxt->exception.address = cr2_or_gpa;
+       if (emulation_type & EMULTYPE_PF) {
+               /* Save the faulting GPA (cr2) in the address field */
+               ctxt->exception.address = cr2_or_gpa;
+
+               /* With shadow page tables, cr2 contains a GVA or nGPA. */
+               if (vcpu->arch.mmu->direct_map) {
+                       ctxt->gpa_available = true;
+                       ctxt->gpa_val = cr2_or_gpa;
+               }
+       } else {
+               /* Sanitize the address out of an abundance of paranoia. */
+               ctxt->exception.address = 0;
+       }
 
        r = x86_emulate_insn(ctxt);
 
@@ -6943,8 +6950,8 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
                            unsigned short port)
 {
        unsigned long val = kvm_rax_read(vcpu);
-       int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
-                                           size, port, &val, 1);
+       int ret = emulator_pio_out(vcpu, size, port, &val, 1);
+
        if (ret)
                return ret;
 
@@ -6980,11 +6987,10 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
        val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
 
        /*
-        * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
+        * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform
         * the copy and tracing
         */
-       emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
-                                vcpu->arch.pio.port, &val, 1);
+       emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
        kvm_rax_write(vcpu, val);
 
        return kvm_skip_emulated_instruction(vcpu);
@@ -6999,8 +7005,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
        /* For size less than 4 we merge, else we zero extend */
        val = (size < 4) ? kvm_rax_read(vcpu) : 0;
 
-       ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
-                                      &val, 1);
+       ret = emulator_pio_in(vcpu, size, port, &val, 1);
        if (ret) {
                kvm_rax_write(vcpu, val);
                return ret;
@@ -7338,10 +7343,16 @@ int kvm_arch_init(void *opaque)
                goto out;
        }
 
+       x86_emulator_cache = kvm_alloc_emulator_cache();
+       if (!x86_emulator_cache) {
+               pr_err("kvm: failed to allocate cache for x86 emulator\n");
+               goto out_free_x86_fpu_cache;
+       }
+
        shared_msrs = alloc_percpu(struct kvm_shared_msrs);
        if (!shared_msrs) {
                printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
-               goto out_free_x86_fpu_cache;
+               goto out_free_x86_emulator_cache;
        }
 
        r = kvm_mmu_module_init();
@@ -7357,8 +7368,10 @@ int kvm_arch_init(void *opaque)
 
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
 
-       if (boot_cpu_has(X86_FEATURE_XSAVE))
+       if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+               supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+       }
 
        kvm_lapic_init();
        if (pi_inject_timer == -1)
@@ -7374,6 +7387,8 @@ int kvm_arch_init(void *opaque)
 
 out_free_percpu:
        free_percpu(shared_msrs);
+out_free_x86_emulator_cache:
+       kmem_cache_destroy(x86_emulator_cache);
 out_free_x86_fpu_cache:
        kmem_cache_destroy(x86_fpu_cache);
 out:
@@ -7631,7 +7646,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
+static int inject_pending_event(struct kvm_vcpu *vcpu)
 {
        int r;
 
@@ -7667,7 +7682,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
         * from L2 to L1.
         */
        if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
-               r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+               r = kvm_x86_ops->check_nested_events(vcpu);
                if (r != 0)
                        return r;
        }
@@ -7729,7 +7744,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
                 * KVM_REQ_EVENT only on certain events and not unconditionally?
                 */
                if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
-                       r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+                       r = kvm_x86_ops->check_nested_events(vcpu);
                        if (r != 0)
                                return r;
                }
@@ -8039,19 +8054,26 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
  */
 void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
 {
+       unsigned long old, new, expected;
+
        if (!kvm_x86_ops->check_apicv_inhibit_reasons ||
            !kvm_x86_ops->check_apicv_inhibit_reasons(bit))
                return;
 
-       if (activate) {
-               if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
-                   !kvm_apicv_activated(kvm))
-                       return;
-       } else {
-               if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
-                   kvm_apicv_activated(kvm))
-                       return;
-       }
+       old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
+       do {
+               expected = new = old;
+               if (activate)
+                       __clear_bit(bit, &new);
+               else
+                       __set_bit(bit, &new);
+               if (new == old)
+                       break;
+               old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
+       } while (old != expected);
+
+       if (!!old == !!new)
+               return;
 
        trace_kvm_apicv_update_request(activate, bit);
        if (kvm_x86_ops->pre_update_apicv_exec_ctrl)
@@ -8176,8 +8198,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                }
                if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
                        kvm_mmu_sync_roots(vcpu);
-               if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
-                       kvm_mmu_load_cr3(vcpu);
+               if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
+                       kvm_mmu_load_pgd(vcpu);
                if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
                        kvm_vcpu_flush_tlb(vcpu, true);
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
@@ -8262,7 +8284,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        goto out;
                }
 
-               if (inject_pending_event(vcpu, req_int_win) != 0)
+               if (inject_pending_event(vcpu) != 0)
                        req_immediate_exit = true;
                else {
                        /* Enable SMI/NMI/IRQ window open exits if needed.
@@ -8443,7 +8465,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (vcpu->arch.apic_attention)
                kvm_lapic_sync_from_vapic(vcpu);
 
-       vcpu->arch.gpa_available = false;
        r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath);
        return r;
 
@@ -8484,7 +8505,6 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
                break;
        default:
                return -EINTR;
-               break;
        }
        return 1;
 }
@@ -8492,7 +8512,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
        if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
-               kvm_x86_ops->check_nested_events(vcpu, false);
+               kvm_x86_ops->check_nested_events(vcpu);
 
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
@@ -8753,7 +8773,7 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
                 * that usually, but some bad designed PV devices (vmware
                 * backdoor interface) need this to work
                 */
-               emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
+               emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
        }
        regs->rax = kvm_rax_read(vcpu);
@@ -8939,7 +8959,7 @@ out:
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
                    int reason, bool has_error_code, u32 error_code)
 {
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int ret;
 
        init_emulate_ctxt(vcpu);
@@ -9271,7 +9291,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        struct page *page;
        int r;
 
-       vcpu->arch.emulate_ctxt.ops = &emulate_ops;
        if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        else
@@ -9309,11 +9328,14 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
                                GFP_KERNEL_ACCOUNT))
                goto fail_free_mce_banks;
 
+       if (!alloc_emulate_ctxt(vcpu))
+               goto free_wbinvd_dirty_mask;
+
        vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
                                                GFP_KERNEL_ACCOUNT);
        if (!vcpu->arch.user_fpu) {
                pr_err("kvm: failed to allocate userspace's fpu\n");
-               goto free_wbinvd_dirty_mask;
+               goto free_emulate_ctxt;
        }
 
        vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
@@ -9355,6 +9377,8 @@ free_guest_fpu:
        kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
 free_user_fpu:
        kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+free_emulate_ctxt:
+       kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
 free_wbinvd_dirty_mask:
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 fail_free_mce_banks:
@@ -9389,11 +9413,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
        mutex_unlock(&vcpu->mutex);
 
-       if (!kvmclock_periodic_sync)
-               return;
-
-       schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
-                                       KVMCLOCK_SYNC_PERIOD);
+       if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
+               schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+                                               KVMCLOCK_SYNC_PERIOD);
 }
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -9407,6 +9429,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
        kvm_x86_ops->vcpu_free(vcpu);
 
+       kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
        kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
        kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
@@ -9607,10 +9630,18 @@ int kvm_arch_hardware_setup(void)
 {
        int r;
 
+       rdmsrl_safe(MSR_EFER, &host_efer);
+
+       if (boot_cpu_has(X86_FEATURE_XSAVES))
+               rdmsrl(MSR_IA32_XSS, host_xss);
+
        r = kvm_x86_ops->hardware_setup();
        if (r != 0)
                return r;
 
+       if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+               supported_xss = 0;
+
        cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
 
        if (kvm_has_tsc_control) {
@@ -9627,9 +9658,6 @@ int kvm_arch_hardware_setup(void)
                kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
        }
 
-       if (boot_cpu_has(X86_FEATURE_XSAVES))
-               rdmsrl(MSR_IA32_XSS, host_xss);
-
        kvm_init_msr_list();
        return 0;
 }
@@ -9677,6 +9705,13 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
        kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+       kfree(kvm->arch.hyperv.hv_pa_pg);
+       vfree(kvm);
+}
+
+
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
        if (type)
@@ -9759,9 +9794,9 @@ void kvm_arch_sync_events(struct kvm *kvm)
 int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 {
        int i, r;
-       unsigned long hva;
+       unsigned long hva, uninitialized_var(old_npages);
        struct kvm_memslots *slots = kvm_memslots(kvm);
-       struct kvm_memory_slot *slot, old;
+       struct kvm_memory_slot *slot;
 
        /* Called with kvm->slots_lock held.  */
        if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
@@ -9769,7 +9804,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 
        slot = id_to_memslot(slots, id);
        if (size) {
-               if (slot->npages)
+               if (slot && slot->npages)
                        return -EEXIST;
 
                /*
@@ -9781,13 +9816,18 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
                if (IS_ERR((void *)hva))
                        return PTR_ERR((void *)hva);
        } else {
-               if (!slot->npages)
+               if (!slot || !slot->npages)
                        return 0;
 
-               hva = 0;
+               /*
+                * Stuff a non-canonical value to catch use-after-delete.  This
+                * ends up being 0 on 32-bit KVM, but there's no better
+                * alternative.
+                */
+               hva = (unsigned long)(0xdeadull << 48);
+               old_npages = slot->npages;
        }
 
-       old = *slot;
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                struct kvm_userspace_memory_region m;
 
@@ -9802,7 +9842,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
        }
 
        if (!size)
-               vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
+               vm_munmap(hva, old_npages * PAGE_SIZE);
 
        return 0;
 }
@@ -9841,34 +9881,36 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kvm_hv_destroy_vm(kvm);
 }
 
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-                          struct kvm_memory_slot *dont)
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
        int i;
 
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-               if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
-                       kvfree(free->arch.rmap[i]);
-                       free->arch.rmap[i] = NULL;
-               }
+               kvfree(slot->arch.rmap[i]);
+               slot->arch.rmap[i] = NULL;
+
                if (i == 0)
                        continue;
 
-               if (!dont || free->arch.lpage_info[i - 1] !=
-                            dont->arch.lpage_info[i - 1]) {
-                       kvfree(free->arch.lpage_info[i - 1]);
-                       free->arch.lpage_info[i - 1] = NULL;
-               }
+               kvfree(slot->arch.lpage_info[i - 1]);
+               slot->arch.lpage_info[i - 1] = NULL;
        }
 
-       kvm_page_track_free_memslot(free, dont);
+       kvm_page_track_free_memslot(slot);
 }
 
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
-                           unsigned long npages)
+static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
+                                     unsigned long npages)
 {
        int i;
 
+       /*
+        * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
+        * old arrays will be freed by __kvm_set_memory_region() if installing
+        * the new memslot is successful.
+        */
+       memset(&slot->arch, 0, sizeof(slot->arch));
+
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                struct kvm_lpage_info *linfo;
                unsigned long ugfn;
@@ -9899,11 +9941,9 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
                ugfn = slot->userspace_addr >> PAGE_SHIFT;
                /*
                 * If the gfn and userspace address are not aligned wrt each
-                * other, or if explicitly asked to, disable large page
-                * support for this slot
+                * other, disable large page support for this slot.
                 */
-               if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
-                   !kvm_largepages_enabled()) {
+               if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
                        unsigned long j;
 
                        for (j = 0; j < lpages; ++j)
@@ -9950,6 +9990,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                const struct kvm_userspace_memory_region *mem,
                                enum kvm_mr_change change)
 {
+       if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
+               return kvm_alloc_memslot_metadata(memslot,
+                                                 mem->memory_size >> PAGE_SHIFT);
        return 0;
 }
 
@@ -9958,7 +10001,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 {
        /* Still write protect RO slot */
        if (new->flags & KVM_MEM_READONLY) {
-               kvm_mmu_slot_remove_write_access(kvm, new);
+               kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL);
                return;
        }
 
@@ -9993,10 +10036,23 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
         * See the comments in fast_page_fault().
         */
        if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-               if (kvm_x86_ops->slot_enable_log_dirty)
+               if (kvm_x86_ops->slot_enable_log_dirty) {
                        kvm_x86_ops->slot_enable_log_dirty(kvm, new);
-               else
-                       kvm_mmu_slot_remove_write_access(kvm, new);
+               } else {
+                       int level =
+                               kvm_dirty_log_manual_protect_and_init_set(kvm) ?
+                               PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL;
+
+                       /*
+                        * If we're with initial-all-set, we don't need
+                        * to write protect any small page because
+                        * they're reported as dirty already.  However
+                        * we still need to write-protect huge pages
+                        * so that the page split can happen lazily on
+                        * the first write to the huge page.
+                        */
+                       kvm_mmu_slot_remove_write_access(kvm, new, level);
+               }
        } else {
                if (kvm_x86_ops->slot_disable_log_dirty)
                        kvm_x86_ops->slot_disable_log_dirty(kvm, new);
@@ -10005,7 +10061,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                                const struct kvm_userspace_memory_region *mem,
-                               const struct kvm_memory_slot *old,
+                               struct kvm_memory_slot *old,
                                const struct kvm_memory_slot *new,
                                enum kvm_mr_change change)
 {
@@ -10047,6 +10103,10 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
         */
        if (change != KVM_MR_DELETE)
                kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
+
+       /* Free the arrays associated with the old memslot. */
+       if (change == KVM_MR_MOVE)
+               kvm_arch_free_memslot(kvm, old);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -10191,7 +10251,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
                return;
 
        if (!vcpu->arch.mmu->direct_map &&
-             work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
+             work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
                return;
 
        kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
@@ -10514,4 +10574,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
index 3624665..c1954e2 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/kvm_host.h>
 #include <asm/pvclock.h>
 #include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
 
 #define KVM_DEFAULT_PLE_GAP            128
 #define KVM_VMX_DEFAULT_PLE_WINDOW     4096
@@ -149,11 +150,6 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
        return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
 }
 
-static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
-{
-       return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48;
-}
-
 static inline u64 get_canonical(u64 la, u8 vaddr_bits)
 {
        return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits);
@@ -164,12 +160,6 @@ static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
        return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
 }
 
-static inline bool emul_is_noncanonical_address(u64 la,
-                                               struct x86_emulate_ctxt *ctxt)
-{
-       return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
-}
-
 static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
                                        gva_t gva, gfn_t gfn, unsigned access)
 {
@@ -280,13 +270,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                            int emulation_type, void *insn, int insn_len);
 enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
 
-#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
-                               | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
-                               | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
-                               | XFEATURE_MASK_PKRU)
 extern u64 host_xcr0;
+extern u64 supported_xcr0;
+extern u64 supported_xss;
 
-extern u64 kvm_supported_xcr0(void);
+static inline bool kvm_mpx_supported(void)
+{
+       return (supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
+               == (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+}
 
 extern unsigned int min_timer_period_us;
 
index bcb9b2a..f6a1905 100644 (file)
@@ -360,6 +360,10 @@ static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *mem
        return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap);
 }
 
+#ifndef KVM_DIRTY_LOG_MANUAL_CAPS
+#define KVM_DIRTY_LOG_MANUAL_CAPS KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE
+#endif
+
 struct kvm_s390_adapter_int {
        u64 ind_addr;
        u64 summary_addr;
@@ -431,11 +435,11 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
  */
 struct kvm_memslots {
        u64 generation;
-       struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM];
        /* The mapping table from slot id to the index in memslots[]. */
        short id_to_index[KVM_MEM_SLOTS_NUM];
        atomic_t lru_slot;
        int used_slots;
+       struct kvm_memory_slot memslots[];
 };
 
 struct kvm {
@@ -493,7 +497,7 @@ struct kvm {
 #endif
        long tlbs_dirty;
        struct list_head devices;
-       bool manual_dirty_log_protect;
+       u64 manual_dirty_log_protect;
        struct dentry *debugfs_dentry;
        struct kvm_stat_data **debugfs_stat_data;
        struct srcu_struct srcu;
@@ -527,6 +531,11 @@ struct kvm {
 #define vcpu_err(vcpu, fmt, ...)                                       \
        kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
 
+static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm)
+{
+       return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET);
+}
+
 static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx)
 {
        return srcu_dereference_check(kvm->buses[idx], &kvm->srcu,
@@ -572,10 +581,11 @@ static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu)
        return vcpu->vcpu_idx;
 }
 
-#define kvm_for_each_memslot(memslot, slots)   \
-       for (memslot = &slots->memslots[0];     \
-             memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\
-               memslot++)
+#define kvm_for_each_memslot(memslot, slots)                           \
+       for (memslot = &slots->memslots[0];                             \
+            memslot < slots->memslots + slots->used_slots; memslot++)  \
+               if (WARN_ON_ONCE(!memslot->npages)) {                   \
+               } else
 
 void kvm_vcpu_destroy(struct kvm_vcpu *vcpu);
 
@@ -635,12 +645,15 @@ static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
        return __kvm_memslots(vcpu->kvm, as_id);
 }
 
-static inline struct kvm_memory_slot *
-id_to_memslot(struct kvm_memslots *slots, int id)
+static inline
+struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id)
 {
        int index = slots->id_to_index[id];
        struct kvm_memory_slot *slot;
 
+       if (index < 0)
+               return NULL;
+
        slot = &slots->memslots[index];
 
        WARN_ON(slot->id != id);
@@ -669,10 +682,7 @@ int kvm_set_memory_region(struct kvm *kvm,
                          const struct kvm_userspace_memory_region *mem);
 int __kvm_set_memory_region(struct kvm *kvm,
                            const struct kvm_userspace_memory_region *mem);
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-                          struct kvm_memory_slot *dont);
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
-                           unsigned long npages);
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
@@ -680,11 +690,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                enum kvm_mr_change change);
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                                const struct kvm_userspace_memory_region *mem,
-                               const struct kvm_memory_slot *old,
+                               struct kvm_memory_slot *old,
                                const struct kvm_memory_slot *new,
                                enum kvm_mr_change change);
-bool kvm_largepages_enabled(void);
-void kvm_disable_largepages(void);
 /* flush all memory translations */
 void kvm_arch_flush_shadow_all(struct kvm *kvm);
 /* flush memory translations pointing to 'slot' */
@@ -704,7 +712,6 @@ void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
-kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
                      bool *writable);
@@ -819,23 +826,20 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf);
 
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext);
 
-int kvm_get_dirty_log(struct kvm *kvm,
-                       struct kvm_dirty_log *log, int *is_dirty);
-
-int kvm_get_dirty_log_protect(struct kvm *kvm,
-                             struct kvm_dirty_log *log, bool *flush);
-int kvm_clear_dirty_log_protect(struct kvm *kvm,
-                               struct kvm_clear_dirty_log *log, bool *flush);
-
 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                                        struct kvm_memory_slot *slot,
                                        gfn_t gfn_offset,
                                        unsigned long mask);
-
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
-                               struct kvm_dirty_log *log);
-int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
-                                 struct kvm_clear_dirty_log *log);
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);
+
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
+                                       struct kvm_memory_slot *memslot);
+#else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);
+int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
+                     int *is_dirty, struct kvm_memory_slot **memslot);
+#endif
 
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
                        bool line_status);
@@ -1018,6 +1022,8 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
  * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
  * gfn_to_memslot() itself isn't here as an inline because that would
  * bloat other code too much.
+ *
+ * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!
  */
 static inline struct kvm_memory_slot *
 search_memslots(struct kvm_memslots *slots, gfn_t gfn)
@@ -1026,6 +1032,9 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn)
        int slot = atomic_read(&slots->lru_slot);
        struct kvm_memory_slot *memslots = slots->memslots;
 
+       if (unlikely(!slots->used_slots))
+               return NULL;
+
        if (gfn >= memslots[slot].base_gfn &&
            gfn < memslots[slot].base_gfn + memslots[slot].npages)
                return &memslots[slot];
index 4b95f9a..5e6234c 100644 (file)
@@ -474,12 +474,17 @@ struct kvm_s390_mem_op {
        __u32 size;             /* amount of bytes */
        __u32 op;               /* type of operation */
        __u64 buf;              /* buffer in userspace */
-       __u8 ar;                /* the access register number */
-       __u8 reserved[31];      /* should be set to 0 */
+       union {
+               __u8 ar;        /* the access register number */
+               __u32 sida_offset; /* offset into the sida */
+               __u8 reserved[32]; /* should be set to 0 */
+       };
 };
 /* types for kvm_s390_mem_op->op */
 #define KVM_S390_MEMOP_LOGICAL_READ    0
 #define KVM_S390_MEMOP_LOGICAL_WRITE   1
+#define KVM_S390_MEMOP_SIDA_READ       2
+#define KVM_S390_MEMOP_SIDA_WRITE      3
 /* flags for kvm_s390_mem_op->flags */
 #define KVM_S390_MEMOP_F_CHECK_ONLY            (1ULL << 0)
 #define KVM_S390_MEMOP_F_INJECT_EXCEPTION      (1ULL << 1)
@@ -1010,6 +1015,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_NISV_TO_USER 177
 #define KVM_CAP_ARM_INJECT_EXT_DABT 178
 #define KVM_CAP_S390_VCPU_RESETS 179
+#define KVM_CAP_S390_PROTECTED 180
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1478,6 +1484,39 @@ struct kvm_enc_region {
 #define KVM_S390_NORMAL_RESET  _IO(KVMIO,   0xc3)
 #define KVM_S390_CLEAR_RESET   _IO(KVMIO,   0xc4)
 
+struct kvm_s390_pv_sec_parm {
+       __u64 origin;
+       __u64 length;
+};
+
+struct kvm_s390_pv_unp {
+       __u64 addr;
+       __u64 size;
+       __u64 tweak;
+};
+
+enum pv_cmd_id {
+       KVM_PV_ENABLE,
+       KVM_PV_DISABLE,
+       KVM_PV_SET_SEC_PARMS,
+       KVM_PV_UNPACK,
+       KVM_PV_VERIFY,
+       KVM_PV_PREP_RESET,
+       KVM_PV_UNSHARE_ALL,
+};
+
+struct kvm_pv_cmd {
+       __u32 cmd;      /* Command to be executed */
+       __u16 rc;       /* Ultravisor return code */
+       __u16 rrc;      /* Ultravisor return reason code */
+       __u64 data;     /* Data or address */
+       __u32 flags;    /* flags for future extensions. Must be 0 for now */
+       __u32 reserved[3];
+};
+
+/* Available with KVM_CAP_S390_PROTECTED */
+#define KVM_S390_PV_COMMAND            _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd)
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
        /* Guest initialization commands */
@@ -1628,4 +1667,7 @@ struct kvm_hyperv_eventfd {
 #define KVM_HYPERV_CONN_ID_MASK                0x00ffffff
 #define KVM_HYPERV_EVENTFD_DEASSIGN    (1 << 0)
 
+#define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE    (1 << 0)
+#define KVM_DIRTY_LOG_INITIALLY_SET            (1 << 1)
+
 #endif /* __LINUX_KVM_H */
index cb52a3a..4205ed4 100644 (file)
@@ -1,4 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NR_userfaultfd
+#define __NR_userfaultfd 282
+#endif
 #ifndef __NR_perf_event_open
 # define __NR_perf_event_open 298
 #endif
index 4cf9311..e83fc8e 100755 (executable)
@@ -25,7 +25,7 @@ import sys
 import locale
 import os
 import time
-import optparse
+import argparse
 import ctypes
 import fcntl
 import resource
@@ -33,6 +33,8 @@ import struct
 import re
 import subprocess
 from collections import defaultdict, namedtuple
+from functools import reduce
+from datetime import datetime
 
 VMX_EXIT_REASONS = {
     'EXCEPTION_NMI':        0,
@@ -873,7 +875,7 @@ class Stats(object):
 
         if options.debugfs:
             providers.append(DebugfsProvider(options.pid, options.fields,
-                                             options.dbgfs_include_past))
+                                             options.debugfs_include_past))
         if options.tracepoints or not providers:
             providers.append(TracepointProvider(options.pid, options.fields))
 
@@ -974,15 +976,17 @@ DELAY_DEFAULT = 3.0
 MAX_GUEST_NAME_LEN = 48
 MAX_REGEX_LEN = 44
 SORT_DEFAULT = 0
+MIN_DELAY = 0.1
+MAX_DELAY = 25.5
 
 
 class Tui(object):
     """Instruments curses to draw a nice text ui."""
-    def __init__(self, stats):
+    def __init__(self, stats, opts):
         self.stats = stats
         self.screen = None
         self._delay_initial = 0.25
-        self._delay_regular = DELAY_DEFAULT
+        self._delay_regular = opts.set_delay
         self._sorting = SORT_DEFAULT
         self._display_guests = 0
 
@@ -1183,7 +1187,7 @@ class Tui(object):
 
         if not self._is_running_guest(self.stats.pid_filter):
             if self._gname:
-                try: # ...to identify the guest by name in case it's back
+                try:  # ...to identify the guest by name in case it's back
                     pids = self.get_pid_from_gname(self._gname)
                     if len(pids) == 1:
                         self._refresh_header(pids[0])
@@ -1282,7 +1286,8 @@ class Tui(object):
                '   p     filter by guest name/PID',
                '   q     quit',
                '   r     reset stats',
-               '   s     set update interval',
+               '   s     set delay between refreshs (value range: '
+               '%s-%s secs)' % (MIN_DELAY, MAX_DELAY),
                '   x     toggle reporting of stats for individual child trace'
                ' events',
                'Any other key refreshes statistics immediately')
@@ -1336,8 +1341,8 @@ class Tui(object):
         msg = ''
         while True:
             self.screen.erase()
-            self.screen.addstr(0, 0, 'Set update interval (defaults to %.1fs).' %
-                               DELAY_DEFAULT, curses.A_BOLD)
+            self.screen.addstr(0, 0, 'Set update interval (defaults to %.1fs).'
+                               % DELAY_DEFAULT, curses.A_BOLD)
             self.screen.addstr(4, 0, msg)
             self.screen.addstr(2, 0, 'Change delay from %.1fs to ' %
                                self._delay_regular)
@@ -1348,11 +1353,9 @@ class Tui(object):
             try:
                 if len(val) > 0:
                     delay = float(val)
-                    if delay < 0.1:
-                        msg = '"' + str(val) + '": Value must be >=0.1'
-                        continue
-                    if delay > 25.5:
-                        msg = '"' + str(val) + '": Value must be <=25.5'
+                    err = is_delay_valid(delay)
+                    if err is not None:
+                        msg = err
                         continue
                 else:
                     delay = DELAY_DEFAULT
@@ -1488,33 +1491,64 @@ def batch(stats):
         pass
 
 
-def log(stats):
-    """Prints statistics as reiterating key block, multiple value blocks."""
-    keys = sorted(stats.get().keys())
-
-    def banner():
+class StdFormat(object):
+    def __init__(self, keys):
+        self._banner = ''
         for key in keys:
-            print(key.split(' ')[0], end=' ')
-        print()
+            self._banner += key.split(' ')[0] + ' '
 
-    def statline():
-        s = stats.get()
+    def get_banner(self):
+        return self._banner
+
+    @staticmethod
+    def get_statline(keys, s):
+        res = ''
         for key in keys:
-            print(' %9d' % s[key].delta, end=' ')
-        print()
+            res += ' %9d' % s[key].delta
+        return res
+
+
+class CSVFormat(object):
+    def __init__(self, keys):
+        self._banner = 'timestamp'
+        self._banner += reduce(lambda res, key: "{},{!s}".format(res,
+                               key.split(' ')[0]), keys, '')
+
+    def get_banner(self):
+        return self._banner
+
+    @staticmethod
+    def get_statline(keys, s):
+        return reduce(lambda res, key: "{},{!s}".format(res, s[key].delta),
+                      keys, '')
+
+
+def log(stats, opts, frmt, keys):
+    """Prints statistics as reiterating key block, multiple value blocks."""
     line = 0
     banner_repeat = 20
     while True:
         try:
-            time.sleep(1)
+            time.sleep(opts.set_delay)
             if line % banner_repeat == 0:
-                banner()
-            statline()
+                print(frmt.get_banner())
+            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
+                  frmt.get_statline(keys, stats.get()))
             line += 1
         except KeyboardInterrupt:
             break
 
 
+def is_delay_valid(delay):
+    """Verify delay is in valid value range."""
+    msg = None
+    if delay < MIN_DELAY:
+        msg = '"' + str(delay) + '": Delay must be >=%s' % MIN_DELAY
+    if delay > MAX_DELAY:
+        msg = '"' + str(delay) + '": Delay must be <=%s' % MAX_DELAY
+    return msg
+
+
 def get_options():
     """Returns processed program arguments."""
     description_text = """
@@ -1545,89 +1579,85 @@ Interactive Commands:
    p     filter by PID
    q     quit
    r     reset stats
-   s     set update interval
+   s     set update interval (value range: 0.1-25.5 secs)
    x     toggle reporting of stats for individual child trace events
 Press any other key to refresh statistics immediately.
 """ % (PATH_DEBUGFS_KVM, PATH_DEBUGFS_TRACING)
 
-    class PlainHelpFormatter(optparse.IndentedHelpFormatter):
-        def format_description(self, description):
-            if description:
-                return description + "\n"
-            else:
-                return ""
-
-    def cb_guest_to_pid(option, opt, val, parser):
-        try:
-            pids = Tui.get_pid_from_gname(val)
-        except:
-            sys.exit('Error while searching for guest "{}". Use "-p" to '
-                     'specify a pid instead?'.format(val))
-        if len(pids) == 0:
-            sys.exit('Error: No guest by the name "{}" found'.format(val))
-        if len(pids) > 1:
-            sys.exit('Error: Multiple processes found (pids: {}). Use "-p" '
-                     'to specify the desired pid'.format(" ".join(pids)))
-        parser.values.pid = pids[0]
-
-    optparser = optparse.OptionParser(description=description_text,
-                                      formatter=PlainHelpFormatter())
-    optparser.add_option('-1', '--once', '--batch',
-                         action='store_true',
-                         default=False,
-                         dest='once',
-                         help='run in batch mode for one second',
-                         )
-    optparser.add_option('-i', '--debugfs-include-past',
-                         action='store_true',
-                         default=False,
-                         dest='dbgfs_include_past',
-                         help='include all available data on past events for '
-                              'debugfs',
-                         )
-    optparser.add_option('-l', '--log',
-                         action='store_true',
-                         default=False,
-                         dest='log',
-                         help='run in logging mode (like vmstat)',
-                         )
-    optparser.add_option('-t', '--tracepoints',
-                         action='store_true',
-                         default=False,
-                         dest='tracepoints',
-                         help='retrieve statistics from tracepoints',
-                         )
-    optparser.add_option('-d', '--debugfs',
-                         action='store_true',
-                         default=False,
-                         dest='debugfs',
-                         help='retrieve statistics from debugfs',
-                         )
-    optparser.add_option('-f', '--fields',
-                         action='store',
-                         default='',
-                         dest='fields',
-                         help='''fields to display (regex)
-                                 "-f help" for a list of available events''',
-                         )
-    optparser.add_option('-p', '--pid',
-                         action='store',
-                         default=0,
-                         type='int',
-                         dest='pid',
-                         help='restrict statistics to pid',
-                         )
-    optparser.add_option('-g', '--guest',
-                         action='callback',
-                         type='string',
-                         dest='pid',
-                         metavar='GUEST',
-                         help='restrict statistics to guest by name',
-                         callback=cb_guest_to_pid,
-                         )
-    options, unkn = optparser.parse_args(sys.argv)
-    if len(unkn) != 1:
-        sys.exit('Error: Extra argument(s): ' + ' '.join(unkn[1:]))
+    class Guest_to_pid(argparse.Action):
+        def __call__(self, parser, namespace, values, option_string=None):
+            try:
+                pids = Tui.get_pid_from_gname(values)
+            except:
+                sys.exit('Error while searching for guest "{}". Use "-p" to '
+                         'specify a pid instead?'.format(values))
+            if len(pids) == 0:
+                sys.exit('Error: No guest by the name "{}" found'
+                         .format(values))
+            if len(pids) > 1:
+                sys.exit('Error: Multiple processes found (pids: {}). Use "-p"'
+                         ' to specify the desired pid'.format(" ".join(pids)))
+            namespace.pid = pids[0]
+
+    argparser = argparse.ArgumentParser(description=description_text,
+                                        formatter_class=argparse
+                                        .RawTextHelpFormatter)
+    argparser.add_argument('-1', '--once', '--batch',
+                           action='store_true',
+                           default=False,
+                           help='run in batch mode for one second',
+                           )
+    argparser.add_argument('-c', '--csv',
+                           action='store_true',
+                           default=False,
+                           help='log in csv format - requires option -l/--log',
+                           )
+    argparser.add_argument('-d', '--debugfs',
+                           action='store_true',
+                           default=False,
+                           help='retrieve statistics from debugfs',
+                           )
+    argparser.add_argument('-f', '--fields',
+                           default='',
+                           help='''fields to display (regex)
+"-f help" for a list of available events''',
+                           )
+    argparser.add_argument('-g', '--guest',
+                           type=str,
+                           help='restrict statistics to guest by name',
+                           action=Guest_to_pid,
+                           )
+    argparser.add_argument('-i', '--debugfs-include-past',
+                           action='store_true',
+                           default=False,
+                           help='include all available data on past events for'
+                                ' debugfs',
+                           )
+    argparser.add_argument('-l', '--log',
+                           action='store_true',
+                           default=False,
+                           help='run in logging mode (like vmstat)',
+                           )
+    argparser.add_argument('-p', '--pid',
+                           type=int,
+                           default=0,
+                           help='restrict statistics to pid',
+                           )
+    argparser.add_argument('-s', '--set-delay',
+                           type=float,
+                           default=DELAY_DEFAULT,
+                           metavar='DELAY',
+                           help='set delay between refreshs (value range: '
+                                '%s-%s secs)' % (MIN_DELAY, MAX_DELAY),
+                           )
+    argparser.add_argument('-t', '--tracepoints',
+                           action='store_true',
+                           default=False,
+                           help='retrieve statistics from tracepoints',
+                           )
+    options = argparser.parse_args()
+    if options.csv and not options.log:
+        sys.exit('Error: Option -c/--csv requires -l/--log')
     try:
         # verify that we were passed a valid regex up front
         re.compile(options.fields)
@@ -1693,6 +1723,10 @@ def main():
         sys.stderr.write('Did you use a (unsupported) tid instead of a pid?\n')
         sys.exit('Specified pid does not exist.')
 
+    err = is_delay_valid(options.set_delay)
+    if err is not None:
+        sys.exit('Error: ' + err)
+
     stats = Stats(options)
 
     if options.fields == 'help':
@@ -1704,12 +1738,18 @@ def main():
         sys.exit(0)
 
     if options.log:
-        log(stats)
+        keys = sorted(stats.get().keys())
+        if options.csv:
+            frmt = CSVFormat(keys)
+        else:
+            frmt = StdFormat(keys)
+        log(stats, options, frmt, keys)
     elif not options.once:
-        with Tui(stats) as tui:
+        with Tui(stats, options) as tui:
             tui.show_stats()
     else:
         batch(stats)
 
+
 if __name__ == "__main__":
     main()
index c057ba5..a97ded2 100644 (file)
@@ -49,7 +49,7 @@ INTERACTIVE COMMANDS
 
 *r*::  reset stats
 
-*s*::   set update interval
+*s*::   set delay between refreshes
 
 *x*::  toggle reporting of stats for child trace events
  ::     *Note*: The stats for the parents summarize the respective child trace
@@ -64,37 +64,45 @@ OPTIONS
 --batch::
        run in batch mode for one second
 
--l::
---log::
-       run in logging mode (like vmstat)
-
--t::
---tracepoints::
-       retrieve statistics from tracepoints
+-c::
+--csv::
+        log in csv format - requires option -l/--log
 
 -d::
 --debugfs::
        retrieve statistics from debugfs
 
+-f<fields>::
+--fields=<fields>::
+        fields to display (regex), "-f help" for a list of available events
+
+-g<guest>::
+--guest=<guest_name>::
+        limit statistics to one virtual machine (guest name)
+
+-h::
+--help::
+        show help message
+
 -i::
 --debugfs-include-past::
        include all available data on past events for debugfs
 
+-l::
+--log::
+        run in logging mode (like vmstat)
+
 -p<pid>::
 --pid=<pid>::
        limit statistics to one virtual machine (pid)
 
--g<guest>::
---guest=<guest_name>::
-       limit statistics to one virtual machine (guest name)
+-s<delay>::
+--set-delay=<delay>::
+        set delay between refreshes (value range: 0.1-25.5 secs)
 
--f<fields>::
---fields=<fields>::
-       fields to display (regex), "-f help" for a list of available events
-
--h::
---help::
-       show help message
+-t::
+--tracepoints::
+        retrieve statistics from tracepoints
 
 SEE ALSO
 --------
index 30072c3..16877c3 100644 (file)
@@ -1,13 +1,16 @@
-/s390x/sync_regs_test
 /s390x/memop
+/s390x/resets
+/s390x/sync_regs_test
 /x86_64/cr4_cpuid_sync_test
 /x86_64/evmcs_test
 /x86_64/hyperv_cpuid
 /x86_64/mmio_warning_test
 /x86_64/platform_info_test
+/x86_64/set_memory_region_test
 /x86_64/set_sregs_test
 /x86_64/smm_test
 /x86_64/state_test
+/x86_64/svm_vmcall_test
 /x86_64/sync_regs_test
 /x86_64/vmx_close_while_nested_test
 /x86_64/vmx_dirty_log_test
@@ -15,5 +18,7 @@
 /x86_64/vmx_tsc_adjust_test
 /x86_64/xss_msr_test
 /clear_dirty_log_test
+/demand_paging_test
 /dirty_log_test
 /kvm_create_max_vcpus
+/steal_time
index d91c53b..712a2dd 100644 (file)
@@ -7,7 +7,7 @@ top_srcdir = ../../../..
 KSFT_KHDR_INSTALL := 1
 UNAME_M := $(shell uname -m)
 
-LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c
+LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c
 LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c
 LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c
 LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c
@@ -17,27 +17,33 @@ TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
 TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
+TEST_GEN_PROGS_x86_64 += x86_64/set_memory_region_test
 TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
 TEST_GEN_PROGS_x86_64 += x86_64/smm_test
 TEST_GEN_PROGS_x86_64 += x86_64/state_test
+TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
 TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
-TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
 TEST_GEN_PROGS_x86_64 += clear_dirty_log_test
+TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += steal_time
 
 TEST_GEN_PROGS_aarch64 += clear_dirty_log_test
+TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_aarch64 += steal_time
 
 TEST_GEN_PROGS_s390x = s390x/memop
-TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += s390x/resets
+TEST_GEN_PROGS_s390x += s390x/sync_regs_test
+TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
 
index 7493369..11672ec 100644 (file)
@@ -1,2 +1,6 @@
 #define USE_CLEAR_DIRTY_LOG
+#define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0)
+#define KVM_DIRTY_LOG_INITIALLY_SET         (1 << 1)
+#define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
+               KVM_DIRTY_LOG_INITIALLY_SET)
 #include "dirty_log_test.c"
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
new file mode 100644 (file)
index 0000000..360cd3e
--- /dev/null
@@ -0,0 +1,661 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM demand paging test
+ * Adapted from dirty_log_test.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2019, Google, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <asm/unistd.h>
+#include <time.h>
+#include <poll.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/userfaultfd.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#ifdef __NR_userfaultfd
+
+/* The memory slot index used for demand paging */
+#define TEST_MEM_SLOT_INDEX            1
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM         0xc0000000
+
+#define DEFAULT_GUEST_TEST_MEM_SIZE (1 << 30) /* 1G */
+
+#ifdef PRINT_PER_PAGE_UPDATES
+#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__)
+#endif
+
+#ifdef PRINT_PER_VCPU_UPDATES
+#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__)
+#endif
+
+#define MAX_VCPUS 512
+
+/*
+ * Guest/Host shared variables. Ensure addr_gva2hva() and/or
+ * sync_global_to/from_guest() are used when accessing from
+ * the host. READ/WRITE_ONCE() should also be used with anything
+ * that may change.
+ */
+static uint64_t host_page_size;
+static uint64_t guest_page_size;
+
+static char *guest_data_prototype;
+
+/*
+ * Guest physical memory offset of the testing memory slot.
+ * This will be set to the topmost valid physical address minus
+ * the test memory size.
+ */
+static uint64_t guest_test_phys_mem;
+
+/*
+ * Guest virtual memory offset of the testing memory slot.
+ * Must not conflict with identity mapped test code.
+ */
+static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
+
+struct vcpu_args {
+       uint64_t gva;   /* guest virtual address where this vCPU's test memory starts */
+       uint64_t pages; /* number of guest pages this vCPU writes to */
+
+       /* Used by the host userspace part of the vCPU thread; guest_code() also checks vcpu_id */
+       int vcpu_id;
+       struct kvm_vm *vm;
+};
+
+static struct vcpu_args vcpu_args[MAX_VCPUS]; /* indexed by vCPU id */
+
+/*
+ * Write to the first 8 bytes of each page in this vCPU's slice of the demand
+ * paging memory region (once per page), then signal the host via GUEST_SYNC.
+ */
+static void guest_code(uint32_t vcpu_id)
+{
+       uint64_t gva;
+       uint64_t pages;
+       uint64_t i;     /* same width as pages; an int index could overflow */
+
+       /* Make sure vCPU args data structure is not corrupt. */
+       GUEST_ASSERT(vcpu_args[vcpu_id].vcpu_id == vcpu_id);
+
+       gva = vcpu_args[vcpu_id].gva;
+       pages = vcpu_args[vcpu_id].pages;
+
+       for (i = 0; i < pages; i++) {
+               uint64_t addr = gva + (i * guest_page_size);
+
+               addr &= ~(host_page_size - 1);  /* round down to a host page boundary */
+               *(uint64_t *)addr = 0x0123456789ABCDEF;
+       }
+
+       GUEST_SYNC(1);
+}
+
+/*
+ * pthread entry point for one vCPU. @data points to that vCPU's
+ * struct vcpu_args. Runs the guest once, fails the test unless the guest
+ * stopped with UCALL_SYNC, and optionally prints how long the run took.
+ * Always returns NULL.
+ */
+static void *vcpu_worker(void *data)
+{
+       int ret;
+       struct vcpu_args *args = (struct vcpu_args *)data;
+       struct kvm_vm *vm = args->vm;
+       int vcpu_id = args->vcpu_id;
+       struct kvm_run *run;
+       struct timespec start, end, ts_diff;
+
+       /* Pass the vCPU id to guest_code() as its single argument */
+       vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+       run = vcpu_state(vm, vcpu_id);
+
+       clock_gettime(CLOCK_MONOTONIC, &start);
+
+       /* Let the guest access its memory */
+       ret = _vcpu_run(vm, vcpu_id);
+       TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+       if (get_ucall(vm, vcpu_id, NULL) != UCALL_SYNC) {
+               TEST_FAIL("Invalid guest sync status: exit_reason=%s\n",
+                         exit_reason_str(run->exit_reason));
+       }
+
+       clock_gettime(CLOCK_MONOTONIC, &end);
+       ts_diff = timespec_sub(end, start);
+       PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id,
+                      ts_diff.tv_sec, ts_diff.tv_nsec);
+
+       return NULL;
+}
+
+#define PAGE_SHIFT_4K  12
+#define PTES_PER_4K_PT 512
+
+static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus,
+                               uint64_t vcpu_memory_bytes)
+{
+       struct kvm_vm *vm;
+       uint64_t pages = DEFAULT_GUEST_PHY_PAGES;
+
+       /* Account for a few pages per-vCPU for stacks */
+       pages += DEFAULT_STACK_PGS * vcpus;
+
+       /*
+        * Reserve twice the amount of memory needed to map the test region and
+        * the page table / stacks region, at 4k, for page tables. Do the
+        * calculation with 4K page size: the smallest of all archs. (e.g., 64K
+        * page size guest will need even less memory for page tables).
+        */
+       pages += (2 * pages) / PTES_PER_4K_PT;
+       pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) /
+                PTES_PER_4K_PT;
+       pages = vm_adjust_num_guest_pages(mode, pages);
+
+       pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+
+       vm = _vm_create(mode, pages, O_RDWR);
+       kvm_vm_elf_load(vm, program_invocation_name, 0, 0);     /* load this test's own ELF into the VM */
+#ifdef __x86_64__
+       vm_create_irqchip(vm);
+#endif
+       return vm;
+}
+
+static int handle_uffd_page_request(int uffd, uint64_t addr)
+{
+       pid_t tid;
+       struct timespec start;
+       struct timespec end;
+       struct uffdio_copy copy;
+       int r;
+
+       tid = syscall(__NR_gettid);     /* only used in the log messages below */
+
+       copy.src = (uint64_t)guest_data_prototype;      /* source of the bytes copied into the faulting page */
+       copy.dst = addr;
+       copy.len = host_page_size;
+       copy.mode = 0;
+
+       clock_gettime(CLOCK_MONOTONIC, &start);
+
+       r = ioctl(uffd, UFFDIO_COPY, &copy);
+       if (r == -1) {
+               pr_info("Failed Paged in 0x%lx from thread %d with errno: %d\n",
+                       addr, tid, errno);
+               return r;       /* -1; the handler thread stops on a negative result */
+       }
+
+       clock_gettime(CLOCK_MONOTONIC, &end);
+
+       PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid,
+                      timespec_to_ns(timespec_sub(end, start)));
+       PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n",
+                      host_page_size, addr, tid);
+
+       return 0;
+}
+
+bool quit_uffd_thread; /* loop condition of the handler threads; never written in this file */
+
+struct uffd_handler_args {
+       int uffd;               /* userfaultfd the thread services */
+       int pipefd;             /* read end of the pipe used to tell the thread to exit */
+       useconds_t delay;       /* usleep() before resolving each fault; 0 means no delay */
+};
+
+/*
+ * Body of a userfaultfd handler thread. @arg is a struct uffd_handler_args.
+ *
+ * Polls the userfaultfd and the quit pipe. Every page-fault message is
+ * resolved through handle_uffd_page_request(), optionally after sleeping for
+ * the configured delay. The thread exits when a byte arrives on the pipe, on
+ * POLLERR, on an unexpected poll() count, on a failed or short read, or when
+ * resolving a fault fails. Always returns NULL.
+ */
+static void *uffd_handler_thread_fn(void *arg)
+{
+       struct uffd_handler_args *uffd_args = (struct uffd_handler_args *)arg;
+       int uffd = uffd_args->uffd;
+       int pipefd = uffd_args->pipefd;
+       useconds_t delay = uffd_args->delay;
+       int64_t pages = 0;
+       struct timespec start, end, ts_diff;
+
+       clock_gettime(CLOCK_MONOTONIC, &start);
+       while (!quit_uffd_thread) {
+               struct uffd_msg msg;
+               struct pollfd pollfd[2];
+               char tmp_chr;
+               int r;
+               uint64_t addr;
+
+               pollfd[0].fd = uffd;
+               pollfd[0].events = POLLIN;
+               pollfd[1].fd = pipefd;
+               pollfd[1].events = POLLIN;
+
+               r = poll(pollfd, 2, -1);
+               switch (r) {
+               case -1:
+                       pr_info("poll err");
+                       continue;
+               case 0:
+                       continue;
+               case 1:
+                       break;
+               default:
+                       pr_info("Polling uffd returned %d", r);
+                       return NULL;
+               }
+
+               if (pollfd[0].revents & POLLERR) {
+                       pr_info("uffd revents has POLLERR");
+                       return NULL;
+               }
+
+               /* A byte on the pipe is the request to quit */
+               if (pollfd[1].revents & POLLIN) {
+                       r = read(pollfd[1].fd, &tmp_chr, 1);
+                       TEST_ASSERT(r == 1,
+                                   "Error reading pipefd in UFFD thread\n");
+                       return NULL;
+               }
+
+               /* Parenthesized: "!revents & POLLIN" would negate revents first */
+               if (!(pollfd[0].revents & POLLIN))
+                       continue;
+
+               r = read(uffd, &msg, sizeof(msg));
+               if (r == -1) {
+                       if (errno == EAGAIN)
+                               continue;
+                       pr_info("Read of uffd got errno %d", errno);
+                       return NULL;
+               }
+
+               if (r != sizeof(msg)) {
+                       pr_info("Read on uffd returned unexpected size: %d bytes", r);
+                       return NULL;
+               }
+
+               /* msg.event is an event code, not a bitmask: compare for equality */
+               if (msg.event != UFFD_EVENT_PAGEFAULT)
+                       continue;
+
+               if (delay)
+                       usleep(delay);
+               addr =  msg.arg.pagefault.address;
+               r = handle_uffd_page_request(uffd, addr);
+               if (r < 0)
+                       return NULL;
+               pages++;
+       }
+
+       clock_gettime(CLOCK_MONOTONIC, &end);
+       ts_diff = timespec_sub(end, start);
+       /* tv_nsec is in nanoseconds: divide by 1e9 to get seconds */
+       PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
+                      pages, ts_diff.tv_sec, ts_diff.tv_nsec,
+                      pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 1000000000.0));
+
+       return NULL;
+}
+
+/*
+ * Create a userfaultfd, register [hva, hva + len) with it in
+ * UFFDIO_REGISTER_MODE_MISSING mode and start a handler thread for it.
+ *
+ * @uffd_handler_thread receives the new thread, @uffd_args is filled in and
+ * handed to that thread, @pipefd is the descriptor the thread polls for the
+ * quit request and @uffd_delay is the per-fault delay. @vm is not used.
+ *
+ * Returns 0 on success. Returns -1 on failure; the userfaultfd is closed
+ * again in that case.
+ */
+static int setup_demand_paging(struct kvm_vm *vm,
+                              pthread_t *uffd_handler_thread, int pipefd,
+                              useconds_t uffd_delay,
+                              struct uffd_handler_args *uffd_args,
+                              void *hva, uint64_t len)
+{
+       int uffd;
+       int r;
+       struct uffdio_api uffdio_api;
+       struct uffdio_register uffdio_register;
+
+       uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+       if (uffd == -1) {
+               pr_info("uffd creation failed\n");
+               return -1;
+       }
+
+       uffdio_api.api = UFFD_API;
+       uffdio_api.features = 0;
+       if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
+               pr_info("ioctl uffdio_api failed\n");
+               goto err_close;
+       }
+
+       uffdio_register.range.start = (uint64_t)hva;
+       uffdio_register.range.len = len;
+       uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
+               pr_info("ioctl uffdio_register failed\n");
+               goto err_close;
+       }
+
+       if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) !=
+                       UFFD_API_RANGE_IOCTLS) {
+               pr_info("unexpected userfaultfd ioctl set\n");
+               goto err_close;
+       }
+
+       uffd_args->uffd = uffd;
+       uffd_args->pipefd = pipefd;
+       uffd_args->delay = uffd_delay;
+       /* pthread_create() returns an error number, not -1, on failure */
+       r = pthread_create(uffd_handler_thread, NULL, uffd_handler_thread_fn,
+                          uffd_args);
+       if (r) {
+               pr_info("uffd handler thread creation failed: %d\n", r);
+               goto err_close;
+       }
+
+       PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
+                      hva, hva + len);
+
+       return 0;
+
+err_close:
+       close(uffd);
+       return -1;
+}
+
+/*
+ * Run the demand paging test once for guest mode @mode.
+ *
+ * Creates a VM with @vcpus vCPUs, adds a test memory slot of
+ * @vcpus * @vcpu_memory_bytes, and lets every vCPU touch each page of its own
+ * slice. With @use_uffd, each slice is registered with its own userfaultfd
+ * and served by a handler thread that sleeps @uffd_delay usec per fault.
+ * Prints the total guest run time and the resulting paging rate.
+ */
+static void run_test(enum vm_guest_mode mode, bool use_uffd,
+                    useconds_t uffd_delay, int vcpus,
+                    uint64_t vcpu_memory_bytes)
+{
+       pthread_t *vcpu_threads;
+       pthread_t *uffd_handler_threads = NULL;
+       struct uffd_handler_args *uffd_args = NULL;
+       struct timespec start, end, ts_diff;
+       int *pipefds = NULL;
+       struct kvm_vm *vm;
+       uint64_t guest_num_pages;
+       int vcpu_id;
+       int r;
+
+       vm = create_vm(mode, vcpus, vcpu_memory_bytes);
+
+       guest_page_size = vm_get_page_size(vm);
+
+       TEST_ASSERT(vcpu_memory_bytes % guest_page_size == 0,
+                   "Guest memory size is not guest page size aligned.");
+
+       guest_num_pages = (vcpus * vcpu_memory_bytes) / guest_page_size;
+       guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
+
+       /*
+        * If there should be more memory in the guest test region than there
+        * can be pages in the guest, it will definitely cause problems.
+        */
+       TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
+                   "Requested more guest memory than address space allows.\n"
+                   "    guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n",
+                   guest_num_pages, vm_get_max_gfn(vm), vcpus,
+                   vcpu_memory_bytes);
+
+       host_page_size = getpagesize();
+       TEST_ASSERT(vcpu_memory_bytes % host_page_size == 0,
+                   "Guest memory size is not host page size aligned.");
+
+       guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) *
+                             guest_page_size;
+       guest_test_phys_mem &= ~(host_page_size - 1);
+
+#ifdef __s390x__
+       /* Align to 1M (segment size) */
+       guest_test_phys_mem &= ~((1 << 20) - 1);
+#endif
+
+       pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
+
+       /* Add an extra memory slot for testing demand paging */
+       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+                                   guest_test_phys_mem,
+                                   TEST_MEM_SLOT_INDEX,
+                                   guest_num_pages, 0);
+
+       /* Do mapping for the demand paging memory slot */
+       virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
+
+       ucall_init(vm, NULL);
+
+       guest_data_prototype = malloc(host_page_size);
+       TEST_ASSERT(guest_data_prototype,
+                   "Failed to allocate buffer for guest data pattern");
+       memset(guest_data_prototype, 0xAB, host_page_size);
+
+       vcpu_threads = malloc(vcpus * sizeof(*vcpu_threads));
+       TEST_ASSERT(vcpu_threads, "Memory allocation failed");
+
+       if (use_uffd) {
+               uffd_handler_threads =
+                       malloc(vcpus * sizeof(*uffd_handler_threads));
+               TEST_ASSERT(uffd_handler_threads, "Memory allocation failed");
+
+               uffd_args = malloc(vcpus * sizeof(*uffd_args));
+               TEST_ASSERT(uffd_args, "Memory allocation failed");
+
+               pipefds = malloc(sizeof(int) * vcpus * 2);
+               TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd");
+       }
+
+       for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
+               vm_paddr_t vcpu_gpa;
+               void *vcpu_hva;
+
+               vm_vcpu_add_default(vm, vcpu_id, guest_code);
+
+               vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes);
+               PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n",
+                              vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes);
+
+               /* Cache the HVA pointer of the region */
+               vcpu_hva = addr_gpa2hva(vm, vcpu_gpa);
+
+               if (use_uffd) {
+                       /*
+                        * Set up user fault fd to handle demand paging
+                        * requests.
+                        */
+                       r = pipe2(&pipefds[vcpu_id * 2],
+                                 O_CLOEXEC | O_NONBLOCK);
+                       TEST_ASSERT(!r, "Failed to set up pipefd");
+
+                       r = setup_demand_paging(vm,
+                                               &uffd_handler_threads[vcpu_id],
+                                               pipefds[vcpu_id * 2],
+                                               uffd_delay, &uffd_args[vcpu_id],
+                                               vcpu_hva, vcpu_memory_bytes);
+                       if (r < 0)
+                               exit(-r);
+               }
+
+#ifdef __x86_64__
+               vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid());
+#endif
+
+               vcpu_args[vcpu_id].vm = vm;
+               vcpu_args[vcpu_id].vcpu_id = vcpu_id;
+               vcpu_args[vcpu_id].gva = guest_test_virt_mem +
+                                        (vcpu_id * vcpu_memory_bytes);
+               vcpu_args[vcpu_id].pages = vcpu_memory_bytes / guest_page_size;
+       }
+
+       /* Export the shared variables to the guest */
+       sync_global_to_guest(vm, host_page_size);
+       sync_global_to_guest(vm, guest_page_size);
+       sync_global_to_guest(vm, vcpu_args);
+
+       pr_info("Finished creating vCPUs and starting uffd threads\n");
+
+       clock_gettime(CLOCK_MONOTONIC, &start);
+
+       for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
+               pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
+                              &vcpu_args[vcpu_id]);
+       }
+
+       pr_info("Started all vCPUs\n");
+
+       /* Wait for the vcpu threads to quit */
+       for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
+               pthread_join(vcpu_threads[vcpu_id], NULL);
+               PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id);
+       }
+
+       pr_info("All vCPU threads joined\n");
+
+       clock_gettime(CLOCK_MONOTONIC, &end);
+
+       if (use_uffd) {
+               /* Value is irrelevant; initialized so a defined byte is written */
+               char c = 0;
+
+               /* Tell the user fault fd handler threads to quit */
+               for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
+                       r = write(pipefds[vcpu_id * 2 + 1], &c, 1);
+                       TEST_ASSERT(r == 1, "Unable to write to pipefd");
+
+                       pthread_join(uffd_handler_threads[vcpu_id], NULL);
+
+                       /* The handler is gone: release its descriptors */
+                       close(uffd_args[vcpu_id].uffd);
+                       close(pipefds[vcpu_id * 2]);
+                       close(pipefds[vcpu_id * 2 + 1]);
+               }
+       }
+
+       ts_diff = timespec_sub(end, start);
+       pr_info("Total guest execution time: %ld.%.9lds\n",
+               ts_diff.tv_sec, ts_diff.tv_nsec);
+       /* tv_nsec is in nanoseconds: divide by 1e9 to get seconds */
+       pr_info("Overall demand paging rate: %f pgs/sec\n",
+               guest_num_pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 1000000000.0));
+
+       ucall_uninit(vm);
+       kvm_vm_free(vm);
+
+       free(guest_data_prototype);
+       free(vcpu_threads);
+       if (use_uffd) {
+               free(uffd_handler_threads);
+               free(uffd_args);
+               free(pipefds);
+       }
+}
+
+struct guest_mode {
+       bool supported; /* set by main() for the modes offered on the build architecture */
+       bool enabled;   /* mode will be run; -m clears all of these and enables the chosen ones */
+};
+static struct guest_mode guest_modes[NUM_VM_MODES];    /* indexed by guest mode ID */
+
+#define guest_mode_init(mode, supported, enabled) ({ \
+       guest_modes[mode] = (struct guest_mode){ supported, enabled }; \
+})
+
+static void help(char *name)
+{
+       int i;
+
+       puts("");
+       printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n"
+              "          [-b memory] [-v vcpus]\n", name);
+       printf(" -m: specify the guest mode ID to test\n"
+              "     (default: test all supported modes)\n"
+              "     This option may be used multiple times.\n"
+              "     Guest mode IDs:\n");
+       for (i = 0; i < NUM_VM_MODES; ++i) {
+               printf("         %d:    %s%s\n", i, vm_guest_mode_string(i),
+                      guest_modes[i].supported ? " (supported)" : "");
+       }
+       printf(" -u: use User Fault FD to handle vCPU page\n"
+              "     faults.\n");
+       printf(" -d: add a delay in usec to the User Fault\n"
+              "     FD handler to simulate demand paging\n"
+              "     overheads. Ignored without -u.\n");
+       printf(" -b: specify the size of the memory region which should be\n"
+              "     demand paged by each vCPU. e.g. 10M or 3G.\n"
+              "     Default: 1G\n");
+       printf(" -v: specify the number of vCPUs to run.\n");
+       puts("");
+       exit(0);        /* printing the usage text always ends the program */
+}
+
+/*
+ * Parse the command line, then run the test once for every enabled guest
+ * mode. Without -m, every mode registered for the build architecture is run.
+ */
+int main(int argc, char *argv[])
+{
+       bool mode_selected = false;
+       uint64_t vcpu_memory_bytes = DEFAULT_GUEST_TEST_MEM_SIZE;
+       int vcpus = 1;
+       unsigned int mode;
+       int opt, i;
+       bool use_uffd = false;
+       useconds_t uffd_delay = 0;
+
+#ifdef __x86_64__
+       guest_mode_init(VM_MODE_PXXV48_4K, true, true);
+#endif
+#ifdef __aarch64__
+       guest_mode_init(VM_MODE_P40V48_4K, true, true);
+       guest_mode_init(VM_MODE_P40V48_64K, true, true);
+       {
+               unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+               if (limit >= 52)
+                       guest_mode_init(VM_MODE_P52V48_64K, true, true);
+               if (limit >= 48) {
+                       guest_mode_init(VM_MODE_P48V48_4K, true, true);
+                       guest_mode_init(VM_MODE_P48V48_64K, true, true);
+               }
+       }
+#endif
+#ifdef __s390x__
+       guest_mode_init(VM_MODE_P40V48_4K, true, true);
+#endif
+
+       while ((opt = getopt(argc, argv, "hm:ud:b:v:")) != -1) {
+               switch (opt) {
+               case 'm':
+                       /* The first -m replaces the default "all modes" selection */
+                       if (!mode_selected) {
+                               for (i = 0; i < NUM_VM_MODES; ++i)
+                                       guest_modes[i].enabled = false;
+                               mode_selected = true;
+                       }
+                       mode = strtoul(optarg, NULL, 10);
+                       TEST_ASSERT(mode < NUM_VM_MODES,
+                                   "Guest mode ID %d too big", mode);
+                       guest_modes[mode].enabled = true;
+                       break;
+               case 'u':
+                       use_uffd = true;
+                       break;
+               case 'd': {
+                       /*
+                        * Parse as signed: useconds_t is unsigned, so a check
+                        * on uffd_delay itself could never catch a negative
+                        * value.
+                        */
+                       long delay = strtol(optarg, NULL, 0);
+
+                       TEST_ASSERT(delay >= 0,
+                                   "A negative UFFD delay is not supported.");
+                       uffd_delay = delay;
+                       break;
+               }
+               case 'b':
+                       vcpu_memory_bytes = parse_size(optarg);
+                       break;
+               case 'v':
+                       vcpus = atoi(optarg);
+                       TEST_ASSERT(vcpus > 0,
+                                   "Must have a positive number of vCPUs");
+                       TEST_ASSERT(vcpus <= MAX_VCPUS,
+                                   "This test does not currently support\n"
+                                   "more than %d vCPUs.", MAX_VCPUS);
+                       break;
+               case 'h':
+               default:
+                       help(argv[0]);
+                       break;
+               }
+       }
+
+       for (i = 0; i < NUM_VM_MODES; ++i) {
+               if (!guest_modes[i].enabled)
+                       continue;
+               TEST_ASSERT(guest_modes[i].supported,
+                           "Guest mode ID %d (%s) not supported.",
+                           i, vm_guest_mode_string(i));
+               run_test(i, use_uffd, uffd_delay, vcpus, vcpu_memory_bytes);
+       }
+
+       return 0;
+}
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+int main(void)
+{
+       print_skip("__NR_userfaultfd must be present for userfaultfd test");
+       return KSFT_SKIP;       /* built without __NR_userfaultfd: nothing to run */
+}
+
+#endif /* __NR_userfaultfd */
index 5614222..752ec15 100644 (file)
@@ -166,24 +166,22 @@ static void *vcpu_worker(void *data)
                        pages_count += TEST_PAGES_PER_LOOP;
                        generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
                } else {
-                       TEST_ASSERT(false,
-                                   "Invalid guest sync status: "
-                                   "exit_reason=%s\n",
-                                   exit_reason_str(run->exit_reason));
+                       TEST_FAIL("Invalid guest sync status: "
+                                 "exit_reason=%s\n",
+                                 exit_reason_str(run->exit_reason));
                }
        }
 
-       DEBUG("Dirtied %"PRIu64" pages\n", pages_count);
+       pr_info("Dirtied %"PRIu64" pages\n", pages_count);
 
        return NULL;
 }
 
-static void vm_dirty_log_verify(unsigned long *bmap)
+static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap)
 {
+       uint64_t step = vm_num_host_pages(mode, 1);
        uint64_t page;
        uint64_t *value_ptr;
-       uint64_t step = host_page_size >= guest_page_size ? 1 :
-                               guest_page_size / host_page_size;
 
        for (page = 0; page < host_num_pages; page += step) {
                value_ptr = host_test_mem + page * host_page_size;
@@ -252,6 +250,8 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
        struct kvm_vm *vm;
        uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;
 
+       pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+
        vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
        kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
 #ifdef __x86_64__
@@ -264,6 +264,10 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
 #define DIRTY_MEM_BITS 30 /* 1G */
 #define PAGE_SHIFT_4K  12
 
+#ifdef USE_CLEAR_DIRTY_LOG
+static u64 dirty_log_manual_caps;
+#endif
+
 static void run_test(enum vm_guest_mode mode, unsigned long iterations,
                     unsigned long interval, uint64_t phys_offset)
 {
@@ -289,14 +293,11 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
         * case where the size is not aligned to 64 pages.
         */
        guest_num_pages = (1ul << (DIRTY_MEM_BITS -
-                                  vm_get_page_shift(vm))) + 16;
-#ifdef __s390x__
-       /* Round up to multiple of 1M (segment size) */
-       guest_num_pages = (guest_num_pages + 0xff) & ~0xffUL;
-#endif
+                                  vm_get_page_shift(vm))) + 3;
+       guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
+
        host_page_size = getpagesize();
-       host_num_pages = (guest_num_pages * guest_page_size) / host_page_size +
-                        !!((guest_num_pages * guest_page_size) % host_page_size);
+       host_num_pages = vm_num_host_pages(mode, guest_num_pages);
 
        if (!phys_offset) {
                guest_test_phys_mem = (vm_get_max_gfn(vm) -
@@ -311,7 +312,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
        guest_test_phys_mem &= ~((1 << 20) - 1);
 #endif
 
-       DEBUG("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
+       pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
 
        bmap = bitmap_alloc(host_num_pages);
        host_bmap_track = bitmap_alloc(host_num_pages);
@@ -320,7 +321,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
        struct kvm_enable_cap cap = {};
 
        cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
-       cap.args[0] = 1;
+       cap.args[0] = dirty_log_manual_caps;
        vm_enable_cap(vm, &cap);
 #endif
 
@@ -332,8 +333,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
                                    KVM_MEM_LOG_DIRTY_PAGES);
 
        /* Do mapping for the dirty track memory slot */
-       virt_map(vm, guest_test_virt_mem, guest_test_phys_mem,
-                guest_num_pages * guest_page_size, 0);
+       virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
 
        /* Cache the HVA pointer of the region */
        host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
@@ -341,9 +341,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
 #ifdef __x86_64__
        vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
 #endif
-#ifdef __aarch64__
        ucall_init(vm, NULL);
-#endif
 
        /* Export the shared variables to the guest */
        sync_global_to_guest(vm, host_page_size);
@@ -369,7 +367,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
                kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0,
                                       host_num_pages);
 #endif
-               vm_dirty_log_verify(bmap);
+               vm_dirty_log_verify(mode, bmap);
                iteration++;
                sync_global_to_guest(vm, iteration);
        }
@@ -378,9 +376,9 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
        host_quit = true;
        pthread_join(vcpu_thread, NULL);
 
-       DEBUG("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
-             "track_next (%"PRIu64")\n", host_dirty_count, host_clear_count,
-             host_track_next_count);
+       pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
+               "track_next (%"PRIu64")\n", host_dirty_count, host_clear_count,
+               host_track_next_count);
 
        free(bmap);
        free(host_bmap_track);
@@ -388,15 +386,14 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
        kvm_vm_free(vm);
 }
 
-struct vm_guest_mode_params {
+struct guest_mode {
        bool supported;
        bool enabled;
 };
-struct vm_guest_mode_params vm_guest_mode_params[NUM_VM_MODES];
+static struct guest_mode guest_modes[NUM_VM_MODES];
 
-#define vm_guest_mode_params_init(mode, supported, enabled)                                    \
-({                                                                                             \
-       vm_guest_mode_params[mode] = (struct vm_guest_mode_params){ supported, enabled };       \
+#define guest_mode_init(mode, supported, enabled) ({ \
+       guest_modes[mode] = (struct guest_mode){ supported, enabled }; \
 })
 
 static void help(char *name)
@@ -419,7 +416,7 @@ static void help(char *name)
               "     Guest mode IDs:\n");
        for (i = 0; i < NUM_VM_MODES; ++i) {
                printf("         %d:    %s%s\n", i, vm_guest_mode_string(i),
-                      vm_guest_mode_params[i].supported ? " (supported)" : "");
+                      guest_modes[i].supported ? " (supported)" : "");
        }
        puts("");
        exit(0);
@@ -433,34 +430,38 @@ int main(int argc, char *argv[])
        uint64_t phys_offset = 0;
        unsigned int mode;
        int opt, i;
-#ifdef __aarch64__
-       unsigned int host_ipa_limit;
-#endif
 
 #ifdef USE_CLEAR_DIRTY_LOG
-       if (!kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2)) {
-               fprintf(stderr, "KVM_CLEAR_DIRTY_LOG not available, skipping tests\n");
+       dirty_log_manual_caps =
+               kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+       if (!dirty_log_manual_caps) {
+               print_skip("KVM_CLEAR_DIRTY_LOG not available");
                exit(KSFT_SKIP);
        }
+       dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+                                 KVM_DIRTY_LOG_INITIALLY_SET);
 #endif
 
 #ifdef __x86_64__
-       vm_guest_mode_params_init(VM_MODE_PXXV48_4K, true, true);
+       guest_mode_init(VM_MODE_PXXV48_4K, true, true);
 #endif
 #ifdef __aarch64__
-       vm_guest_mode_params_init(VM_MODE_P40V48_4K, true, true);
-       vm_guest_mode_params_init(VM_MODE_P40V48_64K, true, true);
-
-       host_ipa_limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
-       if (host_ipa_limit >= 52)
-               vm_guest_mode_params_init(VM_MODE_P52V48_64K, true, true);
-       if (host_ipa_limit >= 48) {
-               vm_guest_mode_params_init(VM_MODE_P48V48_4K, true, true);
-               vm_guest_mode_params_init(VM_MODE_P48V48_64K, true, true);
+       guest_mode_init(VM_MODE_P40V48_4K, true, true);
+       guest_mode_init(VM_MODE_P40V48_64K, true, true);
+
+       {
+               unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+               if (limit >= 52)
+                       guest_mode_init(VM_MODE_P52V48_64K, true, true);
+               if (limit >= 48) {
+                       guest_mode_init(VM_MODE_P48V48_4K, true, true);
+                       guest_mode_init(VM_MODE_P48V48_64K, true, true);
+               }
        }
 #endif
 #ifdef __s390x__
-       vm_guest_mode_params_init(VM_MODE_P40V48_4K, true, true);
+       guest_mode_init(VM_MODE_P40V48_4K, true, true);
 #endif
 
        while ((opt = getopt(argc, argv, "hi:I:p:m:")) != -1) {
@@ -477,13 +478,13 @@ int main(int argc, char *argv[])
                case 'm':
                        if (!mode_selected) {
                                for (i = 0; i < NUM_VM_MODES; ++i)
-                                       vm_guest_mode_params[i].enabled = false;
+                                       guest_modes[i].enabled = false;
                                mode_selected = true;
                        }
                        mode = strtoul(optarg, NULL, 10);
                        TEST_ASSERT(mode < NUM_VM_MODES,
                                    "Guest mode ID %d too big", mode);
-                       vm_guest_mode_params[mode].enabled = true;
+                       guest_modes[mode].enabled = true;
                        break;
                case 'h':
                default:
@@ -495,15 +496,15 @@ int main(int argc, char *argv[])
        TEST_ASSERT(iterations > 2, "Iterations must be greater than two");
        TEST_ASSERT(interval > 0, "Interval must be greater than zero");
 
-       DEBUG("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
-             iterations, interval);
+       pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
+               iterations, interval);
 
        srandom(time(0));
 
        for (i = 0; i < NUM_VM_MODES; ++i) {
-               if (!vm_guest_mode_params[i].enabled)
+               if (!guest_modes[i].enabled)
                        continue;
-               TEST_ASSERT(vm_guest_mode_params[i].supported,
+               TEST_ASSERT(guest_modes[i].supported,
                            "Guest mode ID %d (%s) not supported.",
                            i, vm_guest_mode_string(i));
                run_test(i, iterations, interval, phys_offset);
index 4912d23..d8f4d6b 100644 (file)
@@ -16,6 +16,8 @@
 #define u32 uint32_t
 #define u64 uint64_t
 
+#define EVMCS_VERSION 1
+
 extern bool enable_evmcs;
 
 struct hv_vp_assist_page {
index ae0d14c..a99b875 100644 (file)
@@ -16,7 +16,8 @@
 #include "sparsebit.h"
 
 
-/* Callers of kvm_util only have an incomplete/opaque description of the
+/*
+ * Callers of kvm_util only have an incomplete/opaque description of the
  * structure kvm_util is using to maintain the state of a VM.
  */
 struct kvm_vm;
@@ -24,12 +25,6 @@ struct kvm_vm;
 typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */
 typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
 
-#ifndef NDEBUG
-#define DEBUG(...) printf(__VA_ARGS__);
-#else
-#define DEBUG(...)
-#endif
-
 /* Minimum allocated guest virtual and physical addresses */
 #define KVM_UTIL_MIN_VADDR             0x2000
 
@@ -84,6 +79,23 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename,
                     uint32_t data_memslot, uint32_t pgd_memslot);
 
 void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
+
+/*
+ * VM VCPU Dump
+ *
+ * Input Args:
+ *   stream - Output FILE stream
+ *   vm     - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the current state of the VCPU specified by @vcpuid, within the VM
+ * given by @vm, to the FILE stream given by @stream.
+ */
 void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid,
               uint8_t indent);
 
@@ -100,14 +112,31 @@ int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
                void *arg);
 void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
 void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
+void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
 void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid);
 vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
                          uint32_t data_memslot, uint32_t pgd_memslot);
 void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
-             size_t size, uint32_t pgd_memslot);
+             unsigned int npages, uint32_t pgd_memslot);
 void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa);
 void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva);
 vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
+
+/*
+ * Address Guest Virtual to Guest Physical
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   gva - VM virtual address
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Equivalent VM physical address
+ *
+ * Returns the VM physical address of the translated VM virtual
+ * address given by @gva.
+ */
 vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva);
 
 struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid);
@@ -118,7 +147,27 @@ void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
                       struct kvm_mp_state *mp_state);
 void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
 void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
+
+/*
+ * VM VCPU Args Set
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   num - number of arguments
+ *   ... - arguments, each of type uint64_t
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the first @num function input registers of the VCPU with @vcpuid,
+ * per the C calling convention of the architecture, to the values given
+ * as variable args. Each of the variable args is expected to be of type
+ * uint64_t. The maximum @num can be is specific to the architecture.
+ */
 void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...);
+
 void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid,
                    struct kvm_sregs *sregs);
 void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid,
@@ -147,15 +196,57 @@ int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid,
 const char *exit_reason_str(unsigned int exit_reason);
 
 void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot);
+
+/*
+ * VM Virtual Page Map
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vaddr - VM Virtual Address
+ *   paddr - VM Physical Address
+ *   memslot - Memory region slot for new virtual translation tables
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Within @vm, creates a virtual translation for the page starting
+ * at @vaddr to the page starting at @paddr.
+ */
 void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
-                uint32_t pgd_memslot);
+                uint32_t memslot);
+
 vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
                             uint32_t memslot);
 vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
                              vm_paddr_t paddr_min, uint32_t memslot);
 
-struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_size,
+/*
+ * Create a VM with reasonable defaults
+ *
+ * Input Args:
+ *   vcpuid - The id of the single VCPU to add to the VM.
+ *   extra_mem_pages - The number of extra pages to add (this will
+ *                     decide how much extra space we will need to
+ *                     setup the page tables using memslot 0)
+ *   guest_code - The vCPU's entry point
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Pointer to opaque structure that describes the created VM.
+ */
+struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
                                 void *guest_code);
+
+/*
+ * Adds a vCPU with reasonable defaults (e.g. a stack)
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - The id of the VCPU to add to the VM.
+ *   guest_code - The vCPU's entry point
+ */
 void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
 
 bool vm_is_unrestricted_guest(struct kvm_vm *vm);
@@ -164,6 +255,21 @@ unsigned int vm_get_page_size(struct kvm_vm *vm);
 unsigned int vm_get_page_shift(struct kvm_vm *vm);
 unsigned int vm_get_max_gfn(struct kvm_vm *vm);
 
+unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
+unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages);
+unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages);
+static inline unsigned int
+vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
+{
+       unsigned int n;
+       n = vm_num_guest_pages(mode, vm_num_host_pages(mode, num_guest_pages));
+#ifdef __s390x__
+       /* s390 requires 1M aligned guest sizes */
+       n = (n + 255) & ~255;
+#endif
+       return n;
+}
+
 struct kvm_userspace_memory_region *
 kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
                                 uint64_t end);
index a41db6f..5eb01bf 100644 (file)
 #include <fcntl.h>
 #include "kselftest.h"
 
+static inline int _no_printf(const char *format, ...) { return 0; }
+
+#ifdef DEBUG
+#define pr_debug(...) printf(__VA_ARGS__)
+#else
+#define pr_debug(...) _no_printf(__VA_ARGS__)
+#endif
+#ifndef QUIET
+#define pr_info(...) printf(__VA_ARGS__)
+#else
+#define pr_info(...) _no_printf(__VA_ARGS__)
+#endif
+
+void print_skip(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
+
 ssize_t test_write(int fd, const void *buf, size_t count);
 ssize_t test_read(int fd, void *buf, size_t count);
 int test_seq_read(const char *path, char **bufp, size_t *sizep);
 
 void test_assert(bool exp, const char *exp_str,
-                const char *file, unsigned int line, const char *fmt, ...);
+                const char *file, unsigned int line, const char *fmt, ...)
+               __attribute__((format(printf, 5, 6)));
 
 #define TEST_ASSERT(e, fmt, ...) \
        test_assert((e), #e, __FILE__, __LINE__, fmt, ##__VA_ARGS__)
@@ -39,4 +55,14 @@ void test_assert(bool exp, const char *exp_str,
                    #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); \
 } while (0)
 
+#define TEST_FAIL(fmt, ...) \
+       TEST_ASSERT(false, fmt, ##__VA_ARGS__)
+
+size_t parse_size(const char *size);
+
+int64_t timespec_to_ns(struct timespec ts);
+struct timespec timespec_add_ns(struct timespec ts, int64_t ns);
+struct timespec timespec_add(struct timespec ts1, struct timespec ts2);
+struct timespec timespec_sub(struct timespec ts1, struct timespec ts2);
+
 #endif /* SELFTEST_KVM_TEST_UTIL_H */
index 6f38c3d..0299cd8 100644 (file)
@@ -24,8 +24,8 @@ void test_vcpu_creation(int first_vcpu_id, int num_vcpus)
        struct kvm_vm *vm;
        int i;
 
-       printf("Testing creating %d vCPUs, with IDs %d...%d.\n",
-              num_vcpus, first_vcpu_id, first_vcpu_id + num_vcpus - 1);
+       pr_info("Testing creating %d vCPUs, with IDs %d...%d.\n",
+               num_vcpus, first_vcpu_id, first_vcpu_id + num_vcpus - 1);
 
        vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
 
@@ -41,8 +41,8 @@ int main(int argc, char *argv[])
        int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID);
        int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
 
-       printf("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id);
-       printf("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus);
+       pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id);
+       pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus);
 
        /*
         * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID.
index 86036a5..2afa661 100644 (file)
@@ -130,7 +130,7 @@ void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
                ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8;
                break;
        default:
-               TEST_ASSERT(false, "Page table levels must be 2, 3, or 4");
+               TEST_FAIL("Page table levels must be 2, 3, or 4");
        }
 
        *ptep = paddr | 3;
@@ -173,20 +173,19 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
                        goto unmapped_gva;
                break;
        default:
-               TEST_ASSERT(false, "Page table levels must be 2, 3, or 4");
+               TEST_FAIL("Page table levels must be 2, 3, or 4");
        }
 
        return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1));
 
 unmapped_gva:
-       TEST_ASSERT(false, "No mapping for vm virtual address, "
-                   "gva: 0x%lx", gva);
+       TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
        exit(1);
 }
 
 static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level)
 {
-#ifdef DEBUG_VM
+#ifdef DEBUG
        static const char * const type[] = { "", "pud", "pmd", "pte" };
        uint64_t pte, *ptep;
 
@@ -197,7 +196,7 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t p
                ptep = addr_gpa2hva(vm, pte);
                if (!*ptep)
                        continue;
-               printf("%*s%s: %lx: %lx at %p\n", indent, "", type[level], pte, *ptep, ptep);
+               fprintf(stream, "%*s%s: %lx: %lx at %p\n", indent, "", type[level], pte, *ptep, ptep);
                pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level + 1);
        }
 #endif
@@ -215,7 +214,7 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
                ptep = addr_gpa2hva(vm, pgd);
                if (!*ptep)
                        continue;
-               printf("%*spgd: %lx: %lx at %p\n", indent, "", pgd, *ptep, ptep);
+               fprintf(stream, "%*spgd: %lx: %lx at %p\n", indent, "", pgd, *ptep, ptep);
                pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level);
        }
 }
@@ -262,11 +261,11 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini
 
        switch (vm->mode) {
        case VM_MODE_P52V48_4K:
-               TEST_ASSERT(false, "AArch64 does not support 4K sized pages "
-                                  "with 52-bit physical address ranges");
+               TEST_FAIL("AArch64 does not support 4K sized pages "
+                         "with 52-bit physical address ranges");
        case VM_MODE_PXXV48_4K:
-               TEST_ASSERT(false, "AArch64 does not support 4K sized pages "
-                                  "with ANY-bit physical address ranges");
+               TEST_FAIL("AArch64 does not support 4K sized pages "
+                         "with ANY-bit physical address ranges");
        case VM_MODE_P52V48_64K:
                tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
                tcr_el1 |= 6ul << 32; /* IPS = 52 bits */
@@ -288,7 +287,7 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini
                tcr_el1 |= 2ul << 32; /* IPS = 40 bits */
                break;
        default:
-               TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode);
+               TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
        }
 
        sctlr_el1 |= (1 << 0) | (1 << 2) | (1 << 12) /* M | C | I */;
@@ -333,3 +332,21 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
 {
        aarch64_vcpu_add_default(vm, vcpuid, NULL, guest_code);
 }
+
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
+{
+       va_list ap;
+       int i;
+
+       TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n"
+                   "  num: %u\n", num);
+
+       va_start(ap, num);
+
+       for (i = 0; i < num; i++) {
+               set_reg(vm, vcpuid, ARM64_CORE_REG(regs.regs[i]),
+                       va_arg(ap, uint64_t));
+       }
+
+       va_end(ap);
+}
index 6cd9197..c8e0ec2 100644 (file)
@@ -62,7 +62,7 @@ void ucall_init(struct kvm_vm *vm, void *arg)
                if (ucall_mmio_init(vm, start + offset))
                        return;
        }
-       TEST_ASSERT(false, "Can't find a ucall mmio address");
+       TEST_FAIL("Can't find a ucall mmio address");
 }
 
 void ucall_uninit(struct kvm_vm *vm)
index d1cf9f6..5ebbd0d 100644 (file)
@@ -82,8 +82,10 @@ test_assert(bool exp, const char *exp_str,
                }
                va_end(ap);
 
-               if (errno == EACCES)
-                       ksft_exit_skip("Access denied - Exiting.\n");
+               if (errno == EACCES) {
+                       print_skip("Access denied - Exiting");
+                       exit(KSFT_SKIP);
+               }
                exit(254);
        }
 
index eaf351c..fedb2a7 100644 (file)
@@ -61,9 +61,9 @@ ssize_t test_write(int fd, const void *buf, size_t count)
                        continue;
 
                case 0:
-                       TEST_ASSERT(false, "Unexpected EOF,\n"
-                                   "  rc: %zi num_written: %zi num_left: %zu",
-                                   rc, num_written, num_left);
+                       TEST_FAIL("Unexpected EOF,\n"
+                                 "  rc: %zi num_written: %zi num_left: %zu",
+                                 rc, num_written, num_left);
                        break;
 
                default:
@@ -138,9 +138,9 @@ ssize_t test_read(int fd, void *buf, size_t count)
                        break;
 
                case 0:
-                       TEST_ASSERT(false, "Unexpected EOF,\n"
-                                   "  rc: %zi num_read: %zi num_left: %zu",
-                                   rc, num_read, num_left);
+                       TEST_FAIL("Unexpected EOF,\n"
+                                 "  rc: %zi num_read: %zi num_left: %zu",
+                                 rc, num_read, num_left);
                        break;
 
                default:
index a6dd040..8a3523d 100644 (file)
@@ -92,7 +92,7 @@ static void vm_open(struct kvm_vm *vm, int perm)
                exit(KSFT_SKIP);
 
        if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) {
-               fprintf(stderr, "immediate_exit not available, skipping test\n");
+               print_skip("immediate_exit not available");
                exit(KSFT_SKIP);
        }
 
@@ -113,6 +113,25 @@ const char * const vm_guest_mode_string[] = {
 _Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
               "Missing new mode strings?");
 
+struct vm_guest_mode_params {
+       unsigned int pa_bits;
+       unsigned int va_bits;
+       unsigned int page_size;
+       unsigned int page_shift;
+};
+
+static const struct vm_guest_mode_params vm_guest_mode_params[] = {
+       { 52, 48,  0x1000, 12 },
+       { 52, 48, 0x10000, 16 },
+       { 48, 48,  0x1000, 12 },
+       { 48, 48, 0x10000, 16 },
+       { 40, 48,  0x1000, 12 },
+       { 40, 48, 0x10000, 16 },
+       {  0,  0,  0x1000, 12 },
+};
+_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
+              "Missing new mode params?");
+
 /*
  * VM Create
  *
@@ -136,7 +155,8 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 {
        struct kvm_vm *vm;
 
-       DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+       pr_debug("%s: mode='%s' pages='%ld' perm='%d'\n", __func__,
+                vm_guest_mode_string(mode), phy_pages, perm);
 
        vm = calloc(1, sizeof(*vm));
        TEST_ASSERT(vm != NULL, "Insufficient Memory");
@@ -144,67 +164,45 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
        vm->mode = mode;
        vm->type = 0;
 
+       vm->pa_bits = vm_guest_mode_params[mode].pa_bits;
+       vm->va_bits = vm_guest_mode_params[mode].va_bits;
+       vm->page_size = vm_guest_mode_params[mode].page_size;
+       vm->page_shift = vm_guest_mode_params[mode].page_shift;
+
        /* Setup mode specific traits. */
        switch (vm->mode) {
        case VM_MODE_P52V48_4K:
                vm->pgtable_levels = 4;
-               vm->pa_bits = 52;
-               vm->va_bits = 48;
-               vm->page_size = 0x1000;
-               vm->page_shift = 12;
                break;
        case VM_MODE_P52V48_64K:
                vm->pgtable_levels = 3;
-               vm->pa_bits = 52;
-               vm->va_bits = 48;
-               vm->page_size = 0x10000;
-               vm->page_shift = 16;
                break;
        case VM_MODE_P48V48_4K:
                vm->pgtable_levels = 4;
-               vm->pa_bits = 48;
-               vm->va_bits = 48;
-               vm->page_size = 0x1000;
-               vm->page_shift = 12;
                break;
        case VM_MODE_P48V48_64K:
                vm->pgtable_levels = 3;
-               vm->pa_bits = 48;
-               vm->va_bits = 48;
-               vm->page_size = 0x10000;
-               vm->page_shift = 16;
                break;
        case VM_MODE_P40V48_4K:
                vm->pgtable_levels = 4;
-               vm->pa_bits = 40;
-               vm->va_bits = 48;
-               vm->page_size = 0x1000;
-               vm->page_shift = 12;
                break;
        case VM_MODE_P40V48_64K:
                vm->pgtable_levels = 3;
-               vm->pa_bits = 40;
-               vm->va_bits = 48;
-               vm->page_size = 0x10000;
-               vm->page_shift = 16;
                break;
        case VM_MODE_PXXV48_4K:
 #ifdef __x86_64__
                kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
                TEST_ASSERT(vm->va_bits == 48, "Linear address width "
                            "(%d bits) not supported", vm->va_bits);
+               pr_debug("Guest physical address width detected: %d\n",
+                        vm->pa_bits);
                vm->pgtable_levels = 4;
-               vm->page_size = 0x1000;
-               vm->page_shift = 12;
-               DEBUG("Guest physical address width detected: %d\n",
-                     vm->pa_bits);
 #else
-               TEST_ASSERT(false, "VM_MODE_PXXV48_4K not supported on "
-                           "non-x86 platforms");
+               TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms");
 #endif
                break;
        default:
-               TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode);
+               TEST_FAIL("Unknown guest mode, mode: 0x%x", mode);
        }
 
 #ifdef __aarch64__
@@ -266,7 +264,7 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm)
                TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
                            "  rc: %i errno: %i\n"
                            "  slot: %u flags: 0x%x\n"
-                           "  guest_phys_addr: 0x%lx size: 0x%lx",
+                           "  guest_phys_addr: 0x%llx size: 0x%llx",
                            ret, errno, region->region.slot,
                            region->region.flags,
                            region->region.guest_phys_addr,
@@ -281,7 +279,7 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
 
        ret = ioctl(vm->fd, KVM_GET_DIRTY_LOG, &args);
        TEST_ASSERT(ret == 0, "%s: KVM_GET_DIRTY_LOG failed: %s",
-                   strerror(-ret));
+                   __func__, strerror(-ret));
 }
 
 void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
@@ -294,7 +292,7 @@ void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
 
        ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);
        TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s",
-                   strerror(-ret));
+                   __func__, strerror(-ret));
 }
 
 /*
@@ -582,6 +580,10 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
        size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
        size_t alignment;
 
+       TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
+               "Number of guest pages is not compatible with the host. "
+               "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages));
+
        TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical "
                "address not on a page boundary.\n"
                "  guest_paddr: 0x%lx vm->page_size: 0x%x",
@@ -600,7 +602,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
        region = (struct userspace_mem_region *) userspace_mem_region_find(
                vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1);
        if (region != NULL)
-               TEST_ASSERT(false, "overlapping userspace_mem_region already "
+               TEST_FAIL("overlapping userspace_mem_region already "
                        "exists\n"
                        "  requested guest_paddr: 0x%lx npages: 0x%lx "
                        "page_size: 0x%x\n"
@@ -616,7 +618,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
                        break;
        }
        if (region != NULL)
-               TEST_ASSERT(false, "A mem region with the requested slot "
+               TEST_FAIL("A mem region with the requested slot "
                        "already exists.\n"
                        "  requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
                        "  existing slot: %u paddr: 0x%lx size: 0x%lx",
@@ -720,7 +722,7 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot)
                        "  requested slot: %u\n", memslot);
                fputs("---- vm dump ----\n", stderr);
                vm_dump(stderr, vm, 2);
-               TEST_ASSERT(false, "Mem region not found");
+               TEST_FAIL("Mem region not found");
        }
 
        return region;
@@ -757,6 +759,36 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
 }
 
 /*
+ * VM Memory Region Move
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   slot - Slot of the memory region to move
+ *   new_gpa - Starting guest physical address
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Change the gpa of a memory region.
+ */
+void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
+{
+       struct userspace_mem_region *region;
+       int ret;
+
+       region = memslot2region(vm, slot);
+
+       region->region.guest_phys_addr = new_gpa;
+
+       ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+
+       TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n"
+                   "ret: %i errno: %i slot: %u new_gpa: 0x%lx",
+                   ret, errno, slot, new_gpa);
+}
+
+/*
  * VCPU mmap Size
  *
  * Input Args: None
@@ -808,7 +840,7 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid)
        /* Confirm a vcpu with the specified id doesn't already exist. */
        vcpu = vcpu_find(vm, vcpuid);
        if (vcpu != NULL)
-               TEST_ASSERT(false, "vcpu with the specified id "
+               TEST_FAIL("vcpu with the specified id "
                        "already exists,\n"
                        "  requested vcpuid: %u\n"
                        "  existing vcpuid: %u state: %p",
@@ -901,8 +933,7 @@ static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
        } while (pgidx_start != 0);
 
 no_va_found:
-       TEST_ASSERT(false, "No vaddr of specified pages available, "
-               "pages: 0x%lx", pages);
+       TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages);
 
        /* NOT REACHED */
        return -1;
@@ -982,21 +1013,21 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
  *   vm - Virtual Machine
  *   vaddr - Virtuall address to map
  *   paddr - VM Physical Address
- *   size - The size of the range to map
+ *   npages - The number of pages to map
  *   pgd_memslot - Memory region slot for new virtual translation tables
  *
  * Output Args: None
  *
  * Return: None
  *
- * Within the VM given by vm, creates a virtual translation for the
- * page range starting at vaddr to the page range starting at paddr.
+ * Within the VM given by @vm, creates a virtual translation for
+ * @npages starting at @vaddr to the page range starting at @paddr.
  */
 void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
-             size_t size, uint32_t pgd_memslot)
+             unsigned int npages, uint32_t pgd_memslot)
 {
        size_t page_size = vm->page_size;
-       size_t npages = size / page_size;
+       size_t size = npages * page_size;
 
        TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow");
        TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
@@ -1037,7 +1068,7 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
                                + (gpa - region->region.guest_phys_addr));
        }
 
-       TEST_ASSERT(false, "No vm physical memory at 0x%lx", gpa);
+       TEST_FAIL("No vm physical memory at 0x%lx", gpa);
        return NULL;
 }
 
@@ -1071,8 +1102,7 @@ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
                                + (hva - (uintptr_t) region->host_mem));
        }
 
-       TEST_ASSERT(false, "No mapping to a guest physical address, "
-               "hva: %p", hva);
+       TEST_FAIL("No mapping to a guest physical address, hva: %p", hva);
        return -1;
 }
 
@@ -1703,3 +1733,43 @@ unsigned int vm_get_max_gfn(struct kvm_vm *vm)
 {
        return vm->max_gfn;
 }
+
+static unsigned int vm_calc_num_pages(unsigned int num_pages,
+                                     unsigned int page_shift,
+                                     unsigned int new_page_shift,
+                                     bool ceil)
+{
+       unsigned int n = 1 << (new_page_shift - page_shift);
+
+       if (page_shift >= new_page_shift)
+               return num_pages * (1 << (page_shift - new_page_shift));
+
+       return num_pages / n + !!(ceil && num_pages % n);
+}
+
+static inline int getpageshift(void)
+{
+       return __builtin_ffs(getpagesize()) - 1;
+}
+
+unsigned int
+vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
+{
+       return vm_calc_num_pages(num_guest_pages,
+                                vm_guest_mode_params[mode].page_shift,
+                                getpageshift(), true);
+}
+
+unsigned int
+vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages)
+{
+       return vm_calc_num_pages(num_host_pages, getpageshift(),
+                                vm_guest_mode_params[mode].page_shift, false);
+}
+
+unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size)
+{
+       unsigned int n;
+       n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size);
+       return vm_adjust_num_guest_pages(mode, n);
+}
index ac50c42..ca56a01 100644 (file)
 
 #define KVM_DEV_PATH           "/dev/kvm"
 
-#ifndef BITS_PER_BYTE
-#define BITS_PER_BYTE          8
-#endif
-
-#ifndef BITS_PER_LONG
-#define BITS_PER_LONG          (BITS_PER_BYTE * sizeof(long))
-#endif
-
-#define DIV_ROUND_UP(n, d)     (((n) + (d) - 1) / (d))
-#define BITS_TO_LONGS(nr)      DIV_ROUND_UP(nr, BITS_PER_LONG)
-
 struct userspace_mem_region {
        struct userspace_mem_region *next, *prev;
        struct kvm_userspace_memory_region region;
@@ -64,8 +53,56 @@ struct kvm_vm {
 };
 
 struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid);
+
+/*
+ * Virtual Translation Tables Dump
+ *
+ * Input Args:
+ *   stream - Output FILE stream
+ *   vm     - Virtual Machine
+ *   indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps to the FILE stream given by @stream, the contents of all the
+ * virtual translation tables for the VM given by @vm.
+ */
 void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
+
+/*
+ * Register Dump
+ *
+ * Input Args:
+ *   stream - Output FILE stream
+ *   regs   - Registers
+ *   indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the state of the registers given by @regs, to the FILE stream
+ * given by @stream.
+ */
 void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent);
+
+/*
+ * System Register Dump
+ *
+ * Input Args:
+ *   stream - Output FILE stream
+ *   sregs  - System registers
+ *   indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the state of the system registers given by @sregs, to the FILE stream
+ * given by @stream.
+ */
 void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent);
 
 struct userspace_mem_region *
index 32a0236..8d94961 100644 (file)
@@ -51,22 +51,6 @@ static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot)
                | ((ri < 4 ? (PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH);
 }
 
-/*
- * VM Virtual Page Map
- *
- * Input Args:
- *   vm - Virtual Machine
- *   gva - VM Virtual Address
- *   gpa - VM Physical Address
- *   memslot - Memory region slot for new virtual translation tables
- *
- * Output Args: None
- *
- * Return: None
- *
- * Within the VM given by vm, creates a virtual translation for the page
- * starting at vaddr to the page starting at paddr.
- */
 void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa,
                 uint32_t memslot)
 {
@@ -107,26 +91,6 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa,
        entry[idx] = gpa;
 }
 
-/*
- * Address Guest Virtual to Guest Physical
- *
- * Input Args:
- *   vm - Virtual Machine
- *   gpa - VM virtual address
- *
- * Output Args: None
- *
- * Return:
- *   Equivalent VM physical address
- *
- * Translates the VM virtual address given by gva to a VM physical
- * address and then locates the memory region containing the VM
- * physical address, within the VM given by vm.  When found, the host
- * virtual address providing the memory to the vm physical address is
- * returned.
- * A TEST_ASSERT failure occurs if no region containing translated
- * VM virtual address exists.
- */
 vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 {
        int ri, idx;
@@ -196,21 +160,6 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
        virt_dump_region(stream, vm, indent, vm->pgd);
 }
 
-/*
- * Create a VM with reasonable defaults
- *
- * Input Args:
- *   vcpuid - The id of the single VCPU to add to the VM.
- *   extra_mem_pages - The size of extra memories to add (this will
- *                     decide how much extra space we will need to
- *                     setup the page tables using mem slot 0)
- *   guest_code - The vCPU's entry point
- *
- * Output Args: None
- *
- * Return:
- *   Pointer to opaque structure that describes the created VM.
- */
 struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
                                 void *guest_code)
 {
@@ -231,13 +180,6 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
        return vm;
 }
 
-/*
- * Adds a vCPU with reasonable defaults (i.e. a stack and initial PSW)
- *
- * Input Args:
- *   vcpuid - The id of the VCPU to add to the VM.
- *   guest_code - The vCPU's entry point
- */
 void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
 {
        size_t stack_size =  DEFAULT_STACK_PGS * getpagesize();
@@ -269,6 +211,26 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
        run->psw_addr = (uintptr_t)guest_code;
 }
 
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
+{
+       va_list ap;
+       struct kvm_regs regs;
+       int i;
+
+       TEST_ASSERT(num >= 1 && num <= 5, "Unsupported number of args,\n"
+                   "  num: %u\n",
+                   num);
+
+       va_start(ap, num);
+       vcpu_regs_get(vm, vcpuid, &regs);
+
+       for (i = 0; i < num; i++)
+               regs.gprs[i + 2] = va_arg(ap, uint64_t);
+
+       vcpu_regs_set(vm, vcpuid, &regs);
+       va_end(ap);
+}
+
 void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
 {
        struct vcpu *vcpu = vm->vcpu_head;
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
new file mode 100644 (file)
index 0000000..689e97c
--- /dev/null
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/test_util.c
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+#include <stdlib.h>
+#include <ctype.h>
+#include <limits.h>
+#include <assert.h>
+#include "test_util.h"
+
+/*
+ * Parses "[0-9]+[kmgtb]?": strtoull() base-0 number, case-insensitive suffix.
+ */
+size_t parse_size(const char *size)
+{
+       size_t base;
+       char *scale;
+       int shift = 0;
+
+       TEST_ASSERT(size && isdigit((unsigned char)size[0]), "Need at least one digit in '%s'", size);
+
+       base = strtoull(size, &scale, 0);
+
+       TEST_ASSERT(base != ULLONG_MAX, "Overflow parsing size!");
+
+       switch (tolower((unsigned char)*scale)) {
+       case 't':
+               shift = 40;
+               break;
+       case 'g':
+               shift = 30;
+               break;
+       case 'm':
+               shift = 20;
+               break;
+       case 'k':
+               shift = 10;
+               break;
+       case 'b':
+       case '\0':
+               shift = 0;
+               break;
+       default:
+               TEST_ASSERT(false, "Unknown size letter %c", *scale);
+       }
+
+       TEST_ASSERT((base << shift) >> shift == base, "Overflow scaling size!");
+
+       return base << shift;
+}
+
+int64_t timespec_to_ns(struct timespec ts)
+{
+       return (int64_t)ts.tv_nsec + 1000000000LL * (int64_t)ts.tv_sec;
+}
+
+struct timespec timespec_add_ns(struct timespec ts, int64_t ns)
+{
+       struct timespec res;
+
+       res.tv_nsec = ts.tv_nsec + ns;
+       res.tv_sec = ts.tv_sec + res.tv_nsec / 1000000000LL;
+       res.tv_nsec %= 1000000000LL;
+
+       return res;
+}
+
+struct timespec timespec_add(struct timespec ts1, struct timespec ts2)
+{
+       int64_t ns1 = timespec_to_ns(ts1);
+       int64_t ns2 = timespec_to_ns(ts2);
+       return timespec_add_ns((struct timespec){0}, ns1 + ns2);
+}
+
+struct timespec timespec_sub(struct timespec ts1, struct timespec ts2)
+{
+       int64_t ns1 = timespec_to_ns(ts1);
+       int64_t ns2 = timespec_to_ns(ts2);
+       return timespec_add_ns((struct timespec){0}, ns1 - ns2);
+}
+
+void print_skip(const char *fmt, ...)
+{
+       va_list ap;
+
+       assert(fmt);
+       va_start(ap, fmt);
+       vprintf(fmt, ap);
+       va_end(ap);
+       puts(", skipping test");
+}
index 683d3bd..f6eb34e 100644 (file)
@@ -77,20 +77,6 @@ struct pageTableEntry {
        uint64_t execute_disable:1;
 };
 
-/* Register Dump
- *
- * Input Args:
- *   indent - Left margin indent amount
- *   regs - register
- *
- * Output Args:
- *   stream - Output FILE stream
- *
- * Return: None
- *
- * Dumps the state of the registers given by regs, to the FILE stream
- * given by steam.
- */
 void regs_dump(FILE *stream, struct kvm_regs *regs,
               uint8_t indent)
 {
@@ -115,19 +101,20 @@ void regs_dump(FILE *stream, struct kvm_regs *regs,
                regs->rip, regs->rflags);
 }
 
-/* Segment Dump
+/*
+ * Segment Dump
  *
  * Input Args:
- *   indent - Left margin indent amount
+ *   stream  - Output FILE stream
  *   segment - KVM segment
+ *   indent  - Left margin indent amount
  *
- * Output Args:
- *   stream - Output FILE stream
+ * Output Args: None
  *
  * Return: None
  *
- * Dumps the state of the KVM segment given by segment, to the FILE stream
- * given by steam.
+ * Dumps the state of the KVM segment given by @segment, to the FILE stream
+ * given by @stream.
  */
 static void segment_dump(FILE *stream, struct kvm_segment *segment,
                         uint8_t indent)
@@ -146,19 +133,20 @@ static void segment_dump(FILE *stream, struct kvm_segment *segment,
                segment->unusable, segment->padding);
 }
 
-/* dtable Dump
+/*
+ * dtable Dump
  *
  * Input Args:
- *   indent - Left margin indent amount
+ *   stream - Output FILE stream
  *   dtable - KVM dtable
+ *   indent - Left margin indent amount
  *
- * Output Args:
- *   stream - Output FILE stream
+ * Output Args: None
  *
  * Return: None
  *
- * Dumps the state of the KVM dtable given by dtable, to the FILE stream
- * given by steam.
+ * Dumps the state of the KVM dtable given by @dtable, to the FILE stream
+ * given by @stream.
  */
 static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,
                        uint8_t indent)
@@ -169,20 +157,6 @@ static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,
                dtable->padding[0], dtable->padding[1], dtable->padding[2]);
 }
 
-/* System Register Dump
- *
- * Input Args:
- *   indent - Left margin indent amount
- *   sregs - System registers
- *
- * Output Args:
- *   stream - Output FILE stream
- *
- * Return: None
- *
- * Dumps the state of the system registers given by sregs, to the FILE stream
- * given by steam.
- */
 void sregs_dump(FILE *stream, struct kvm_sregs *sregs,
                uint8_t indent)
 {
@@ -240,21 +214,6 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
        }
 }
 
-/* VM Virtual Page Map
- *
- * Input Args:
- *   vm - Virtual Machine
- *   vaddr - VM Virtual Address
- *   paddr - VM Physical Address
- *   pgd_memslot - Memory region slot for new virtual translation tables
- *
- * Output Args: None
- *
- * Return: None
- *
- * Within the VM given by vm, creates a virtual translation for the page
- * starting at vaddr to the page starting at paddr.
- */
 void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
        uint32_t pgd_memslot)
 {
@@ -326,20 +285,6 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
        pte[index[0]].present = 1;
 }
 
-/* Virtual Translation Tables Dump
- *
- * Input Args:
- *   vm - Virtual Machine
- *   indent - Left margin indent amount
- *
- * Output Args:
- *   stream - Output FILE stream
- *
- * Return: None
- *
- * Dumps to the FILE stream given by stream, the contents of all the
- * virtual translation tables for the VM given by vm.
- */
 void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 {
        struct pageMapL4Entry *pml4e, *pml4e_start;
@@ -421,7 +366,8 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
        }
 }
 
-/* Set Unusable Segment
+/*
+ * Set Unusable Segment
  *
  * Input Args: None
  *
@@ -430,7 +376,7 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
  *
  * Return: None
  *
- * Sets the segment register pointed to by segp to an unusable state.
+ * Sets the segment register pointed to by @segp to an unusable state.
  */
 static void kvm_seg_set_unusable(struct kvm_segment *segp)
 {
@@ -460,7 +406,8 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)
 }
 
 
-/* Set Long Mode Flat Kernel Code Segment
+/*
+ * Set Long Mode Flat Kernel Code Segment
  *
  * Input Args:
  *   vm - VM whose GDT is being filled, or NULL to only write segp
@@ -471,8 +418,8 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)
  *
  * Return: None
  *
- * Sets up the KVM segment pointed to by segp, to be a code segment
- * with the selector value given by selector.
+ * Sets up the KVM segment pointed to by @segp, to be a code segment
+ * with the selector value given by @selector.
  */
 static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector,
        struct kvm_segment *segp)
@@ -491,7 +438,8 @@ static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector,
                kvm_seg_fill_gdt_64bit(vm, segp);
 }
 
-/* Set Long Mode Flat Kernel Data Segment
+/*
+ * Set Long Mode Flat Kernel Data Segment
  *
  * Input Args:
  *   vm - VM whose GDT is being filled, or NULL to only write segp
@@ -502,8 +450,8 @@ static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector,
  *
  * Return: None
  *
- * Sets up the KVM segment pointed to by segp, to be a data segment
- * with the selector value given by selector.
+ * Sets up the KVM segment pointed to by @segp, to be a data segment
+ * with the selector value given by @selector.
  */
 static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
        struct kvm_segment *segp)
@@ -521,24 +469,6 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
                kvm_seg_fill_gdt_64bit(vm, segp);
 }
 
-/* Address Guest Virtual to Guest Physical
- *
- * Input Args:
- *   vm - Virtual Machine
- *   gpa - VM virtual address
- *
- * Output Args: None
- *
- * Return:
- *   Equivalent VM physical address
- *
- * Translates the VM virtual address given by gva to a VM physical
- * address and then locates the memory region containing the VM
- * physical address, within the VM given by vm.  When found, the host
- * virtual address providing the memory to the vm physical address is returned.
- * A TEST_ASSERT failure occurs if no region containing translated
- * VM virtual address exists.
- */
 vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 {
        uint16_t index[4];
@@ -576,8 +506,7 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
        return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu);
 
 unmapped_gva:
-       TEST_ASSERT(false, "No mapping for vm virtual address, "
-                   "gva: 0x%lx", gva);
+       TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
        exit(EXIT_FAILURE);
 }
 
@@ -634,18 +563,13 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m
                break;
 
        default:
-               TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode);
+               TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
        }
 
        sregs.cr3 = vm->pgd;
        vcpu_sregs_set(vm, vcpuid, &sregs);
 }
-/* Adds a vCPU with reasonable defaults (i.e., a stack)
- *
- * Input Args:
- *   vcpuid - The id of the VCPU to add to the VM.
- *   guest_code - The vCPU's entry point
- */
+
 void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
 {
        struct kvm_mp_state mp_state;
@@ -670,7 +594,8 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
        vcpu_set_mp_state(vm, vcpuid, &mp_state);
 }
 
-/* Allocate an instance of struct kvm_cpuid2
+/*
+ * Allocate an instance of struct kvm_cpuid2
  *
  * Input Args: None
  *
@@ -703,7 +628,8 @@ static struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
        return cpuid;
 }
 
-/* KVM Supported CPUID Get
+/*
+ * KVM Supported CPUID Get
  *
  * Input Args: None
  *
@@ -735,11 +661,12 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
        return cpuid;
 }
 
-/* Locate a cpuid entry.
+/*
+ * Locate a cpuid entry.
  *
  * Input Args:
- *   cpuid: The cpuid.
  *   function: The function of the cpuid entry to find.
+ *   index: The index of the cpuid entry.
  *
  * Output Args: None
  *
@@ -766,7 +693,8 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
        return entry;
 }
 
-/* VM VCPU CPUID Set
+/*
+ * VM VCPU CPUID Set
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -793,20 +721,6 @@ void vcpu_set_cpuid(struct kvm_vm *vm,
 
 }
 
-/* Create a VM with reasonable defaults
- *
- * Input Args:
- *   vcpuid - The id of the single VCPU to add to the VM.
- *   extra_mem_pages - The size of extra memories to add (this will
- *                     decide how much extra space we will need to
- *                     setup the page tables using mem slot 0)
- *   guest_code - The vCPU's entry point
- *
- * Output Args: None
- *
- * Return:
- *   Pointer to opaque structure that describes the created VM.
- */
 struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
                                 void *guest_code)
 {
@@ -837,7 +751,8 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
        return vm;
 }
 
-/* VCPU Get MSR
+/*
+ * VCPU Get MSR
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -869,7 +784,8 @@ uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index)
        return buffer.entry.data;
 }
 
-/* _VCPU Set MSR
+/*
+ * _VCPU Set MSR
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -902,7 +818,8 @@ int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
        return r;
 }
 
-/* VCPU Set MSR
+/*
+ * VCPU Set MSR
  *
  * Input Args:
  *   vm - Virtual Machine
@@ -926,22 +843,6 @@ void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
                "  rc: %i errno: %i", r, errno);
 }
 
-/* VM VCPU Args Set
- *
- * Input Args:
- *   vm - Virtual Machine
- *   vcpuid - VCPU ID
- *   num - number of arguments
- *   ... - arguments, each of type uint64_t
- *
- * Output Args: None
- *
- * Return: None
- *
- * Sets the first num function input arguments to the values
- * given as variable args.  Each of the variable args is expected to
- * be of type uint64_t.
- */
 void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
 {
        va_list ap;
@@ -976,22 +877,6 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
        va_end(ap);
 }
 
-/*
- * VM VCPU Dump
- *
- * Input Args:
- *   vm - Virtual Machine
- *   vcpuid - VCPU ID
- *   indent - Left margin indent amount
- *
- * Output Args:
- *   stream - Output FILE stream
- *
- * Return: None
- *
- * Dumps the current state of the VCPU specified by vcpuid, within the VM
- * given by vm, to the FILE stream given by stream.
- */
 void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
 {
        struct kvm_regs regs;
index 6e05a8f..c424010 100644 (file)
@@ -154,7 +154,7 @@ void nested_svm_check_supported(void)
                kvm_get_supported_cpuid_entry(0x80000001);
 
        if (!(entry->ecx & CPUID_SVM)) {
-               fprintf(stderr, "nested SVM not enabled, skipping test\n");
+               print_skip("nested SVM not enabled");
                exit(KSFT_SKIP);
        }
 }
index 7aaa99c..6f17f69 100644 (file)
@@ -191,7 +191,7 @@ bool load_vmcs(struct vmx_pages *vmx)
                if (evmcs_vmptrld(vmx->enlightened_vmcs_gpa,
                                  vmx->enlightened_vmcs))
                        return false;
-               current_evmcs->revision_id = vmcs_revision();
+               current_evmcs->revision_id = EVMCS_VERSION;
        }
 
        return true;
@@ -381,7 +381,7 @@ void nested_vmx_check_supported(void)
        struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);
 
        if (!(entry->ecx & CPUID_VMX)) {
-               fprintf(stderr, "nested VMX not enabled, skipping test\n");
+               print_skip("nested VMX not enabled");
                exit(KSFT_SKIP);
        }
 }
index 9edaa9a..9f49ead 100644 (file)
@@ -40,7 +40,7 @@ int main(int argc, char *argv[])
 
        maxsize = kvm_check_cap(KVM_CAP_S390_MEM_OP);
        if (!maxsize) {
-               fprintf(stderr, "CAP_S390_MEM_OP not supported -> skip test\n");
+               print_skip("CAP_S390_MEM_OP not supported");
                exit(KSFT_SKIP);
        }
        if (maxsize > sizeof(mem1))
index 1485bc6..b143db6 100644 (file)
@@ -20,29 +20,42 @@ struct kvm_s390_irq buf[VCPU_ID + LOCAL_IRQS];
 
 struct kvm_vm *vm;
 struct kvm_run *run;
-struct kvm_sync_regs *regs;
-static uint64_t regs_null[16];
-
-static uint64_t crs[16] = { 0x40000ULL,
-                           0x42000ULL,
-                           0, 0, 0, 0, 0,
-                           0x43000ULL,
-                           0, 0, 0, 0, 0,
-                           0x44000ULL,
-                           0, 0
-};
+struct kvm_sync_regs *sync_regs;
+static uint8_t regs_null[512];
 
 static void guest_code_initial(void)
 {
-       /* Round toward 0 */
-       uint32_t fpc = 0x11;
+       /* set several CRs to "safe" value */
+       unsigned long cr2_59 = 0x10;    /* enable guarded storage */
+       unsigned long cr8_63 = 0x1;     /* monitor mask = 1 */
+       unsigned long cr10 = 1;         /* PER START */
+       unsigned long cr11 = -1;        /* PER END */
+
 
        /* Dirty registers */
        asm volatile (
-               "       lctlg   0,15,%0\n"
-               "       sfpc    %1\n"
-               : : "Q" (crs), "d" (fpc));
-       GUEST_SYNC(0);
+               "       lghi    2,0x11\n"       /* Round toward 0 */
+               "       sfpc    2\n"            /* set fpc to !=0 */
+               "       lctlg   2,2,%0\n"
+               "       lctlg   8,8,%1\n"
+               "       lctlg   10,10,%2\n"
+               "       lctlg   11,11,%3\n"
+               /* now clobber some general purpose regs */
+               "       llihh   0,0xffff\n"
+               "       llihl   1,0x5555\n"
+               "       llilh   2,0xaaaa\n"
+               "       llill   3,0x0000\n"
+               /* now clobber a floating point reg */
+               "       lghi    4,0x1\n"
+               "       cdgbr   0,4\n"
+               /* now clobber an access reg */
+               "       sar     9,4\n"
+               /* We embed diag 501 here to control register content */
+               "       diag 0,0,0x501\n"
+               :
+               : "m" (cr2_59), "m" (cr8_63), "m" (cr10), "m" (cr11)
+               /* no clobber list as this should not return */
+               );
 }
 
 static void test_one_reg(uint64_t id, uint64_t value)
@@ -53,7 +66,7 @@ static void test_one_reg(uint64_t id, uint64_t value)
        reg.addr = (uintptr_t)&eval_reg;
        reg.id = id;
        vcpu_get_reg(vm, VCPU_ID, &reg);
-       TEST_ASSERT(eval_reg == value, "value == %s", value);
+       TEST_ASSERT(eval_reg == value, "value == 0x%lx", value);
 }
 
 static void assert_noirq(void)
@@ -87,6 +100,31 @@ static void assert_clear(void)
 
        vcpu_fpu_get(vm, VCPU_ID, &fpu);
        TEST_ASSERT(!memcmp(&fpu.fprs, regs_null, sizeof(fpu.fprs)), "fprs == 0");
+
+       /* sync regs */
+       TEST_ASSERT(!memcmp(sync_regs->gprs, regs_null, sizeof(sync_regs->gprs)),
+                   "gprs0-15 == 0 (sync_regs)");
+
+       TEST_ASSERT(!memcmp(sync_regs->acrs, regs_null, sizeof(sync_regs->acrs)),
+                   "acrs0-15 == 0 (sync_regs)");
+
+       TEST_ASSERT(!memcmp(sync_regs->vrs, regs_null, sizeof(sync_regs->vrs)),
+                   "vrs0-15 == 0 (sync_regs)");
+}
+
+static void assert_initial_noclear(void)
+{
+       TEST_ASSERT(sync_regs->gprs[0] == 0xffff000000000000UL,
+                   "gpr0 == 0xffff000000000000 (sync_regs)");
+       TEST_ASSERT(sync_regs->gprs[1] == 0x0000555500000000UL,
+                   "gpr1 == 0x0000555500000000 (sync_regs)");
+       TEST_ASSERT(sync_regs->gprs[2] == 0x00000000aaaa0000UL,
+                   "gpr2 == 0x00000000aaaa0000 (sync_regs)");
+       TEST_ASSERT(sync_regs->gprs[3] == 0x0000000000000000UL,
+                   "gpr3 == 0x0000000000000000 (sync_regs)");
+       TEST_ASSERT(sync_regs->fprs[0] == 0x3ff0000000000000UL,
+                   "fpr0 == 0f1 (sync_regs)");
+       TEST_ASSERT(sync_regs->acrs[9] == 1, "ar9 == 1 (sync_regs)");
 }
 
 static void assert_initial(void)
@@ -94,12 +132,32 @@ static void assert_initial(void)
        struct kvm_sregs sregs;
        struct kvm_fpu fpu;
 
+       /* KVM_GET_SREGS */
        vcpu_sregs_get(vm, VCPU_ID, &sregs);
-       TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0");
-       TEST_ASSERT(sregs.crs[14] == 0xC2000000UL, "cr14 == 0xC2000000");
+       TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0 (KVM_GET_SREGS)");
+       TEST_ASSERT(sregs.crs[14] == 0xC2000000UL,
+                   "cr14 == 0xC2000000 (KVM_GET_SREGS)");
        TEST_ASSERT(!memcmp(&sregs.crs[1], regs_null, sizeof(sregs.crs[1]) * 12),
-                   "cr1-13 == 0");
-       TEST_ASSERT(sregs.crs[15] == 0, "cr15 == 0");
+                   "cr1-13 == 0 (KVM_GET_SREGS)");
+       TEST_ASSERT(sregs.crs[15] == 0, "cr15 == 0 (KVM_GET_SREGS)");
+
+       /* sync regs */
+       TEST_ASSERT(sync_regs->crs[0] == 0xE0UL, "cr0 == 0xE0 (sync_regs)");
+       TEST_ASSERT(sync_regs->crs[14] == 0xC2000000UL,
+                   "cr14 == 0xC2000000 (sync_regs)");
+       TEST_ASSERT(!memcmp(&sync_regs->crs[1], regs_null, 8 * 12),
+                   "cr1-13 == 0 (sync_regs)");
+       TEST_ASSERT(sync_regs->crs[15] == 0, "cr15 == 0 (sync_regs)");
+       TEST_ASSERT(sync_regs->fpc == 0, "fpc == 0 (sync_regs)");
+       TEST_ASSERT(sync_regs->todpr == 0, "todpr == 0 (sync_regs)");
+       TEST_ASSERT(sync_regs->cputm == 0, "cputm == 0 (sync_regs)");
+       TEST_ASSERT(sync_regs->ckc == 0, "ckc == 0 (sync_regs)");
+       TEST_ASSERT(sync_regs->pp == 0, "pp == 0 (sync_regs)");
+       TEST_ASSERT(sync_regs->gbea == 1, "gbea == 1 (sync_regs)");
+
+       /* kvm_run */
+       TEST_ASSERT(run->psw_addr == 0, "psw_addr == 0 (kvm_run)");
+       TEST_ASSERT(run->psw_mask == 0, "psw_mask == 0 (kvm_run)");
 
        vcpu_fpu_get(vm, VCPU_ID, &fpu);
        TEST_ASSERT(!fpu.fpc, "fpc == 0");
@@ -111,9 +169,19 @@ static void assert_initial(void)
        test_one_reg(KVM_REG_S390_CLOCK_COMP, 0);
 }
 
+static void assert_normal_noclear(void)
+{
+       TEST_ASSERT(sync_regs->crs[2] == 0x10, "cr2 == 0x10 (sync_regs)");
+       TEST_ASSERT(sync_regs->crs[8] == 1, "cr8 == 1 (sync_regs)");
+       TEST_ASSERT(sync_regs->crs[10] == 1, "cr10 == 1 (sync_regs)");
+       TEST_ASSERT(sync_regs->crs[11] == -1, "cr11 == -1 (sync_regs)");
+}
+
 static void assert_normal(void)
 {
        test_one_reg(KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID);
+       TEST_ASSERT(sync_regs->pft == KVM_S390_PFAULT_TOKEN_INVALID,
+                       "pft == 0xff.....  (sync_regs)");
        assert_noirq();
 }
 
@@ -134,53 +202,67 @@ static void inject_irq(int cpu_id)
 
 static void test_normal(void)
 {
-       printf("Testing normal reset\n");
+       pr_info("Testing normal reset\n");
        /* Create VM */
        vm = vm_create_default(VCPU_ID, 0, guest_code_initial);
        run = vcpu_state(vm, VCPU_ID);
-       regs = &run->s.regs;
+       sync_regs = &run->s.regs;
 
        vcpu_run(vm, VCPU_ID);
 
        inject_irq(VCPU_ID);
 
        vcpu_ioctl(vm, VCPU_ID, KVM_S390_NORMAL_RESET, 0);
+
+       /* must clears */
        assert_normal();
+       /* must not clears */
+       assert_normal_noclear();
+       assert_initial_noclear();
+
        kvm_vm_free(vm);
 }
 
 static void test_initial(void)
 {
-       printf("Testing initial reset\n");
+       pr_info("Testing initial reset\n");
        vm = vm_create_default(VCPU_ID, 0, guest_code_initial);
        run = vcpu_state(vm, VCPU_ID);
-       regs = &run->s.regs;
+       sync_regs = &run->s.regs;
 
        vcpu_run(vm, VCPU_ID);
 
        inject_irq(VCPU_ID);
 
        vcpu_ioctl(vm, VCPU_ID, KVM_S390_INITIAL_RESET, 0);
+
+       /* must clears */
        assert_normal();
        assert_initial();
+       /* must not clears */
+       assert_initial_noclear();
+
        kvm_vm_free(vm);
 }
 
 static void test_clear(void)
 {
-       printf("Testing clear reset\n");
+       pr_info("Testing clear reset\n");
        vm = vm_create_default(VCPU_ID, 0, guest_code_initial);
        run = vcpu_state(vm, VCPU_ID);
-       regs = &run->s.regs;
+       sync_regs = &run->s.regs;
 
        vcpu_run(vm, VCPU_ID);
 
        inject_irq(VCPU_ID);
 
        vcpu_ioctl(vm, VCPU_ID, KVM_S390_CLEAR_RESET, 0);
+
+       /* must clears */
        assert_normal();
        assert_initial();
        assert_clear();
+
        kvm_vm_free(vm);
 }
 
index b705637..5731ccf 100644 (file)
@@ -42,6 +42,13 @@ static void guest_code(void)
                    " values did not match: 0x%llx, 0x%llx\n", \
                    left->reg, right->reg)
 
+#define REG_COMPARE32(reg) \
+       TEST_ASSERT(left->reg == right->reg, \
+                   "Register " #reg \
+                   " values did not match: 0x%x, 0x%x\n", \
+                   left->reg, right->reg)
+
+
 static void compare_regs(struct kvm_regs *left, struct kvm_sync_regs *right)
 {
        int i;
@@ -55,7 +62,7 @@ static void compare_sregs(struct kvm_sregs *left, struct kvm_sync_regs *right)
        int i;
 
        for (i = 0; i < 16; i++)
-               REG_COMPARE(acrs[i]);
+               REG_COMPARE32(acrs[i]);
 
        for (i = 0; i < 16; i++)
                REG_COMPARE(crs[i]);
@@ -79,7 +86,7 @@ int main(int argc, char *argv[])
 
        cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
        if (!cap) {
-               fprintf(stderr, "CAP_SYNC_REGS not supported, skipping test\n");
+               print_skip("CAP_SYNC_REGS not supported");
                exit(KSFT_SKIP);
        }
 
@@ -155,7 +162,7 @@ int main(int argc, char *argv[])
                    "r11 sync regs value incorrect 0x%llx.",
                    run->s.regs.gprs[11]);
        TEST_ASSERT(run->s.regs.acrs[0]  == 1 << 11,
-                   "acr0 sync regs value incorrect 0x%llx.",
+                   "acr0 sync regs value incorrect 0x%x.",
                    run->s.regs.acrs[0]);
 
        vcpu_regs_get(vm, VCPU_ID, &regs);
diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c
new file mode 100644 (file)
index 0000000..fcc8400
--- /dev/null
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * steal/stolen time test
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+#include <linux/kernel.h>
+#include <sys/syscall.h>
+#include <asm/kvm.h>
+#include <asm/kvm_para.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define NR_VCPUS               4
+#define ST_GPA_BASE            (1 << 30)
+#define MIN_RUN_DELAY_NS       200000UL
+
+static void *st_gva[NR_VCPUS];
+static uint64_t guest_stolen_time[NR_VCPUS];
+
+#if defined(__x86_64__)
+
+/* steal_time must have 64-byte alignment */
+#define STEAL_TIME_SIZE                ((sizeof(struct kvm_steal_time) + 63) & ~63)
+
+static void check_status(struct kvm_steal_time *st)
+{
+       GUEST_ASSERT(!(READ_ONCE(st->version) & 1));
+       GUEST_ASSERT(READ_ONCE(st->flags) == 0);
+       GUEST_ASSERT(READ_ONCE(st->preempted) == 0);
+}
+
+static void guest_code(int cpu)
+{
+       struct kvm_steal_time *st = st_gva[cpu];
+       uint32_t version;
+
+       GUEST_ASSERT(rdmsr(MSR_KVM_STEAL_TIME) == ((uint64_t)st_gva[cpu] | KVM_MSR_ENABLED));
+
+       memset(st, 0, sizeof(*st));
+       GUEST_SYNC(0);
+
+       check_status(st);
+       WRITE_ONCE(guest_stolen_time[cpu], st->steal);
+       version = READ_ONCE(st->version);
+       check_status(st);
+       GUEST_SYNC(1);
+
+       check_status(st);
+       GUEST_ASSERT(version < READ_ONCE(st->version));
+       WRITE_ONCE(guest_stolen_time[cpu], st->steal);
+       check_status(st);
+       GUEST_DONE();
+}
+
+static void steal_time_init(struct kvm_vm *vm)
+{
+       int i;
+
+       if (!(kvm_get_supported_cpuid_entry(KVM_CPUID_FEATURES)->eax &
+             KVM_FEATURE_STEAL_TIME)) {
+               print_skip("steal-time not supported");
+               exit(KSFT_SKIP);
+       }
+
+       for (i = 0; i < NR_VCPUS; ++i) {
+               int ret;
+
+               vcpu_set_cpuid(vm, i, kvm_get_supported_cpuid());
+
+               /* ST_GPA_BASE is identity mapped */
+               st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE);
+               sync_global_to_guest(vm, st_gva[i]);
+
+               ret = _vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_STEAL_RESERVED_MASK);
+               TEST_ASSERT(ret == 0, "Bad GPA didn't fail");
+
+               vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED);
+       }
+}
+
+static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpuid]);
+       int i;
+
+       pr_info("VCPU%d:\n", vcpuid);
+       pr_info("    steal:     %lld\n", st->steal);
+       pr_info("    version:   %d\n", st->version);
+       pr_info("    flags:     %d\n", st->flags);
+       pr_info("    preempted: %d\n", st->preempted);
+       pr_info("    u8_pad:    ");
+       for (i = 0; i < 3; ++i)
+               pr_info("%d", st->u8_pad[i]);
+       pr_info("\n    pad:       ");
+       for (i = 0; i < 11; ++i)
+               pr_info("%d", st->pad[i]);
+       pr_info("\n");
+}
+
+#elif defined(__aarch64__)
+
+/* PV_TIME_ST must have 64-byte alignment */
+#define STEAL_TIME_SIZE                ((sizeof(struct st_time) + 63) & ~63)
+
+#define SMCCC_ARCH_FEATURES    0x80000001
+#define PV_TIME_FEATURES       0xc5000020
+#define PV_TIME_ST             0xc5000021
+
+struct st_time {
+       uint32_t rev;
+       uint32_t attr;
+       uint64_t st_time;
+};
+
+static int64_t smccc(uint32_t func, uint32_t arg)
+{
+       unsigned long ret;
+
+       asm volatile(
+               "mov    x0, %1\n"
+               "mov    x1, %2\n"
+               "hvc    #0\n"
+               "mov    %0, x0\n"
+       : "=r" (ret) : "r" (func), "r" (arg) :
+         "x0", "x1", "x2", "x3");
+
+       return ret;
+}
+
+static void check_status(struct st_time *st)
+{
+       GUEST_ASSERT(READ_ONCE(st->rev) == 0);
+       GUEST_ASSERT(READ_ONCE(st->attr) == 0);
+}
+
+static void guest_code(int cpu)
+{
+       struct st_time *st;
+       int64_t status;
+
+       status = smccc(SMCCC_ARCH_FEATURES, PV_TIME_FEATURES);
+       GUEST_ASSERT(status == 0);
+       status = smccc(PV_TIME_FEATURES, PV_TIME_FEATURES);
+       GUEST_ASSERT(status == 0);
+       status = smccc(PV_TIME_FEATURES, PV_TIME_ST);
+       GUEST_ASSERT(status == 0);
+
+       status = smccc(PV_TIME_ST, 0);
+       GUEST_ASSERT(status != -1);
+       GUEST_ASSERT(status == (ulong)st_gva[cpu]);
+
+       st = (struct st_time *)status;
+       GUEST_SYNC(0);
+
+       check_status(st);
+       WRITE_ONCE(guest_stolen_time[cpu], st->st_time);
+       GUEST_SYNC(1);
+
+       check_status(st);
+       WRITE_ONCE(guest_stolen_time[cpu], st->st_time);
+       GUEST_DONE();
+}
+
+static void steal_time_init(struct kvm_vm *vm)
+{
+       struct kvm_device_attr dev = {
+               .group = KVM_ARM_VCPU_PVTIME_CTRL,
+               .attr = KVM_ARM_VCPU_PVTIME_IPA,
+       };
+       int i, ret;
+
+       ret = _vcpu_ioctl(vm, 0, KVM_HAS_DEVICE_ATTR, &dev);
+       if (ret != 0 && errno == ENXIO) {
+               print_skip("steal-time not supported");
+               exit(KSFT_SKIP);
+       }
+
+       for (i = 0; i < NR_VCPUS; ++i) {
+               uint64_t st_ipa;
+
+               vcpu_ioctl(vm, i, KVM_HAS_DEVICE_ATTR, &dev);
+
+               dev.addr = (uint64_t)&st_ipa;
+
+               /* ST_GPA_BASE is identity mapped */
+               st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE);
+               sync_global_to_guest(vm, st_gva[i]);
+
+               st_ipa = (ulong)st_gva[i] | 1;
+               ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev);
+               TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA didn't report EINVAL");
+
+               st_ipa = (ulong)st_gva[i];
+               vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev);
+
+               ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev);
+               TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST");
+
+       }
+}
+
+static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpuid]);
+
+       pr_info("VCPU%d:\n", vcpuid);
+       pr_info("    rev:     %d\n", st->rev);
+       pr_info("    attr:    %d\n", st->attr);
+       pr_info("    st_time: %ld\n", st->st_time);
+}
+
+#endif
+
+static long get_run_delay(void)
+{
+       char path[64];
+       long val[2];
+       FILE *fp;
+
+       sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid));
+       fp = fopen(path, "r");
+       fscanf(fp, "%ld %ld ", &val[0], &val[1]);
+       fclose(fp);
+
+       return val[1];
+}
+
+static void *do_steal_time(void *arg)
+{
+       struct timespec ts, stop;
+
+       clock_gettime(CLOCK_MONOTONIC, &ts);
+       stop = timespec_add_ns(ts, MIN_RUN_DELAY_NS);
+
+       while (1) {
+               clock_gettime(CLOCK_MONOTONIC, &ts);
+               if (timespec_to_ns(timespec_sub(ts, stop)) >= 0)
+                       break;
+       }
+
+       return NULL;
+}
+
+static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       struct ucall uc;
+
+       vcpu_args_set(vm, vcpuid, 1, vcpuid);
+
+       vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL);
+
+       switch (get_ucall(vm, vcpuid, &uc)) {
+       case UCALL_SYNC:
+       case UCALL_DONE:
+               break;
+       case UCALL_ABORT:
+               TEST_ASSERT(false, "%s at %s:%ld", (const char *)uc.args[0],
+                           __FILE__, uc.args[1]);
+       default:
+               TEST_ASSERT(false, "Unexpected exit: %s",
+                           exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
+       }
+}
+
+int main(int ac, char **av)
+{
+       struct kvm_vm *vm;
+       pthread_attr_t attr;
+       pthread_t thread;
+       cpu_set_t cpuset;
+       unsigned int gpages;
+       long stolen_time;
+       long run_delay;
+       bool verbose;
+       int i;
+
+       verbose = ac > 1 && (!strncmp(av[1], "-v", 3) || !strncmp(av[1], "--verbose", 10));
+
+       /* Set CPU affinity so we can force preemption of the VCPU */
+       CPU_ZERO(&cpuset);
+       CPU_SET(0, &cpuset);
+       pthread_attr_init(&attr);
+       pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset);
+       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+
+       /* Create a single-VCPU guest and an identity-mapped memslot for the steal-time structures */
+       vm = vm_create_default(0, 0, guest_code);
+       gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE * NR_VCPUS);
+       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0);
+       virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages, 0);
+       ucall_init(vm, NULL);
+
+       /* Add the rest of the VCPUs */
+       for (i = 1; i < NR_VCPUS; ++i)
+               vm_vcpu_add_default(vm, i, guest_code);
+
+       steal_time_init(vm);
+
+       /* Run test on each VCPU */
+       for (i = 0; i < NR_VCPUS; ++i) {
+               /* First VCPU run initializes steal-time */
+               run_vcpu(vm, i);
+
+               /* Second VCPU run, expect guest stolen time to be <= run_delay */
+               run_vcpu(vm, i);
+               sync_global_from_guest(vm, guest_stolen_time[i]);
+               stolen_time = guest_stolen_time[i];
+               run_delay = get_run_delay();
+               TEST_ASSERT(stolen_time <= run_delay,
+                           "Expected stolen time <= %ld, got %ld",
+                           run_delay, stolen_time);
+
+               /* Steal time from the VCPU. The steal time thread has the same CPU affinity as the VCPUs. */
+               run_delay = get_run_delay();
+               pthread_create(&thread, &attr, do_steal_time, NULL);
+               do
+                       pthread_yield();
+               while (get_run_delay() - run_delay < MIN_RUN_DELAY_NS);
+               pthread_join(thread, NULL);
+               run_delay = get_run_delay() - run_delay;
+               TEST_ASSERT(run_delay >= MIN_RUN_DELAY_NS,
+                           "Expected run_delay >= %ld, got %ld",
+                           MIN_RUN_DELAY_NS, run_delay);
+
+               /* Run VCPU again to confirm stolen time is consistent with run_delay */
+               run_vcpu(vm, i);
+               sync_global_from_guest(vm, guest_stolen_time[i]);
+               stolen_time = guest_stolen_time[i] - stolen_time;
+               TEST_ASSERT(stolen_time >= run_delay,
+                           "Expected stolen time >= %ld, got %ld",
+                           run_delay, stolen_time);
+
+               if (verbose) {
+                       pr_info("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld", i,
+                               guest_stolen_time[i], stolen_time);
+                       if (stolen_time == run_delay)
+                               pr_info(" (BONUS: guest test-stolen-time even exactly matches test-run_delay)");
+                       pr_info("\n");
+                       steal_time_dump(vm, i);
+               }
+       }
+
+       return 0;
+}
index 63cc9c3..140e919 100644 (file)
@@ -72,7 +72,7 @@ int main(int argc, char *argv[])
 
        entry = kvm_get_supported_cpuid_entry(1);
        if (!(entry->ecx & X86_FEATURE_XSAVE)) {
-               printf("XSAVE feature not supported, skipping test\n");
+               print_skip("XSAVE feature not supported");
                return 0;
        }
 
@@ -101,12 +101,12 @@ int main(int argc, char *argv[])
                        vcpu_sregs_set(vm, VCPU_ID, &sregs);
                        break;
                case UCALL_ABORT:
-                       TEST_ASSERT(false, "Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit.");
+                       TEST_FAIL("Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit.");
                        break;
                case UCALL_DONE:
                        goto done;
                default:
-                       TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
+                       TEST_FAIL("Unknown ucall %lu", uc.cmd);
                }
        }
 
index 92915e6..e6e62e5 100644 (file)
 
 void l2_guest_code(void)
 {
-       GUEST_SYNC(6);
-
        GUEST_SYNC(7);
 
+       GUEST_SYNC(8);
+
        /* Done, exit to L1 and never come back.  */
        vmcall();
 }
@@ -50,12 +50,17 @@ void l1_guest_code(struct vmx_pages *vmx_pages)
 
        GUEST_SYNC(5);
        GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+       current_evmcs->revision_id = -1u;
+       GUEST_ASSERT(vmlaunch());
+       current_evmcs->revision_id = EVMCS_VERSION;
+       GUEST_SYNC(6);
+
        GUEST_ASSERT(!vmlaunch());
        GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
-       GUEST_SYNC(8);
+       GUEST_SYNC(9);
        GUEST_ASSERT(!vmresume());
        GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
-       GUEST_SYNC(9);
+       GUEST_SYNC(10);
 }
 
 void guest_code(struct vmx_pages *vmx_pages)
@@ -67,6 +72,10 @@ void guest_code(struct vmx_pages *vmx_pages)
                l1_guest_code(vmx_pages);
 
        GUEST_DONE();
+
+       /* Try enlightened vmptrld with an incorrect GPA */
+       evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs);
+       GUEST_ASSERT(vmlaunch());
 }
 
 int main(int argc, char *argv[])
@@ -87,7 +96,7 @@ int main(int argc, char *argv[])
 
        if (!kvm_check_cap(KVM_CAP_NESTED_STATE) ||
            !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) {
-               printf("capabilities not available, skipping test\n");
+               print_skip("capabilities not available");
                exit(KSFT_SKIP);
        }
 
@@ -109,20 +118,20 @@ int main(int argc, char *argv[])
 
                switch (get_ucall(vm, VCPU_ID, &uc)) {
                case UCALL_ABORT:
-                       TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0],
-                                   __FILE__, uc.args[1]);
+                       TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+                                 __FILE__, uc.args[1]);
                        /* NOT REACHED */
                case UCALL_SYNC:
                        break;
                case UCALL_DONE:
-                       goto done;
+                       goto part1_done;
                default:
-                       TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
+                       TEST_FAIL("Unknown ucall %lu", uc.cmd);
                }
 
                /* UCALL_SYNC is handled here.  */
                TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
-                           uc.args[1] == stage, "Unexpected register values vmexit #%lx, got %lx",
+                           uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
                            stage, (ulong)uc.args[1]);
 
                state = vcpu_save_state(vm, VCPU_ID);
@@ -147,6 +156,10 @@ int main(int argc, char *argv[])
                            (ulong) regs2.rdi, (ulong) regs2.rsi);
        }
 
-done:
+part1_done:
+       _vcpu_run(vm, VCPU_ID);
+       TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
+                   "Unexpected successful VMEnter with invalid eVMCS pointer!");
+
        kvm_vm_free(vm);
 }
index 443a2b5..83323f3 100644 (file)
@@ -66,7 +66,7 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
 
                TEST_ASSERT((entry->function >= 0x40000000) &&
                            (entry->function <= 0x4000000A),
-                           "function %lx is our of supported range",
+                           "function %x is our of supported range",
                            entry->function);
 
                TEST_ASSERT(entry->index == 0,
@@ -141,8 +141,7 @@ int main(int argc, char *argv[])
 
        rv = kvm_check_cap(KVM_CAP_HYPERV_CPUID);
        if (!rv) {
-               fprintf(stderr,
-                       "KVM_CAP_HYPERV_CPUID not supported, skip test\n");
+               print_skip("KVM_CAP_HYPERV_CPUID not supported");
                exit(KSFT_SKIP);
        }
 
@@ -160,8 +159,7 @@ int main(int argc, char *argv[])
        free(hv_cpuid_entries);
 
        if (!kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) {
-               fprintf(stderr,
-                       "Enlightened VMCS is unsupported, skip related test\n");
+               print_skip("Enlightened VMCS is unsupported");
                goto vm_free;
        }
 
index 00bb97d..e6480fd 100644 (file)
@@ -44,7 +44,7 @@ void *thr(void *arg)
        struct kvm_run *run = tc->run;
 
        res = ioctl(kvmcpu, KVM_RUN, 0);
-       printf("ret1=%d exit_reason=%d suberror=%d\n",
+       pr_info("ret1=%d exit_reason=%d suberror=%d\n",
                res, run->exit_reason, run->internal.suberror);
 
        return 0;
@@ -93,12 +93,12 @@ int main(void)
        int warnings_before, warnings_after;
 
        if (!is_intel_cpu()) {
-               printf("Must be run on an Intel CPU, skipping test\n");
+               print_skip("Must be run on an Intel CPU");
                exit(KSFT_SKIP);
        }
 
        if (vm_is_unrestricted_guest(NULL)) {
-               printf("Unrestricted guest must be disabled, skipping test\n");
+               print_skip("Unrestricted guest must be disabled");
                exit(KSFT_SKIP);
        }
 
index f9334bd..1e89688 100644 (file)
@@ -58,8 +58,7 @@ static void test_msr_platform_info_enabled(struct kvm_vm *vm)
                        exit_reason_str(run->exit_reason));
        get_ucall(vm, VCPU_ID, &uc);
        TEST_ASSERT(uc.cmd == UCALL_SYNC,
-                       "Received ucall other than UCALL_SYNC: %u\n",
-                       ucall);
+                       "Received ucall other than UCALL_SYNC: %lu\n", uc.cmd);
        TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) ==
                MSR_PLATFORM_INFO_MAX_TURBO_RATIO,
                "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.",
@@ -89,8 +88,7 @@ int main(int argc, char *argv[])
 
        rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO);
        if (!rv) {
-               fprintf(stderr,
-                       "KVM_CAP_MSR_PLATFORM_INFO not supported, skip test\n");
+               print_skip("KVM_CAP_MSR_PLATFORM_INFO not supported");
                exit(KSFT_SKIP);
        }
 
diff --git a/tools/testing/selftests/kvm/x86_64/set_memory_region_test.c b/tools/testing/selftests/kvm/x86_64/set_memory_region_test.c
new file mode 100644 (file)
index 0000000..c6691cf
--- /dev/null
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include <linux/compiler.h>
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+#define VCPU_ID 0
+
+/*
+ * Somewhat arbitrary location and slot, intended to not overlap anything.  The
+ * location and size are specifically 2mb sized/aligned so that the initial
+ * region corresponds to exactly one large page.
+ */
+#define MEM_REGION_GPA         0xc0000000
+#define MEM_REGION_SIZE                0x200000
+#define MEM_REGION_SLOT                10
+
+static void guest_code(void)
+{
+       uint64_t val;
+
+       do {
+               val = READ_ONCE(*((uint64_t *)MEM_REGION_GPA));
+       } while (!val);
+
+       if (val != 1)
+               ucall(UCALL_ABORT, 1, val);
+
+       GUEST_DONE();
+}
+
+static void *vcpu_worker(void *data)
+{
+       struct kvm_vm *vm = data;
+       struct kvm_run *run;
+       struct ucall uc;
+       uint64_t cmd;
+
+       /*
+        * Loop until the guest is done.  Re-enter the guest on all MMIO exits,
+        * which will occur if the guest attempts to access a memslot while it
+        * is being moved.
+        */
+       run = vcpu_state(vm, VCPU_ID);
+       do {
+               vcpu_run(vm, VCPU_ID);
+       } while (run->exit_reason == KVM_EXIT_MMIO);
+
+       TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+                   "Unexpected exit reason = %d", run->exit_reason);
+
+       cmd = get_ucall(vm, VCPU_ID, &uc);
+       TEST_ASSERT(cmd == UCALL_DONE, "Unexpected val in guest = %lu", uc.args[0]);
+       return NULL;
+}
+
+static void test_move_memory_region(void)
+{
+       pthread_t vcpu_thread;
+       struct kvm_vm *vm;
+       uint64_t *hva;
+       uint64_t gpa;
+
+       vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+       vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP,
+                                   MEM_REGION_GPA, MEM_REGION_SLOT,
+                                   MEM_REGION_SIZE / getpagesize(), 0);
+
+       /*
+        * Allocate and map two pages so that the GPA accessed by guest_code()
+        * stays valid across the memslot move.
+        */
+       gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT);
+       TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n");
+
+       virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0);
+
+       /* Ditto for the host mapping so that both pages can be zeroed. */
+       hva = addr_gpa2hva(vm, MEM_REGION_GPA);
+       memset(hva, 0, 2 * 4096);
+
+       pthread_create(&vcpu_thread, NULL, vcpu_worker, vm);
+
+       /* Ensure the guest thread is spun up. */
+       usleep(100000);
+
+       /*
+        * Shift the region's base GPA.  The guest should not see "2" as the
+        * hva->gpa translation is misaligned, i.e. the guest is accessing a
+        * different host pfn.
+        */
+       vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA - 4096);
+       WRITE_ONCE(*hva, 2);
+
+       usleep(100000);
+
+       /*
+        * Note, value in memory needs to be changed *before* restoring the
+        * memslot, else the guest could race the update and see "2".
+        */
+       WRITE_ONCE(*hva, 1);
+
+       /* Restore the original base, the guest should see "1". */
+       vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA);
+
+       pthread_join(vcpu_thread, NULL);
+
+       kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+       int i, loops;
+
+       /* Tell stdout not to buffer its content */
+       setbuf(stdout, NULL);
+
+       if (argc > 1)
+               loops = atoi(argv[1]);
+       else
+               loops = 10;
+
+       for (i = 0; i < loops; i++)
+               test_move_memory_region();
+
+       return 0;
+}
index 8c06364..8230b6b 100644 (file)
@@ -117,7 +117,7 @@ int main(int argc, char *argv[])
                vcpu_alloc_vmx(vm, &vmx_pages_gva);
                vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
        } else {
-               printf("will skip SMM test with VMX enabled\n");
+               pr_info("will skip SMM test with VMX enabled\n");
                vcpu_args_set(vm, VCPU_ID, 1, 0);
        }
 
index 3ab5ec3..5b1a016 100644 (file)
@@ -139,7 +139,7 @@ int main(int argc, char *argv[])
                vcpu_alloc_vmx(vm, &vmx_pages_gva);
                vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
        } else {
-               printf("will skip nested state checks\n");
+               pr_info("will skip nested state checks\n");
                vcpu_args_set(vm, VCPU_ID, 1, 0);
        }
 
@@ -152,20 +152,20 @@ int main(int argc, char *argv[])
 
                switch (get_ucall(vm, VCPU_ID, &uc)) {
                case UCALL_ABORT:
-                       TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0],
-                                   __FILE__, uc.args[1]);
+                       TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+                                 __FILE__, uc.args[1]);
                        /* NOT REACHED */
                case UCALL_SYNC:
                        break;
                case UCALL_DONE:
                        goto done;
                default:
-                       TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
+                       TEST_FAIL("Unknown ucall %lu", uc.cmd);
                }
 
                /* UCALL_SYNC is handled here.  */
                TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
-                           uc.args[1] == stage, "Unexpected register values vmexit #%lx, got %lx",
+                           uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
                            stage, (ulong)uc.args[1]);
 
                state = vcpu_save_state(vm, VCPU_ID);
index e280f68..0e1adb4 100644 (file)
@@ -61,16 +61,14 @@ int main(int argc, char *argv[])
 
                switch (get_ucall(vm, VCPU_ID, &uc)) {
                case UCALL_ABORT:
-                       TEST_ASSERT(false, "%s",
-                                   (const char *)uc.args[0]);
+                       TEST_FAIL("%s", (const char *)uc.args[0]);
                        /* NOT REACHED */
                case UCALL_SYNC:
                        break;
                case UCALL_DONE:
                        goto done;
                default:
-                       TEST_ASSERT(false,
-                                   "Unknown ucall 0x%x.", uc.cmd);
+                       TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
                }
        }
 done:
index 5c82242..d672f0a 100644 (file)
@@ -91,11 +91,11 @@ int main(int argc, char *argv[])
 
        cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
        if ((cap & TEST_SYNC_FIELDS) != TEST_SYNC_FIELDS) {
-               fprintf(stderr, "KVM_CAP_SYNC_REGS not supported, skipping test\n");
+               print_skip("KVM_CAP_SYNC_REGS not supported");
                exit(KSFT_SKIP);
        }
        if ((cap & INVALID_SYNC_FIELD) != 0) {
-               fprintf(stderr, "The \"invalid\" field is not invalid, skipping test\n");
+               print_skip("The \"invalid\" field is not invalid");
                exit(KSFT_SKIP);
        }
 
index 5dfb535..fe40ade 100644 (file)
@@ -78,10 +78,10 @@ int main(int argc, char *argv[])
 
                switch (get_ucall(vm, VCPU_ID, &uc)) {
                case UCALL_ABORT:
-                       TEST_ASSERT(false, "%s", (const char *)uc.args[0]);
+                       TEST_FAIL("%s", (const char *)uc.args[0]);
                        /* NOT REACHED */
                default:
-                       TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
+                       TEST_FAIL("Unknown ucall %lu", uc.cmd);
                }
        }
 }
index a223a64..e894a63 100644 (file)
@@ -21,7 +21,7 @@
 
 /* The memory slot index to track dirty pages */
 #define TEST_MEM_SLOT_INDEX            1
-#define TEST_MEM_SIZE                  3
+#define TEST_MEM_PAGES                 3
 
 /* L1 guest test virtual memory offset */
 #define GUEST_TEST_MEM                 0xc0000000
@@ -91,15 +91,14 @@ int main(int argc, char *argv[])
        vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
                                    GUEST_TEST_MEM,
                                    TEST_MEM_SLOT_INDEX,
-                                   TEST_MEM_SIZE,
+                                   TEST_MEM_PAGES,
                                    KVM_MEM_LOG_DIRTY_PAGES);
 
        /*
         * Add an identity map for GVA range [0xc0000000, 0xc0003000).  This
         * affects both L1 and L2.  However...
         */
-       virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM,
-                TEST_MEM_SIZE * 4096, 0);
+       virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES, 0);
 
        /*
         * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to
@@ -113,11 +112,11 @@ int main(int argc, char *argv[])
        nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096, 0);
        nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096, 0);
 
-       bmap = bitmap_alloc(TEST_MEM_SIZE);
+       bmap = bitmap_alloc(TEST_MEM_PAGES);
        host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM);
 
        while (!done) {
-               memset(host_test_mem, 0xaa, TEST_MEM_SIZE * 4096);
+               memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096);
                _vcpu_run(vm, VCPU_ID);
                TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
                            "Unexpected exit reason: %u (%s),\n",
@@ -126,8 +125,8 @@ int main(int argc, char *argv[])
 
                switch (get_ucall(vm, VCPU_ID, &uc)) {
                case UCALL_ABORT:
-                       TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0],
-                                   __FILE__, uc.args[1]);
+                       TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+                                 __FILE__, uc.args[1]);
                        /* NOT REACHED */
                case UCALL_SYNC:
                        /*
@@ -152,7 +151,7 @@ int main(int argc, char *argv[])
                        done = true;
                        break;
                default:
-                       TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
+                       TEST_FAIL("Unknown ucall %lu", uc.cmd);
                }
        }
 }
index 9ef7fab..54cdefd 100644 (file)
@@ -212,7 +212,7 @@ void test_vmx_nested_state(struct kvm_vm *vm)
        test_nested_state(vm, state);
        vcpu_nested_state_get(vm, VCPU_ID, state);
        TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz,
-                   "Size must be between %d and %d.  The size returned was %d.",
+                   "Size must be between %ld and %d.  The size returned was %d.",
                    sizeof(*state), state_sz, state->size);
        TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull.");
        TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull.");
@@ -228,7 +228,7 @@ int main(int argc, char *argv[])
        have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS);
 
        if (!kvm_check_cap(KVM_CAP_NESTED_STATE)) {
-               printf("KVM_CAP_NESTED_STATE not available, skipping test\n");
+               print_skip("KVM_CAP_NESTED_STATE not available");
                exit(KSFT_SKIP);
        }
 
index 69e482a..fbe8417 100644 (file)
@@ -121,8 +121,8 @@ static void l1_guest_code(struct vmx_pages *vmx_pages)
 
 static void report(int64_t val)
 {
-       printf("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n",
-              val, val / TSC_ADJUST_VALUE, val % TSC_ADJUST_VALUE);
+       pr_info("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n",
+               val, val / TSC_ADJUST_VALUE, val % TSC_ADJUST_VALUE);
 }
 
 int main(int argc, char *argv[])
@@ -150,7 +150,7 @@ int main(int argc, char *argv[])
 
                switch (get_ucall(vm, VCPU_ID, &uc)) {
                case UCALL_ABORT:
-                       TEST_ASSERT(false, "%s", (const char *)uc.args[0]);
+                       TEST_FAIL("%s", (const char *)uc.args[0]);
                        /* NOT REACHED */
                case UCALL_SYNC:
                        report(uc.args[1]);
@@ -158,7 +158,7 @@ int main(int argc, char *argv[])
                case UCALL_DONE:
                        goto done;
                default:
-                       TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
+                       TEST_FAIL("Unknown ucall %lu", uc.cmd);
                }
        }
 
index 851ea81..3529376 100644 (file)
@@ -51,7 +51,7 @@ int main(int argc, char *argv[])
                xss_supported = entry && !!(entry->eax & X86_FEATURE_XSAVES);
        }
        if (!xss_supported) {
-               printf("IA32_XSS is not supported by the vCPU.\n");
+               print_skip("IA32_XSS is not supported by the vCPU");
                exit(KSFT_SKIP);
        }
 
index 4d864f8..376c6a7 100644 (file)
@@ -1189,55 +1189,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        return r;
 }
 
-/**
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
- * @kvm: kvm instance
- * @log: slot id and address to which we copy the log
- *
- * Steps 1-4 below provide general overview of dirty page logging. See
- * kvm_get_dirty_log_protect() function description for additional details.
- *
- * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
- * always flush the TLB (step 4) even if previous step failed  and the dirty
- * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
- * does not preclude user space subsequent dirty log read. Flushing TLB ensures
- * writes will be marked dirty for next log read.
- *
- *   1. Take a snapshot of the bit and clear it if needed.
- *   2. Write protect the corresponding page.
- *   3. Copy the snapshot to the userspace.
- *   4. Flush TLB's if needed.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
-       bool flush = false;
-       int r;
-
-       mutex_lock(&kvm->slots_lock);
-
-       r = kvm_get_dirty_log_protect(kvm, log, &flush);
 
-       if (flush)
-               kvm_flush_remote_tlbs(kvm);
-
-       mutex_unlock(&kvm->slots_lock);
-       return r;
 }
 
-int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
+void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
+                                       struct kvm_memory_slot *memslot)
 {
-       bool flush = false;
-       int r;
-
-       mutex_lock(&kvm->slots_lock);
-
-       r = kvm_clear_dirty_log_protect(kvm, log, &flush);
-
-       if (flush)
-               kvm_flush_remote_tlbs(kvm);
-
-       mutex_unlock(&kvm->slots_lock);
-       return r;
+       kvm_flush_remote_tlbs(kvm);
 }
 
 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
index 19c961a..e3b9ee2 100644 (file)
@@ -1534,8 +1534,13 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 {
        struct kvm_memslots *slots = kvm_memslots(kvm);
        struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
-       phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
-       phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+       phys_addr_t start, end;
+
+       if (WARN_ON_ONCE(!memslot))
+               return;
+
+       start = memslot->base_gfn << PAGE_SHIFT;
+       end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
 
        spin_lock(&kvm->mmu_lock);
        stage2_wp_range(kvm, start, end);
@@ -2251,7 +2256,7 @@ out:
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                                   const struct kvm_userspace_memory_region *mem,
-                                  const struct kvm_memory_slot *old,
+                                  struct kvm_memory_slot *old,
                                   const struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
 {
@@ -2349,17 +2354,10 @@ out:
        return ret;
 }
 
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-                          struct kvm_memory_slot *dont)
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
 }
 
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
-                           unsigned long npages)
-{
-       return 0;
-}
-
 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 {
 }
index 17e2bdd..14a162e 100644 (file)
@@ -12,7 +12,6 @@
 
 #include <asm/cputype.h>
 #include <asm/kvm_emulate.h>
-#include <asm/kvm_host.h>
 
 #include <kvm/arm_psci.h>
 #include <kvm/arm_hypercalls.h>
index 70f03ce..f744bc6 100644 (file)
@@ -149,8 +149,6 @@ static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
 __visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
 
-static bool largepages_enabled = true;
-
 #define KVM_EVENT_CREATE_VM 0
 #define KVM_EVENT_DESTROY_VM 1
 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
@@ -566,7 +564,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void)
                return NULL;
 
        for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
-               slots->id_to_index[i] = slots->memslots[i].id = i;
+               slots->id_to_index[i] = -1;
 
        return slots;
 }
@@ -580,18 +578,14 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
        memslot->dirty_bitmap = NULL;
 }
 
-/*
- * Free any memory in @free but not in @dont.
- */
-static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-                             struct kvm_memory_slot *dont)
+static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
-       if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
-               kvm_destroy_dirty_bitmap(free);
+       kvm_destroy_dirty_bitmap(slot);
 
-       kvm_arch_free_memslot(kvm, free, dont);
+       kvm_arch_free_memslot(kvm, slot);
 
-       free->npages = 0;
+       slot->flags = 0;
+       slot->npages = 0;
 }
 
 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
@@ -602,7 +596,7 @@ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
                return;
 
        kvm_for_each_memslot(memslot, slots)
-               kvm_free_memslot(kvm, memslot, NULL);
+               kvm_free_memslot(kvm, memslot);
 
        kvfree(slots);
 }
@@ -860,9 +854,9 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 
 /*
  * Allocation size is twice as large as the actual dirty bitmap size.
- * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
+ * See kvm_vm_ioctl_get_dirty_log() for why this is needed.
  */
-static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
+static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
        unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
 
@@ -874,63 +868,165 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 }
 
 /*
- * Insert memslot and re-sort memslots based on their GFN,
- * so binary search could be used to lookup GFN.
- * Sorting algorithm takes advantage of having initially
- * sorted array and known changed memslot position.
+ * Delete a memslot by decrementing the number of used slots and shifting all
+ * other entries in the array forward one spot.
  */
-static void update_memslots(struct kvm_memslots *slots,
-                           struct kvm_memory_slot *new,
-                           enum kvm_mr_change change)
+static inline void kvm_memslot_delete(struct kvm_memslots *slots,
+                                     struct kvm_memory_slot *memslot)
 {
-       int id = new->id;
-       int i = slots->id_to_index[id];
        struct kvm_memory_slot *mslots = slots->memslots;
+       int i;
 
-       WARN_ON(mslots[i].id != id);
-       switch (change) {
-       case KVM_MR_CREATE:
-               slots->used_slots++;
-               WARN_ON(mslots[i].npages || !new->npages);
-               break;
-       case KVM_MR_DELETE:
-               slots->used_slots--;
-               WARN_ON(new->npages || !mslots[i].npages);
-               break;
-       default:
-               break;
-       }
+       if (WARN_ON(slots->id_to_index[memslot->id] == -1))
+               return;
 
-       while (i < KVM_MEM_SLOTS_NUM - 1 &&
-              new->base_gfn <= mslots[i + 1].base_gfn) {
-               if (!mslots[i + 1].npages)
-                       break;
+       slots->used_slots--;
+
+       if (atomic_read(&slots->lru_slot) >= slots->used_slots)
+               atomic_set(&slots->lru_slot, 0);
+
+       for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
                mslots[i] = mslots[i + 1];
                slots->id_to_index[mslots[i].id] = i;
-               i++;
        }
+       mslots[i] = *memslot;
+       slots->id_to_index[memslot->id] = -1;
+}
+
+/*
+ * "Insert" a new memslot by incrementing the number of used slots.  Returns
+ * the new slot's initial index into the memslots array.
+ */
+static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
+{
+       return slots->used_slots++;
+}
+
+/*
+ * Move a changed memslot backwards in the array by shifting existing slots
+ * with a higher GFN toward the front of the array.  Note, the changed memslot
+ * itself is not preserved in the array, i.e. not swapped at this time, only
+ * its new index into the array is tracked.  Returns the changed memslot's
+ * current index into the memslots array.
+ */
+static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
+                                           struct kvm_memory_slot *memslot)
+{
+       struct kvm_memory_slot *mslots = slots->memslots;
+       int i;
+
+       if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
+           WARN_ON_ONCE(!slots->used_slots))
+               return -1;
 
        /*
-        * The ">=" is needed when creating a slot with base_gfn == 0,
-        * so that it moves before all those with base_gfn == npages == 0.
-        *
-        * On the other hand, if new->npages is zero, the above loop has
-        * already left i pointing to the beginning of the empty part of
-        * mslots, and the ">=" would move the hole backwards in this
-        * case---which is wrong.  So skip the loop when deleting a slot.
+        * Move the target memslot backward in the array by shifting existing
+        * memslots with a higher GFN (than the target memslot) towards the
+        * front of the array.
         */
-       if (new->npages) {
-               while (i > 0 &&
-                      new->base_gfn >= mslots[i - 1].base_gfn) {
-                       mslots[i] = mslots[i - 1];
-                       slots->id_to_index[mslots[i].id] = i;
-                       i--;
-               }
-       } else
-               WARN_ON_ONCE(i != slots->used_slots);
+       for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
+               if (memslot->base_gfn > mslots[i + 1].base_gfn)
+                       break;
+
+               WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
 
-       mslots[i] = *new;
-       slots->id_to_index[mslots[i].id] = i;
+               /* Shift the next memslot forward one and update its index. */
+               mslots[i] = mslots[i + 1];
+               slots->id_to_index[mslots[i].id] = i;
+       }
+       return i;
+}
+
+/*
+ * Move a changed memslot forwards in the array by shifting existing slots with
+ * a lower GFN toward the back of the array.  Note, the changed memslot itself
+ * is not preserved in the array, i.e. not swapped at this time, only its new
+ * index into the array is tracked.  Returns the changed memslot's final index
+ * into the memslots array.
+ */
+static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
+                                          struct kvm_memory_slot *memslot,
+                                          int start)
+{
+       struct kvm_memory_slot *mslots = slots->memslots;
+       int i;
+
+       for (i = start; i > 0; i--) {
+               if (memslot->base_gfn < mslots[i - 1].base_gfn)
+                       break;
+
+               WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
+
+               /* Shift the next memslot back one and update its index. */
+               mslots[i] = mslots[i - 1];
+               slots->id_to_index[mslots[i].id] = i;
+       }
+       return i;
+}
+
+/*
+ * Re-sort memslots based on their GFN to account for an added, deleted, or
+ * moved memslot.  Sorting memslots by GFN allows using a binary search during
+ * memslot lookup.
+ *
+ * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  I.e. the entry
+ * at memslots[0] has the highest GFN.
+ *
+ * The sorting algorithm takes advantage of having initially sorted memslots
+ * and knowing the position of the changed memslot.  Sorting is also optimized
+ * by not swapping the updated memslot and instead only shifting other memslots
+ * and tracking the new index for the updated memslot.  Only once its final
+ * index is known is the updated memslot copied into its position in the array.
+ *
+ *  - When deleting a memslot, the deleted memslot simply needs to be moved to
+ *    the end of the array.
+ *
+ *  - When creating a memslot, the algorithm "inserts" the new memslot at the
+ *    end of the array and then moves it forward to its correct location.
+ *
+ *  - When moving a memslot, the algorithm first moves the updated memslot
+ *    backward to handle the scenario where the memslot's GFN was changed to a
+ *    lower value.  update_memslots() then falls through and runs the same flow
+ *    as creating a memslot to move the memslot forward to handle the scenario
+ *    where its GFN was changed to a higher value.
+ *
+ * Note, slots are sorted from highest->lowest instead of lowest->highest for
+ * historical reasons.  Originally, invalid memslots were denoted by having
+ * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
+ * to the end of the array.  The current algorithm uses dedicated logic to
+ * delete a memslot and thus does not rely on invalid memslots having GFN=0.
+ *
+ * The other historical motivation for highest->lowest was to improve the
+ * performance of memslot lookup.  KVM originally used a linear search starting
+ * at memslots[0].  On x86, the largest memslot usually has one of the highest,
+ * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
+ * single memslot above the 4gb boundary.  As the largest memslot is also the
+ * most likely to be referenced, sorting it to the front of the array was
+ * advantageous.  The current binary search starts from the middle of the array
+ * and uses an LRU pointer to improve performance for all memslots and GFNs.
+ */
+static void update_memslots(struct kvm_memslots *slots,
+                           struct kvm_memory_slot *memslot,
+                           enum kvm_mr_change change)
+{
+       int i;
+
+       if (change == KVM_MR_DELETE) {
+               kvm_memslot_delete(slots, memslot);
+       } else {
+               if (change == KVM_MR_CREATE)
+                       i = kvm_memslot_insert_back(slots);
+               else
+                       i = kvm_memslot_move_backward(slots, memslot);
+               i = kvm_memslot_move_forward(slots, memslot, i);
+
+               /*
+                * Copy the memslot to its new position in memslots and update
+                * its index accordingly.
+                */
+               slots->memslots[i] = *memslot;
+               slots->id_to_index[memslot->id] = i;
+       }
 }
 
 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
@@ -984,6 +1080,112 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 }
 
 /*
+ * Note, at a minimum, the current number of used slots must be allocated, even
+ * when deleting a memslot, as we need a complete duplicate of the memslots for
+ * use when invalidating a memslot prior to deleting/moving the memslot.
+ */
+static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
+                                            enum kvm_mr_change change)
+{
+       struct kvm_memslots *slots;
+       size_t old_size, new_size;
+
+       old_size = sizeof(struct kvm_memslots) +
+                  (sizeof(struct kvm_memory_slot) * old->used_slots);
+
+       if (change == KVM_MR_CREATE)
+               new_size = old_size + sizeof(struct kvm_memory_slot);
+       else
+               new_size = old_size;
+
+       slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
+       if (likely(slots))
+               memcpy(slots, old, old_size);
+
+       return slots;
+}
+
+static int kvm_set_memslot(struct kvm *kvm,
+                          const struct kvm_userspace_memory_region *mem,
+                          struct kvm_memory_slot *old,
+                          struct kvm_memory_slot *new, int as_id,
+                          enum kvm_mr_change change)
+{
+       struct kvm_memory_slot *slot;
+       struct kvm_memslots *slots;
+       int r;
+
+       slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
+       if (!slots)
+               return -ENOMEM;
+
+       if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
+               /*
+                * Note, the INVALID flag needs to be in the appropriate entry
+                * in the freshly allocated memslots, not in @old or @new.
+                */
+               slot = id_to_memslot(slots, old->id);
+               slot->flags |= KVM_MEMSLOT_INVALID;
+
+               /*
+                * We can re-use the old memslots, the only difference from the
+                * newly installed memslots is the invalid flag, which will get
+                * dropped by update_memslots anyway.  We'll also revert to the
+                * old memslots if preparing the new memory region fails.
+                */
+               slots = install_new_memslots(kvm, as_id, slots);
+
+               /* From this point no new shadow pages pointing to a deleted,
+                * or moved, memslot will be created.
+                *
+                * validation of sp->gfn happens in:
+                *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
+                *      - kvm_is_visible_gfn (mmu_check_root)
+                */
+               kvm_arch_flush_shadow_memslot(kvm, slot);
+       }
+
+       r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
+       if (r)
+               goto out_slots;
+
+       update_memslots(slots, new, change);
+       slots = install_new_memslots(kvm, as_id, slots);
+
+       kvm_arch_commit_memory_region(kvm, mem, old, new, change);
+
+       kvfree(slots);
+       return 0;
+
+out_slots:
+       if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
+               slots = install_new_memslots(kvm, as_id, slots);
+       kvfree(slots);
+       return r;
+}
+
+static int kvm_delete_memslot(struct kvm *kvm,
+                             const struct kvm_userspace_memory_region *mem,
+                             struct kvm_memory_slot *old, int as_id)
+{
+       struct kvm_memory_slot new;
+       int r;
+
+       if (!old->npages)
+               return -EINVAL;
+
+       memset(&new, 0, sizeof(new));
+       new.id = old->id;
+
+       r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
+       if (r)
+               return r;
+
+       kvm_free_memslot(kvm, old);
+       return 0;
+}
+
+/*
  * Allocate some memory and give it an address in the guest physical address
  * space.
  *
@@ -994,162 +1196,118 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 int __kvm_set_memory_region(struct kvm *kvm,
                            const struct kvm_userspace_memory_region *mem)
 {
-       int r;
-       gfn_t base_gfn;
-       unsigned long npages;
-       struct kvm_memory_slot *slot;
        struct kvm_memory_slot old, new;
-       struct kvm_memslots *slots = NULL, *old_memslots;
-       int as_id, id;
+       struct kvm_memory_slot *tmp;
        enum kvm_mr_change change;
+       int as_id, id;
+       int r;
 
        r = check_memory_region_flags(mem);
        if (r)
-               goto out;
+               return r;
 
-       r = -EINVAL;
        as_id = mem->slot >> 16;
        id = (u16)mem->slot;
 
        /* General sanity checks */
        if (mem->memory_size & (PAGE_SIZE - 1))
-               goto out;
+               return -EINVAL;
        if (mem->guest_phys_addr & (PAGE_SIZE - 1))
-               goto out;
+               return -EINVAL;
        /* We can read the guest memory with __xxx_user() later on. */
        if ((id < KVM_USER_MEM_SLOTS) &&
            ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
             !access_ok((void __user *)(unsigned long)mem->userspace_addr,
                        mem->memory_size)))
-               goto out;
+               return -EINVAL;
        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
-               goto out;
+               return -EINVAL;
        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
-               goto out;
-
-       slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
-       base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
-       npages = mem->memory_size >> PAGE_SHIFT;
+               return -EINVAL;
 
-       if (npages > KVM_MEM_MAX_NR_PAGES)
-               goto out;
+       /*
+        * Make a full copy of the old memslot, the pointer will become stale
+        * when the memslots are re-sorted by update_memslots(), and the old
+        * memslot needs to be referenced after calling update_memslots(), e.g.
+        * to free its resources and for arch specific behavior.
+        */
+       tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+       if (tmp) {
+               old = *tmp;
+               tmp = NULL;
+       } else {
+               memset(&old, 0, sizeof(old));
+               old.id = id;
+       }
 
-       new = old = *slot;
+       if (!mem->memory_size)
+               return kvm_delete_memslot(kvm, mem, &old, as_id);
 
        new.id = id;
-       new.base_gfn = base_gfn;
-       new.npages = npages;
+       new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
+       new.npages = mem->memory_size >> PAGE_SHIFT;
        new.flags = mem->flags;
+       new.userspace_addr = mem->userspace_addr;
 
-       if (npages) {
-               if (!old.npages)
-                       change = KVM_MR_CREATE;
-               else { /* Modify an existing slot. */
-                       if ((mem->userspace_addr != old.userspace_addr) ||
-                           (npages != old.npages) ||
-                           ((new.flags ^ old.flags) & KVM_MEM_READONLY))
-                               goto out;
+       if (new.npages > KVM_MEM_MAX_NR_PAGES)
+               return -EINVAL;
 
-                       if (base_gfn != old.base_gfn)
-                               change = KVM_MR_MOVE;
-                       else if (new.flags != old.flags)
-                               change = KVM_MR_FLAGS_ONLY;
-                       else { /* Nothing to change. */
-                               r = 0;
-                               goto out;
-                       }
-               }
-       } else {
-               if (!old.npages)
-                       goto out;
+       if (!old.npages) {
+               change = KVM_MR_CREATE;
+               new.dirty_bitmap = NULL;
+               memset(&new.arch, 0, sizeof(new.arch));
+       } else { /* Modify an existing slot. */
+               if ((new.userspace_addr != old.userspace_addr) ||
+                   (new.npages != old.npages) ||
+                   ((new.flags ^ old.flags) & KVM_MEM_READONLY))
+                       return -EINVAL;
+
+               if (new.base_gfn != old.base_gfn)
+                       change = KVM_MR_MOVE;
+               else if (new.flags != old.flags)
+                       change = KVM_MR_FLAGS_ONLY;
+               else /* Nothing to change. */
+                       return 0;
 
-               change = KVM_MR_DELETE;
-               new.base_gfn = 0;
-               new.flags = 0;
+               /* Copy dirty_bitmap and arch from the current memslot. */
+               new.dirty_bitmap = old.dirty_bitmap;
+               memcpy(&new.arch, &old.arch, sizeof(new.arch));
        }
 
        if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
                /* Check for overlaps */
-               r = -EEXIST;
-               kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
-                       if (slot->id == id)
+               kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
+                       if (tmp->id == id)
                                continue;
-                       if (!((base_gfn + npages <= slot->base_gfn) ||
-                             (base_gfn >= slot->base_gfn + slot->npages)))
-                               goto out;
+                       if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
+                             (new.base_gfn >= tmp->base_gfn + tmp->npages)))
+                               return -EEXIST;
                }
        }
 
-       /* Free page dirty bitmap if unneeded */
+       /* Allocate/free page dirty bitmap as needed */
        if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
                new.dirty_bitmap = NULL;
+       else if (!new.dirty_bitmap) {
+               r = kvm_alloc_dirty_bitmap(&new);
+               if (r)
+                       return r;
 
-       r = -ENOMEM;
-       if (change == KVM_MR_CREATE) {
-               new.userspace_addr = mem->userspace_addr;
-
-               if (kvm_arch_create_memslot(kvm, &new, npages))
-                       goto out_free;
-       }
-
-       /* Allocate page dirty bitmap if needed */
-       if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
-               if (kvm_create_dirty_bitmap(&new) < 0)
-                       goto out_free;
-       }
-
-       slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
-       if (!slots)
-               goto out_free;
-       memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
-
-       if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
-               slot = id_to_memslot(slots, id);
-               slot->flags |= KVM_MEMSLOT_INVALID;
-
-               old_memslots = install_new_memslots(kvm, as_id, slots);
-
-               /* From this point no new shadow pages pointing to a deleted,
-                * or moved, memslot will be created.
-                *
-                * validation of sp->gfn happens in:
-                *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
-                *      - kvm_is_visible_gfn (mmu_check_root)
-                */
-               kvm_arch_flush_shadow_memslot(kvm, slot);
-
-               /*
-                * We can re-use the old_memslots from above, the only difference
-                * from the currently installed memslots is the invalid flag.  This
-                * will get overwritten by update_memslots anyway.
-                */
-               slots = old_memslots;
+               if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+                       bitmap_set(new.dirty_bitmap, 0, new.npages);
        }
 
-       r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
+       r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
        if (r)
-               goto out_slots;
-
-       /* actual memory is freed via old in kvm_free_memslot below */
-       if (change == KVM_MR_DELETE) {
-               new.dirty_bitmap = NULL;
-               memset(&new.arch, 0, sizeof(new.arch));
-       }
-
-       update_memslots(slots, &new, change);
-       old_memslots = install_new_memslots(kvm, as_id, slots);
-
-       kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
+               goto out_bitmap;
 
-       kvm_free_memslot(kvm, &old, &new);
-       kvfree(old_memslots);
+       if (old.dirty_bitmap && !new.dirty_bitmap)
+               kvm_destroy_dirty_bitmap(&old);
        return 0;
 
-out_slots:
-       kvfree(slots);
-out_free:
-       kvm_free_memslot(kvm, &new, &old);
-out:
+out_bitmap:
+       if (new.dirty_bitmap && !old.dirty_bitmap)
+               kvm_destroy_dirty_bitmap(&new);
        return r;
 }
 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
@@ -1175,31 +1333,43 @@ static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
        return kvm_set_memory_region(kvm, mem);
 }
 
-int kvm_get_dirty_log(struct kvm *kvm,
-                       struct kvm_dirty_log *log, int *is_dirty)
+#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+/**
+ * kvm_get_dirty_log - get a snapshot of dirty pages
+ * @kvm:       pointer to kvm instance
+ * @log:       slot id and address to which we copy the log
+ * @is_dirty:  set to '1' if any dirty pages were found
+ * @memslot:   set to the associated memslot, always valid on success
+ */
+int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
+                     int *is_dirty, struct kvm_memory_slot **memslot)
 {
        struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
        int i, as_id, id;
        unsigned long n;
        unsigned long any = 0;
 
+       *memslot = NULL;
+       *is_dirty = 0;
+
        as_id = log->slot >> 16;
        id = (u16)log->slot;
        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
                return -EINVAL;
 
        slots = __kvm_memslots(kvm, as_id);
-       memslot = id_to_memslot(slots, id);
-       if (!memslot->dirty_bitmap)
+       *memslot = id_to_memslot(slots, id);
+       if (!(*memslot) || !(*memslot)->dirty_bitmap)
                return -ENOENT;
 
-       n = kvm_dirty_bitmap_bytes(memslot);
+       kvm_arch_sync_dirty_log(kvm, *memslot);
+
+       n = kvm_dirty_bitmap_bytes(*memslot);
 
        for (i = 0; !any && i < n/sizeof(long); ++i)
-               any = memslot->dirty_bitmap[i];
+               any = (*memslot)->dirty_bitmap[i];
 
-       if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+       if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
                return -EFAULT;
 
        if (any)
@@ -1208,13 +1378,12 @@ int kvm_get_dirty_log(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 
-#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 /**
  * kvm_get_dirty_log_protect - get a snapshot of dirty pages
  *     and reenable dirty page tracking for the corresponding pages.
  * @kvm:       pointer to kvm instance
  * @log:       slot id and address to which we copy the log
- * @flush:     true if TLB flush is needed by caller
  *
  * We need to keep it in mind that VCPU threads can write to the bitmap
  * concurrently. So, to avoid losing track of dirty pages we keep the
@@ -1231,8 +1400,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
  * exiting to userspace will be logged for the next call.
  *
  */
-int kvm_get_dirty_log_protect(struct kvm *kvm,
-                       struct kvm_dirty_log *log, bool *flush)
+static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
@@ -1240,6 +1408,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
        unsigned long n;
        unsigned long *dirty_bitmap;
        unsigned long *dirty_bitmap_buffer;
+       bool flush;
 
        as_id = log->slot >> 16;
        id = (u16)log->slot;
@@ -1248,13 +1417,15 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 
        slots = __kvm_memslots(kvm, as_id);
        memslot = id_to_memslot(slots, id);
+       if (!memslot || !memslot->dirty_bitmap)
+               return -ENOENT;
 
        dirty_bitmap = memslot->dirty_bitmap;
-       if (!dirty_bitmap)
-               return -ENOENT;
+
+       kvm_arch_sync_dirty_log(kvm, memslot);
 
        n = kvm_dirty_bitmap_bytes(memslot);
-       *flush = false;
+       flush = false;
        if (kvm->manual_dirty_log_protect) {
                /*
                 * Unlike kvm_get_dirty_log, we always return false in *flush,
@@ -1277,7 +1448,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
                        if (!dirty_bitmap[i])
                                continue;
 
-                       *flush = true;
+                       flush = true;
                        mask = xchg(&dirty_bitmap[i], 0);
                        dirty_bitmap_buffer[i] = mask;
 
@@ -1288,21 +1459,55 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
                spin_unlock(&kvm->mmu_lock);
        }
 
+       if (flush)
+               kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
+
        if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
                return -EFAULT;
        return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+
+
+/**
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
+ *
+ * Steps 1-4 below provide general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-4. It flushes the
+ * TLBs if needed (step 4) before copying the snapshot to userspace (step 3),
+ * so a failed copy does not skip the flush. Regardless of the outcome the KVM
+ * logging API does not preclude a subsequent dirty log read by user space;
+ * flushing the TLBs ensures writes will be marked dirty for that next read.
+ *
+ *   1. Take a snapshot of the bit and clear it if needed.
+ *   2. Write protect the corresponding page.
+ *   3. Copy the snapshot to the userspace.
+ *   4. Flush TLBs if needed.
+ */
+static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                                     struct kvm_dirty_log *log)
+{
+       int r;
+
+       mutex_lock(&kvm->slots_lock);
+
+       r = kvm_get_dirty_log_protect(kvm, log);
+
+       mutex_unlock(&kvm->slots_lock);
+       return r;
+}
 
 /**
  * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
  *     and reenable dirty page tracking for the corresponding pages.
  * @kvm:       pointer to kvm instance
  * @log:       slot id and address from which to fetch the bitmap of dirty pages
- * @flush:     true if TLB flush is needed by caller
  */
-int kvm_clear_dirty_log_protect(struct kvm *kvm,
-                               struct kvm_clear_dirty_log *log, bool *flush)
+static int kvm_clear_dirty_log_protect(struct kvm *kvm,
+                                      struct kvm_clear_dirty_log *log)
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
@@ -1311,6 +1516,7 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
        unsigned long i, n;
        unsigned long *dirty_bitmap;
        unsigned long *dirty_bitmap_buffer;
+       bool flush;
 
        as_id = log->slot >> 16;
        id = (u16)log->slot;
@@ -1322,10 +1528,10 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
 
        slots = __kvm_memslots(kvm, as_id);
        memslot = id_to_memslot(slots, id);
+       if (!memslot || !memslot->dirty_bitmap)
+               return -ENOENT;
 
        dirty_bitmap = memslot->dirty_bitmap;
-       if (!dirty_bitmap)
-               return -ENOENT;
 
        n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
 
@@ -1334,7 +1540,9 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
            (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
            return -EINVAL;
 
-       *flush = false;
+       kvm_arch_sync_dirty_log(kvm, memslot);
+
+       flush = false;
        dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
        if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
                return -EFAULT;
@@ -1357,28 +1565,32 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
                 * a problem if userspace sets them in log->dirty_bitmap.
                */
                if (mask) {
-                       *flush = true;
+                       flush = true;
                        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
                                                                offset, mask);
                }
        }
        spin_unlock(&kvm->mmu_lock);
 
+       if (flush)
+               kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
+
        return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
-#endif
 
-bool kvm_largepages_enabled(void)
+static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
+                                       struct kvm_clear_dirty_log *log)
 {
-       return largepages_enabled;
-}
+       int r;
 
-void kvm_disable_largepages(void)
-{
-       largepages_enabled = false;
+       mutex_lock(&kvm->slots_lock);
+
+       r = kvm_clear_dirty_log_protect(kvm, log);
+
+       mutex_unlock(&kvm->slots_lock);
+       return r;
 }
-EXPORT_SYMBOL_GPL(kvm_disable_largepages);
+#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
@@ -1754,12 +1966,6 @@ kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
 
-kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
-{
-       return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
-
 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
        return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
@@ -3310,9 +3516,6 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
        case KVM_CAP_IOEVENTFD_ANY_LENGTH:
        case KVM_CAP_CHECK_EXTENSION_VM:
        case KVM_CAP_ENABLE_CAP_VM:
-#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
-       case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
-#endif
                return 1;
 #ifdef CONFIG_KVM_MMIO
        case KVM_CAP_COALESCED_MMIO:
@@ -3320,6 +3523,10 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
        case KVM_CAP_COALESCED_PIO:
                return 1;
 #endif
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+       case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
+               return KVM_DIRTY_LOG_MANUAL_CAPS;
+#endif
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
        case KVM_CAP_IRQ_ROUTING:
                return KVM_MAX_IRQ_ROUTES;
@@ -3347,11 +3554,17 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
 {
        switch (cap->cap) {
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
-       case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
-               if (cap->flags || (cap->args[0] & ~1))
+       case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
+               u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
+
+               if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
+                       allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
+
+               if (cap->flags || (cap->args[0] & ~allowed_options))
                        return -EINVAL;
                kvm->manual_dirty_log_protect = cap->args[0];
                return 0;
+       }
 #endif
        default:
                return kvm_vm_ioctl_enable_cap(kvm, cap);