Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 26 May 2022 21:20:14 +0000 (14:20 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 26 May 2022 21:20:14 +0000 (14:20 -0700)
Pull kvm updates from Paolo Bonzini:
 "S390:

   - ultravisor communication device driver

   - fix TEID on terminating storage key ops

  RISC-V:

   - Added Sv57x4 support for G-stage page table

   - Added range based local HFENCE functions

   - Added remote HFENCE functions based on VCPU requests

   - Added ISA extension registers in ONE_REG interface

   - Updated KVM RISC-V maintainers entry to cover selftests support

  ARM:

   - Add support for the ARMv8.6 WFxT extension

   - Guard pages for the EL2 stacks

   - Trap and emulate AArch32 ID registers to hide unsupported features

   - Ability to select and save/restore the set of hypercalls exposed to
     the guest

   - Support for PSCI-initiated suspend in collaboration with userspace

   - GICv3 register-based LPI invalidation support

   - Move host PMU event merging into the vcpu data structure

   - GICv3 ITS save/restore fixes

   - The usual set of small-scale cleanups and fixes

  x86:

   - New ioctls to get/set TSC frequency for a whole VM

   - Allow userspace to opt out of hypercall patching

   - Only do MSR filtering for MSRs accessed by rdmsr/wrmsr

  AMD SEV improvements:

   - Add KVM_EXIT_SHUTDOWN metadata for SEV-ES

   - V_TSC_AUX support

  Nested virtualization improvements for AMD:

   - Support for "nested nested" optimizations (nested vVMLOAD/VMSAVE,
     nested vGIF)

   - Allow AVIC to co-exist with a nested guest running

   - Fixes for LBR virtualization when a nested guest is running, and
     nested LBR virtualization support

   - PAUSE filtering for nested hypervisors

  Guest support:

   - Decoupling of vcpu_is_preempted from PV spinlocks"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (199 commits)
  KVM: x86: Fix the intel_pt PMI handling wrongly considered from guest
  KVM: selftests: x86: Sync the new name of the test case to .gitignore
  Documentation: kvm: reorder ARM-specific section about KVM_SYSTEM_EVENT_SUSPEND
  x86, kvm: use correct GFP flags for preemption disabled
  KVM: LAPIC: Drop pending LAPIC timer injection when canceling the timer
  x86/kvm: Alloc dummy async #PF token outside of raw spinlock
  KVM: x86: avoid calling x86 emulator without a decoded instruction
  KVM: SVM: Use kzalloc for sev ioctl interfaces to prevent kernel data leak
  x86/fpu: KVM: Set the base guest FPU uABI size to sizeof(struct kvm_xsave)
  s390/uv_uapi: depend on CONFIG_S390
  KVM: selftests: x86: Fix test failure on arch lbr capable platforms
  KVM: LAPIC: Trace LAPIC timer expiration on every vmentry
  KVM: s390: selftest: Test suppression indication on key prot exception
  KVM: s390: Don't indicate suppression on dirtying, failing memop
  selftests: drivers/s390x: Add uvdevice tests
  drivers/s390/char: Add Ultravisor io device
  MAINTAINERS: Update KVM RISC-V entry to cover selftests support
  RISC-V: KVM: Introduce ISA extension register
  RISC-V: KVM: Cleanup stale TLB entries when host CPU changes
  RISC-V: KVM: Add remote HFENCE functions based on VCPU requests
  ...

141 files changed:
Documentation/arm64/cpu-feature-registers.rst
Documentation/arm64/elf_hwcaps.rst
Documentation/virt/kvm/api.rst
Documentation/virt/kvm/arm/hypercalls.rst [new file with mode: 0644]
Documentation/virt/kvm/arm/index.rst
Documentation/virt/kvm/arm/psci.rst [deleted file]
Documentation/virt/kvm/x86/mmu.rst
MAINTAINERS
arch/arm64/include/asm/barrier.h
arch/arm64/include/asm/cputype.h
arch/arm64/include/asm/esr.h
arch/arm64/include/asm/hwcap.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_emulate.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/uapi/asm/hwcap.h
arch/arm64/include/uapi/asm/kvm.h
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/cpuinfo.c
arch/arm64/kvm/Makefile
arch/arm64/kvm/arch_timer.c
arch/arm64/kvm/arm.c
arch/arm64/kvm/guest.c
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/hyp/include/nvhe/mm.h
arch/arm64/kvm/hyp/nvhe/host.S
arch/arm64/kvm/hyp/nvhe/hyp-main.c
arch/arm64/kvm/hyp/nvhe/mm.c
arch/arm64/kvm/hyp/nvhe/setup.c
arch/arm64/kvm/hyp/nvhe/switch.c
arch/arm64/kvm/hyp/nvhe/sys_regs.c
arch/arm64/kvm/hypercalls.c
arch/arm64/kvm/mmu.c
arch/arm64/kvm/pmu-emul.c
arch/arm64/kvm/pmu.c
arch/arm64/kvm/psci.c
arch/arm64/kvm/sys_regs.c
arch/arm64/kvm/sys_regs.h
arch/arm64/kvm/vgic/vgic-init.c
arch/arm64/kvm/vgic/vgic-its.c
arch/arm64/kvm/vgic/vgic-mmio-v2.c
arch/arm64/kvm/vgic/vgic-mmio-v3.c
arch/arm64/kvm/vgic/vgic-v3.c
arch/arm64/kvm/vgic/vgic.h
arch/arm64/lib/delay.c
arch/arm64/tools/cpucaps
arch/riscv/include/asm/csr.h
arch/riscv/include/asm/kvm_host.h
arch/riscv/include/uapi/asm/kvm.h
arch/riscv/kvm/main.c
arch/riscv/kvm/mmu.c
arch/riscv/kvm/tlb.S [deleted file]
arch/riscv/kvm/tlb.c [new file with mode: 0644]
arch/riscv/kvm/vcpu.c
arch/riscv/kvm/vcpu_exit.c
arch/riscv/kvm/vcpu_sbi_replace.c
arch/riscv/kvm/vcpu_sbi_v01.c
arch/riscv/kvm/vm.c
arch/riscv/kvm/vmid.c
arch/s390/include/asm/uv.h
arch/s390/include/uapi/asm/uvdevice.h [new file with mode: 0644]
arch/s390/kvm/gaccess.c
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/kvm-x86-ops.h
arch/x86/include/asm/kvm-x86-pmu-ops.h [new file with mode: 0644]
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/uaccess.h
arch/x86/include/asm/vmx.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/fpu/core.c
arch/x86/kernel/kvm.c
arch/x86/kernel/kvmclock.c
arch/x86/kvm/i8259.c
arch/x86/kvm/irq.c
arch/x86/kvm/irq_comm.c
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmu_internal.h
arch/x86/kvm/mmu/mmutrace.h
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/mmu/spte.c
arch/x86/kvm/mmu/spte.h
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/pmu.c
arch/x86/kvm/pmu.h
arch/x86/kvm/svm/avic.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/pmu.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/trace.h
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/pmu_intel.c
arch/x86/kvm/vmx/posted_intr.c
arch/x86/kvm/vmx/vmcs.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/xen.c
arch/x86/kvm/xen.h
drivers/s390/char/Kconfig
drivers/s390/char/Makefile
drivers/s390/char/uvdevice.c [new file with mode: 0644]
include/kvm/arm_arch_timer.h
include/kvm/arm_hypercalls.h
include/kvm/arm_pmu.h
include/kvm/arm_psci.h
include/kvm/arm_vgic.h
include/linux/kvm_host.h
include/uapi/linux/kvm.h
init/Kconfig
scripts/kallsyms.c
tools/include/linux/arm-smccc.h [new file with mode: 0644]
tools/testing/selftests/Makefile
tools/testing/selftests/drivers/.gitignore
tools/testing/selftests/drivers/s390x/uvdevice/Makefile [new file with mode: 0644]
tools/testing/selftests/drivers/s390x/uvdevice/config [new file with mode: 0644]
tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c [new file with mode: 0644]
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/aarch64/get-reg-list.c
tools/testing/selftests/kvm/aarch64/hypercalls.c [new file with mode: 0644]
tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c [deleted file]
tools/testing/selftests/kvm/aarch64/psci_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/include/aarch64/processor.h
tools/testing/selftests/kvm/include/riscv/processor.h
tools/testing/selftests/kvm/lib/aarch64/processor.c
tools/testing/selftests/kvm/lib/riscv/processor.c
tools/testing/selftests/kvm/lib/riscv/ucall.c
tools/testing/selftests/kvm/s390x/memop.c
tools/testing/selftests/kvm/steal_time.c
tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c [moved from tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c with 83% similarity]
tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
virt/kvm/kvm_main.c

index 749ae97..04ba83e 100644 (file)
@@ -290,6 +290,8 @@ infrastructure:
      +------------------------------+---------+---------+
      | RPRES                        | [7-4]   |    y    |
      +------------------------------+---------+---------+
+     | WFXT                         | [3-0]   |    y    |
+     +------------------------------+---------+---------+
 
 
 Appendix I: Example
index f8d818e..3d116fb 100644 (file)
@@ -297,6 +297,10 @@ HWCAP2_SME_FA64
 
     Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1.
 
+HWCAP2_WFXT
+
+    Functionality implied by ID_AA64ISAR2_EL1.WFXT == 0b0010.
+
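For reference, a userspace program might probe for this capability via the
auxiliary vector before relying on WFET/WFIT; a minimal sketch, assuming
arm64 uapi headers recent enough to define HWCAP2_WFXT::

   #include <stdio.h>
   #include <sys/auxv.h>      /* getauxval(), AT_HWCAP2 */
   #include <asm/hwcap.h>     /* HWCAP2_WFXT (arm64 uapi) */

   int main(void)
   {
           unsigned long hwcap2 = getauxval(AT_HWCAP2);

           /* WFET/WFIT may only be used if the kernel advertises FEAT_WFxT. */
           printf("wfxt: %ssupported\n", (hwcap2 & HWCAP2_WFXT) ? "" : "not ");
           return 0;
   }
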
 4. Unused AT_HWCAP bits
 -----------------------
 
index c8e2e9c..11e00a4 100644 (file)
@@ -982,12 +982,22 @@ memory.
        __u8 pad2[30];
   };
 
-If the KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag is returned from the
-KVM_CAP_XEN_HVM check, it may be set in the flags field of this ioctl.
-This requests KVM to generate the contents of the hypercall page
-automatically; hypercalls will be intercepted and passed to userspace
-through KVM_EXIT_XEN.  In this case, all of the blob size and address
-fields must be zero.
+If certain flags are returned from the KVM_CAP_XEN_HVM check, they may
+be set in the flags field of this ioctl:
+
+The KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag requests KVM to generate
+the contents of the hypercall page automatically; hypercalls will be
+intercepted and passed to userspace through KVM_EXIT_XEN.  In this
+case, all of the blob size and address fields must be zero.
+
+The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates to KVM that userspace
+will always use the KVM_XEN_HVM_EVTCHN_SEND ioctl to deliver event
+channel interrupts rather than manipulating the guest's shared_info
+structures directly. This, in turn, may allow KVM to enable features
+such as intercepting the SCHEDOP_poll hypercall to accelerate PV
+spinlock operation for the guest. Userspace may still use the ioctl
+to deliver events if it was advertised, even if userspace does not
+send this indication that it will always do so.
 
 No other flags are currently valid in the struct kvm_xen_hvm_config.
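
As a hedged sketch of how userspace might act on these flags (kvm_fd and
vm_fd are assumed to be descriptors obtained from opening /dev/kvm and from
KVM_CREATE_VM; error handling trimmed)::

   #include <linux/kvm.h>
   #include <string.h>
   #include <sys/ioctl.h>

   /* Request hypercall interception and, if available, commit to the
    * EVTCHN_SEND contract. Only flags advertised by the KVM_CAP_XEN_HVM
    * check may be set. */
   static int configure_xen_hvm(int kvm_fd, int vm_fd)
   {
           struct kvm_xen_hvm_config cfg;
           int xen_caps = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);

           if (xen_caps <= 0)
                   return -1;

           memset(&cfg, 0, sizeof(cfg));
           if (xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
                   cfg.flags |= KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL;
           if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)
                   cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;

           /* With INTERCEPT_HCALL, the blob size/address fields stay zero. */
           return ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);
   }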
 
@@ -1476,14 +1486,43 @@ Possible values are:
                                  [s390]
    KVM_MP_STATE_LOAD             the vcpu is in a special load/startup state
                                  [s390]
+   KVM_MP_STATE_SUSPENDED        the vcpu is in a suspend state and is waiting
+                                 for a wakeup event [arm64]
    ==========================    ===============================================
 
 On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
 in-kernel irqchip, the multiprocessing state must be maintained by userspace on
 these architectures.
 
-For arm64/riscv:
-^^^^^^^^^^^^^^^^
+For arm64:
+^^^^^^^^^^
+
+If a vCPU is in the KVM_MP_STATE_SUSPENDED state, KVM will emulate the
+architectural execution of a WFI instruction.
+
+If a wakeup event is recognized, KVM will exit to userspace with a
+KVM_SYSTEM_EVENT exit, where the event type is KVM_SYSTEM_EVENT_WAKEUP. If
+userspace wants to honor the wakeup, it must set the vCPU's MP state to
+KVM_MP_STATE_RUNNABLE. If it does not, KVM will continue to await a wakeup
+event in subsequent calls to KVM_RUN.
+
+.. warning::
+
+     If userspace intends to keep the vCPU in a SUSPENDED state, it is
+     strongly recommended that userspace take action to suppress the
+     wakeup event (such as masking an interrupt). Otherwise, subsequent
+     calls to KVM_RUN will immediately exit with a KVM_SYSTEM_EVENT_WAKEUP
+     event and inadvertently waste CPU cycles.
+
+     Additionally, if userspace takes action to suppress a wakeup event,
+     it is strongly recommended that it also restores the vCPU to its
+     original state when the vCPU is made RUNNABLE again. For example,
+     if userspace masked a pending interrupt to suppress the wakeup,
+     the interrupt should be unmasked before returning control to the
+     guest.
+
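A minimal sketch of the userspace side of this contract (vcpu_fd and run are
assumed to be the vCPU descriptor and its mmap'ed kvm_run structure; whether
to honor the wakeup is left to the caller)::

   #include <linux/kvm.h>
   #include <sys/ioctl.h>

   /* Handle a KVM_SYSTEM_EVENT_WAKEUP exit for a vCPU previously placed in
    * KVM_MP_STATE_SUSPENDED. Returns 0 when the vCPU was made runnable,
    * 1 if it was intentionally left suspended, negative otherwise. */
   static int handle_wakeup(int vcpu_fd, struct kvm_run *run, int honor)
   {
           struct kvm_mp_state state = { .mp_state = KVM_MP_STATE_RUNNABLE };

           if (run->exit_reason != KVM_EXIT_SYSTEM_EVENT ||
               run->system_event.type != KVM_SYSTEM_EVENT_WAKEUP)
                   return -1;      /* not a wakeup exit */

           if (!honor) {
                   /* Leaving the vCPU SUSPENDED: the wakeup source should be
                    * suppressed here, or the next KVM_RUN exits immediately. */
                   return 1;
           }

           return ioctl(vcpu_fd, KVM_SET_MP_STATE, &state);
   }
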
+For riscv:
+^^^^^^^^^^
 
 The only states that are valid are KVM_MP_STATE_STOPPED and
 KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not.
@@ -1887,22 +1926,25 @@ the future.
 4.55 KVM_SET_TSC_KHZ
 --------------------
 
-:Capability: KVM_CAP_TSC_CONTROL
+:Capability: KVM_CAP_TSC_CONTROL / KVM_CAP_VM_TSC_CONTROL
 :Architectures: x86
-:Type: vcpu ioctl
+:Type: vcpu ioctl / vm ioctl
 :Parameters: virtual tsc_khz
 :Returns: 0 on success, -1 on error
 
 Specifies the tsc frequency for the virtual machine. The unit of the
 frequency is KHz.
 
+If the KVM_CAP_VM_TSC_CONTROL capability is advertised, this can also
+be used as a vm ioctl to set the initial tsc frequency of subsequently
+created vCPUs.
 
 4.56 KVM_GET_TSC_KHZ
 --------------------
 
-:Capability: KVM_CAP_GET_TSC_KHZ
+:Capability: KVM_CAP_GET_TSC_KHZ / KVM_CAP_VM_TSC_CONTROL
 :Architectures: x86
-:Type: vcpu ioctl
+:Type: vcpu ioctl / vm ioctl
 :Parameters: none
 :Returns: virtual tsc-khz on success, negative value on error
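
For illustration only, a VM-wide use of this pair of ioctls might look like
the following sketch (vm_fd is assumed to be a KVM_CREATE_VM descriptor; the
frequency is passed directly as the ioctl argument, in KHz)::

   #include <linux/kvm.h>
   #include <sys/ioctl.h>

   /* Set the default TSC frequency for subsequently created vCPUs and read
    * it back. Requires KVM_CAP_VM_TSC_CONTROL. */
   static long set_vm_tsc_khz(int kvm_fd, int vm_fd, unsigned long tsc_khz)
   {
           if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TSC_CONTROL) <= 0)
                   return -1;      /* VM-level TSC control not available */

           if (ioctl(vm_fd, KVM_SET_TSC_KHZ, tsc_khz) < 0)
                   return -1;

           return ioctl(vm_fd, KVM_GET_TSC_KHZ, 0);  /* returns KHz */
   }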
 
@@ -2601,6 +2643,24 @@ EINVAL.
 After the vcpu's SVE configuration is finalized, further attempts to
 write this register will fail with EPERM.
 
+arm64 bitmap feature firmware pseudo-registers have the following bit pattern::
+
+  0x6030 0000 0016 <regno:16>
+
+The bitmap feature firmware registers expose the hypercall services that
+are available for userspace to configure. The set bits correspond to the
+services that are available for the guests to access. By default, KVM
+sets all the supported bits during VM initialization. Userspace can
+discover the available services via KVM_GET_ONE_REG, and write back the
+bitmap corresponding to the features that it wishes guests to see via
+KVM_SET_ONE_REG.
+
+Note: These registers are immutable once any of the vCPUs of the VM has
+run at least once. A KVM_SET_ONE_REG in such a scenario will return
+-EBUSY to userspace.
+
+(See Documentation/virt/kvm/arm/hypercalls.rst for more details.)
+
 
 MIPS registers are mapped using the lower 32 bits.  The upper 16 of that is
 the register group type:
@@ -3754,12 +3814,18 @@ in case of KVM_S390_MEMOP_F_CHECK_ONLY), the ioctl returns a positive
 error number indicating the type of exception. This exception is also
 raised directly at the corresponding VCPU if the flag
 KVM_S390_MEMOP_F_INJECT_EXCEPTION is set.
+On protection exceptions, unless specified otherwise, the injected
+translation-exception identifier (TEID) indicates suppression.
 
 If the KVM_S390_MEMOP_F_SKEY_PROTECTION flag is set, storage key
 protection is also in effect and may cause exceptions if accesses are
 prohibited given the access key designated by "key"; the valid range is 0..15.
 KVM_S390_MEMOP_F_SKEY_PROTECTION is available if KVM_CAP_S390_MEM_OP_EXTENSION
 is > 0.
+Since the accessed memory may span multiple pages and those pages might have
+different storage keys, it is possible that a protection exception occurs
+after memory has been modified. In this case, if the exception is injected,
+the TEID does not indicate suppression.
 
 Absolute read/write:
 ^^^^^^^^^^^^^^^^^^^^
@@ -5216,7 +5282,25 @@ have deterministic behavior.
                struct {
                        __u64 gfn;
                } shared_info;
-               __u64 pad[4];
+               struct {
+                       __u32 send_port;
+                       __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
+                       __u32 flags;
+                       union {
+                               struct {
+                                       __u32 port;
+                                       __u32 vcpu;
+                                       __u32 priority;
+                               } port;
+                               struct {
+                                       __u32 port; /* Zero for eventfd */
+                                       __s32 fd;
+                               } eventfd;
+                               __u32 padding[4];
+                       } deliver;
+               } evtchn;
+               __u32 xen_version;
+               __u64 pad[8];
        } u;
   };
 
@@ -5247,6 +5331,30 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO
 
 KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
   Sets the exception vector used to deliver Xen event channel upcalls.
+  This is the HVM-wide vector injected directly by the hypervisor
+  (not through the local APIC), typically configured by a guest via
+  HVM_PARAM_CALLBACK_IRQ.
+
+KVM_XEN_ATTR_TYPE_EVTCHN
+  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures
+  an outbound port number for interception of EVTCHNOP_send requests
+  from the guest. A given sending port number may be directed back
+  to a specified vCPU (by APIC ID) / port / priority on the guest,
+  or to trigger events on an eventfd. The vCPU and priority can be
+  changed by setting KVM_XEN_EVTCHN_UPDATE in a subsequent call,
+  but other fields cannot change for a given sending port. A port
+  mapping is removed by using KVM_XEN_EVTCHN_DEASSIGN in the flags
+  field.
+
+KVM_XEN_ATTR_TYPE_XEN_VERSION
+  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures
+  the 32-bit version code returned to the guest when it invokes the
+  XENVER_version call; typically (XEN_MAJOR << 16 | XEN_MINOR). PV
+Xen guests will often use this as a dummy hypercall to trigger
+  event channel delivery, so responding within the kernel without
+  exiting to userspace is beneficial.
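
As an illustrative sketch only (the port number and eventfd are invented for
the example, and the EVTCHNSTAT_interdomain value is assumed to mirror Xen's
public event_channel.h header), redirecting a guest's EVTCHNOP_send on one
port to an eventfd could look like this::

   #include <linux/kvm.h>
   #include <sys/ioctl.h>

   #ifndef EVTCHNSTAT_interdomain
   #define EVTCHNSTAT_interdomain 2   /* assumed; from Xen's event_channel.h */
   #endif

   /* Intercept guest EVTCHNOP_send on @guest_port and forward the event to
    * an eventfd polled by userspace. Port number and fd are illustrative. */
   static int intercept_evtchn_send(int vm_fd, unsigned int guest_port, int efd)
   {
           struct kvm_xen_hvm_attr attr = {
                   .type = KVM_XEN_ATTR_TYPE_EVTCHN,
                   .u.evtchn = {
                           .send_port = guest_port,
                           .type = EVTCHNSTAT_interdomain,
                           .flags = 0,             /* assign a new mapping */
                           .deliver.eventfd = {
                                   .port = 0,      /* zero selects eventfd delivery */
                                   .fd = efd,
                           },
                   },
           };

           return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr);
   }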
 
 4.127 KVM_XEN_HVM_GET_ATTR
 --------------------------
@@ -5258,7 +5366,8 @@ KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
 :Returns: 0 on success, < 0 on error
 
 Allows Xen VM attributes to be read. For the structure and types,
-see KVM_XEN_HVM_SET_ATTR above.
+see KVM_XEN_HVM_SET_ATTR above. The KVM_XEN_ATTR_TYPE_EVTCHN
+attribute cannot be read.
 
 4.128 KVM_XEN_VCPU_SET_ATTR
 ---------------------------
@@ -5285,6 +5394,13 @@ see KVM_XEN_HVM_SET_ATTR above.
                        __u64 time_blocked;
                        __u64 time_offline;
                } runstate;
+               __u32 vcpu_id;
+               struct {
+                       __u32 port;
+                       __u32 priority;
+                       __u64 expires_ns;
+               } timer;
+               __u8 vector;
        } u;
   };
 
@@ -5326,6 +5442,27 @@ KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST
   or RUNSTATE_offline) to set the current accounted state as of the
   adjusted state_entry_time.
 
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID
+  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the Xen
+  vCPU ID of the given vCPU, to allow timer-related VCPU operations to
+  be intercepted by KVM.
+
+KVM_XEN_VCPU_ATTR_TYPE_TIMER
+  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the
+  event channel port/priority for the VIRQ_TIMER of the vCPU, as well
+  as allowing a pending timer to be saved/restored.
+
+KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR
+  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the
+  per-vCPU local APIC upcall vector, configured by a Xen guest with
+  the HVMOP_set_evtchn_upcall_vector hypercall. This is typically
+  used by Windows guests, and is distinct from the HVM-wide upcall
+  vector configured with HVM_PARAM_CALLBACK_IRQ.
+
+
 4.129 KVM_XEN_VCPU_GET_ATTR
 ---------------------------
 
@@ -5645,6 +5782,25 @@ enabled with ``arch_prctl()``, but this may change in the future.
 The offsets of the state save areas in struct kvm_xsave follow the contents
 of CPUID leaf 0xD on the host.
 
+4.135 KVM_XEN_HVM_EVTCHN_SEND
+-----------------------------
+
+:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_irq_routing_xen_evtchn
+:Returns: 0 on success, < 0 on error
+
+
+::
+
+   struct kvm_irq_routing_xen_evtchn {
+       __u32 port;
+       __u32 vcpu;
+       __u32 priority;
+   };
+
+This ioctl injects an event channel interrupt directly to the guest vCPU.
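
A minimal usage sketch, assuming the KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL
priority constant for 2 level event channel delivery (vm_fd is the VM
descriptor)::

   #include <linux/kvm.h>
   #include <sys/ioctl.h>

   /* Inject Xen event channel @port into the vCPU with Xen vCPU ID @vcpu_id,
    * using 2-level delivery. */
   static int xen_evtchn_send(int vm_fd, unsigned int port, unsigned int vcpu_id)
   {
           struct kvm_irq_routing_xen_evtchn evt = {
                   .port = port,
                   .vcpu = vcpu_id,
                   .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
           };

           return ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &evt);
   }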
 
 5. The kvm_run structure
 ========================
@@ -5987,6 +6143,9 @@ should put the acknowledged interrupt vector into the 'epr' field.
   #define KVM_SYSTEM_EVENT_SHUTDOWN       1
   #define KVM_SYSTEM_EVENT_RESET          2
   #define KVM_SYSTEM_EVENT_CRASH          3
+  #define KVM_SYSTEM_EVENT_WAKEUP         4
+  #define KVM_SYSTEM_EVENT_SUSPEND        5
+  #define KVM_SYSTEM_EVENT_SEV_TERM       6
                        __u32 type;
                         __u32 ndata;
                         __u64 data[16];
@@ -6011,6 +6170,13 @@ Valid values for 'type' are:
    has requested a crash condition maintenance. Userspace can choose
    to ignore the request, or to gather VM memory core dump and/or
    reset/shutdown of the VM.
+ - KVM_SYSTEM_EVENT_SEV_TERM -- an AMD SEV guest requested termination.
+   The guest physical address of the guest's GHCB is stored in `data[0]`.
+ - KVM_SYSTEM_EVENT_WAKEUP -- the exiting vCPU is in a suspended state and
+   KVM has recognized a wakeup event. Userspace may honor this event by
+   marking the exiting vCPU as runnable, or deny it and call KVM_RUN again.
+ - KVM_SYSTEM_EVENT_SUSPEND -- the guest has requested a suspension of
+   the VM.
 
 If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain
 architecture specific information for the system-level event.  Only
@@ -6027,6 +6193,32 @@ Previous versions of Linux defined a `flags` member in this struct.  The
 field is now aliased to `data[0]`.  Userspace can assume that it is only
 written if ndata is greater than 0.
 
+For arm/arm64:
+--------------
+
+KVM_SYSTEM_EVENT_SUSPEND exits are enabled with the
+KVM_CAP_ARM_SYSTEM_SUSPEND VM capability. If a guest invokes the PSCI
+SYSTEM_SUSPEND function, KVM will exit to userspace with this event
+type.
+
+It is the sole responsibility of userspace to implement the PSCI
+SYSTEM_SUSPEND call according to ARM DEN0022D.b 5.19 "SYSTEM_SUSPEND".
+KVM does not change the vCPU's state before exiting to userspace, so
+the call parameters are left in-place in the vCPU registers.
+
+Userspace is _required_ to take action for such an exit. It must
+either:
+
+ - Honor the guest request to suspend the VM. Userspace can request
+   in-kernel emulation of suspension by setting the calling vCPU's
+   state to KVM_MP_STATE_SUSPENDED. Userspace must configure the vCPU's
+   state according to the parameters passed to the PSCI function when
+   the calling vCPU is resumed. See ARM DEN0022D.b 5.19.1 "Intended use"
+   for details on the function parameters.
+
+ - Deny the guest request to suspend the VM. See ARM DEN0022D.b 5.19.2
+   "Caller responsibilities" for possible return values.
+
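A rough sketch of both sides of this flow, under the assumption that the VMM
honors the request with the in-kernel suspend emulation (restoring the vCPU
state per the PSCI parameters is omitted)::

   #include <linux/kvm.h>
   #include <string.h>
   #include <sys/ioctl.h>

   /* Opt in to KVM_SYSTEM_EVENT_SUSPEND exits, once, at VM setup time. */
   static int enable_system_suspend(int vm_fd)
   {
           struct kvm_enable_cap cap;

           memset(&cap, 0, sizeof(cap));
           cap.cap = KVM_CAP_ARM_SYSTEM_SUSPEND;
           return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
   }

   /* Honor a guest SYSTEM_SUSPEND request by parking the calling vCPU in the
    * in-kernel suspend emulation; denying the request would instead set the
    * PSCI return value and resume the vCPU. */
   static int handle_system_suspend(int vcpu_fd, struct kvm_run *run)
   {
           struct kvm_mp_state state = { .mp_state = KVM_MP_STATE_SUSPENDED };

           if (run->exit_reason != KVM_EXIT_SYSTEM_EVENT ||
               run->system_event.type != KVM_SYSTEM_EVENT_SUSPEND)
                   return 0;

           return ioctl(vcpu_fd, KVM_SET_MP_STATE, &state);
   }
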
 ::
 
                /* KVM_EXIT_IOAPIC_EOI */
@@ -7147,6 +7339,15 @@ The valid bits in cap.args[0] are:
                                     Additionally, when this quirk is disabled,
                                     KVM clears CPUID.01H:ECX[bit 3] if
                                     IA32_MISC_ENABLE[bit 18] is cleared.
+
+ KVM_X86_QUIRK_FIX_HYPERCALL_INSN   By default, KVM rewrites guest
+                                    VMMCALL/VMCALL instructions to match the
+                                    vendor's hypercall instruction for the
+                                    system. When this quirk is disabled, KVM
+                                    will no longer rewrite invalid guest
+                                    hypercall instructions. Executing the
+                                    incorrect hypercall instruction will
+                                    generate a #UD within the guest.
 =================================== ============================================
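
For instance, assuming this table belongs to KVM_CAP_DISABLE_QUIRKS2 (whose
cap.args[0] carries the quirk mask), a VMM could opt out of hypercall
patching with a sketch like::

   #include <linux/kvm.h>
   #include <string.h>
   #include <sys/ioctl.h>

   /* Stop KVM from rewriting mismatched VMCALL/VMMCALL instructions; the
    * guest will see #UD for the wrong vendor's instruction instead. */
   static int disable_hypercall_patching(int vm_fd)
   {
           struct kvm_enable_cap cap;

           memset(&cap, 0, sizeof(cap));
           cap.cap = KVM_CAP_DISABLE_QUIRKS2;
           cap.args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN;
           return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
   }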
 
 8. Other capabilities.
@@ -7624,8 +7825,9 @@ PVHVM guests. Valid flags are::
   #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR     (1 << 0)
   #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL   (1 << 1)
   #define KVM_XEN_HVM_CONFIG_SHARED_INFO       (1 << 2)
-  #define KVM_XEN_HVM_CONFIG_RUNSTATE          (1 << 2)
-  #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL     (1 << 3)
+  #define KVM_XEN_HVM_CONFIG_RUNSTATE          (1 << 3)
+  #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL     (1 << 4)
+  #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND       (1 << 5)
 
 The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
 ioctl is available, for the guest to set its hypercall page.
@@ -7649,6 +7851,14 @@ The KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL flag indicates that IRQ routing entries
 of the type KVM_IRQ_ROUTING_XEN_EVTCHN are supported, with the priority
 field set to indicate 2 level event channel delivery.
 
+The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates that KVM supports
+injecting event channel events directly into the guest with the
+KVM_XEN_HVM_EVTCHN_SEND ioctl. It also indicates support for the
+KVM_XEN_ATTR_TYPE_EVTCHN/XEN_VERSION HVM attributes and the
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID/TIMER/UPCALL_VECTOR vCPU attributes
+related to event channel delivery, timers, and the XENVER_version
+interception.
+
 8.31 KVM_CAP_PPC_MULTITCE
 -------------------------
 
@@ -7736,6 +7946,16 @@ At this time, KVM_PMU_CAP_DISABLE is the only capability.  Setting
 this capability will disable PMU virtualization for that VM.  Usermode
 should adjust CPUID leaf 0xA to reflect that the PMU is disabled.
 
+8.36 KVM_CAP_ARM_SYSTEM_SUSPEND
+-------------------------------
+
+:Capability: KVM_CAP_ARM_SYSTEM_SUSPEND
+:Architectures: arm64
+:Type: vm
+
+When enabled, KVM will exit to userspace with KVM_EXIT_SYSTEM_EVENT of
+type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request.
+
 9. Known KVM API problems
 =========================
 
diff --git a/Documentation/virt/kvm/arm/hypercalls.rst b/Documentation/virt/kvm/arm/hypercalls.rst
new file mode 100644 (file)
index 0000000..3e23084
--- /dev/null
@@ -0,0 +1,138 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================
+ARM Hypercall Interface
+=======================
+
+KVM handles the hypercall services as requested by the guests. New hypercall
+services are regularly made available by the ARM specification or by KVM (as
+vendor services) if they make sense from a virtualization point of view.
+
+This means that a guest booted on two different versions of KVM can observe
+two different "firmware" revisions. This could cause issues if a given guest
+is tied to a particular version of a hypercall service, or if a migration
+causes a different version to be exposed out of the blue to an unsuspecting
+guest.
+
+In order to remedy this situation, KVM exposes a set of "firmware
+pseudo-registers" that can be manipulated using the GET/SET_ONE_REG
+interface. These registers can be saved/restored by userspace, and set
+to a convenient value as required.
+
+The following registers are defined:
+
+* KVM_REG_ARM_PSCI_VERSION:
+
+  KVM implements the PSCI (Power State Coordination Interface)
+  specification in order to provide services such as CPU on/off, reset
+  and power-off to the guest.
+
+  - Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set
+    (and thus has already been initialized)
+  - Returns the current PSCI version on GET_ONE_REG (defaulting to the
+    highest PSCI version implemented by KVM and compatible with v0.2)
+  - Allows any PSCI version implemented by KVM and compatible with
+    v0.2 to be set with SET_ONE_REG
+  - Affects the whole VM (even if the register view is per-vcpu)
+
+* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+    Holds the state of the firmware support to mitigate CVE-2017-5715, as
+    offered by KVM to the guest via a HVC call. The workaround is described
+    under SMCCC_ARCH_WORKAROUND_1 in [1].
+
+  Accepted values are:
+
+    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL:
+      KVM does not offer
+      firmware support for the workaround. The mitigation status for the
+      guest is unknown.
+    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL:
+      The workaround HVC call is
+      available to the guest and required for the mitigation.
+    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED:
+      The workaround HVC call
+      is available to the guest, but it is not needed on this VCPU.
+
+* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+    Holds the state of the firmware support to mitigate CVE-2018-3639, as
+    offered by KVM to the guest via a HVC call. The workaround is described
+    under SMCCC_ARCH_WORKAROUND_2 in [1]_.
+
+  Accepted values are:
+
+    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
+      A workaround is not
+      available. KVM does not offer firmware support for the workaround.
+    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
+      The workaround state is
+      unknown. KVM does not offer firmware support for the workaround.
+    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
+      The workaround is available,
+      and can be disabled by a vCPU. If
+      KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for
+      this vCPU.
+    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
+      The workaround is always active on this vCPU or it is not needed.
+
+
+Bitmap Feature Firmware Registers
+---------------------------------
+
+Contrary to the above registers, the following registers expose the
+hypercall services to userspace in the form of a feature bitmap. This
+bitmap is translated to the services that are available to the guest.
+One register is defined per service call owner, and each can be accessed
+via the GET/SET_ONE_REG interface.
+
+By default, these registers are set to the upper limit of the supported
+features. This way userspace can discover all the usable hypercall
+services via GET_ONE_REG and write back the desired bitmap via
+SET_ONE_REG. Registers that are left untouched, typically because
+userspace isn't aware of them, are exposed to the guest as is.
+
+Note that KVM no longer allows userspace to configure these registers
+once any of the vCPUs has run at least once; such an attempt will
+return -EBUSY.
+
+The pseudo-firmware bitmap registers are as follows:
+
+* KVM_REG_ARM_STD_BMAP:
+    Controls the bitmap of the ARM Standard Secure Service Calls.
+
+  The following bits are accepted:
+
+    Bit-0: KVM_REG_ARM_STD_BIT_TRNG_V1_0:
+      The bit represents the services offered under v1.0 of the ARM True
+      Random Number Generator (TRNG) specification, ARM DEN0098.
+
+* KVM_REG_ARM_STD_HYP_BMAP:
+    Controls the bitmap of the ARM Standard Hypervisor Service Calls.
+
+  The following bits are accepted:
+
+    Bit-0: KVM_REG_ARM_STD_HYP_BIT_PV_TIME:
+      The bit represents the Paravirtualized Time service as specified by
+      ARM DEN0057A.
+
+* KVM_REG_ARM_VENDOR_HYP_BMAP:
+    Controls the bitmap of the Vendor specific Hypervisor Service Calls.
+
+  The following bits are accepted:
+
+    Bit-0: KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT
+      The bit represents the ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID
+      and ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID function-ids.
+
+    Bit-1: KVM_REG_ARM_VENDOR_HYP_BIT_PTP:
+      The bit represents the Precision Time Protocol KVM service.
+
+Errors:
+
+    =======  =============================================================
+    -ENOENT   Unknown register accessed.
+    -EBUSY    Attempt a 'write' to the register after the VM has started.
+    -EINVAL   Invalid bitmap written to the register.
+    =======  =============================================================
+
+.. [1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf
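
For illustration, discovering the standard-call bitmap and restricting it to
the TRNG service only (an arbitrary choice for the example) might look like
the following sketch; vcpu_fd is any vCPU descriptor of a VM that has not
yet run::

   #include <linux/kvm.h>
   #include <sys/ioctl.h>

   /* Read KVM_REG_ARM_STD_BMAP, keep only the TRNG bit, and write it back.
    * Must happen before any vCPU runs, or SET_ONE_REG fails with -EBUSY. */
   static int restrict_std_hypercalls(int vcpu_fd)
   {
           __u64 bmap;
           struct kvm_one_reg reg = {
                   .id   = KVM_REG_ARM_STD_BMAP,
                   .addr = (__u64)(unsigned long)&bmap,
           };

           if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
                   return -1;

           bmap &= 1ULL << KVM_REG_ARM_STD_BIT_TRNG_V1_0;  /* keep TRNG only */

           return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
   }
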
index 78a9b67..e848484 100644 (file)
@@ -8,6 +8,6 @@ ARM
    :maxdepth: 2
 
    hyp-abi
-   psci
+   hypercalls
    pvtime
    ptp_kvm
diff --git a/Documentation/virt/kvm/arm/psci.rst b/Documentation/virt/kvm/arm/psci.rst
deleted file mode 100644 (file)
index d52c2e8..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=========================================
-Power State Coordination Interface (PSCI)
-=========================================
-
-KVM implements the PSCI (Power State Coordination Interface)
-specification in order to provide services such as CPU on/off, reset
-and power-off to the guest.
-
-The PSCI specification is regularly updated to provide new features,
-and KVM implements these updates if they make sense from a virtualization
-point of view.
-
-This means that a guest booted on two different versions of KVM can
-observe two different "firmware" revisions. This could cause issues if
-a given guest is tied to a particular PSCI revision (unlikely), or if
-a migration causes a different PSCI version to be exposed out of the
-blue to an unsuspecting guest.
-
-In order to remedy this situation, KVM exposes a set of "firmware
-pseudo-registers" that can be manipulated using the GET/SET_ONE_REG
-interface. These registers can be saved/restored by userspace, and set
-to a convenient value if required.
-
-The following register is defined:
-
-* KVM_REG_ARM_PSCI_VERSION:
-
-  - Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set
-    (and thus has already been initialized)
-  - Returns the current PSCI version on GET_ONE_REG (defaulting to the
-    highest PSCI version implemented by KVM and compatible with v0.2)
-  - Allows any PSCI version implemented by KVM and compatible with
-    v0.2 to be set with SET_ONE_REG
-  - Affects the whole VM (even if the register view is per-vcpu)
-
-* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
-    Holds the state of the firmware support to mitigate CVE-2017-5715, as
-    offered by KVM to the guest via a HVC call. The workaround is described
-    under SMCCC_ARCH_WORKAROUND_1 in [1].
-
-  Accepted values are:
-
-    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL:
-      KVM does not offer
-      firmware support for the workaround. The mitigation status for the
-      guest is unknown.
-    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL:
-      The workaround HVC call is
-      available to the guest and required for the mitigation.
-    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED:
-      The workaround HVC call
-      is available to the guest, but it is not needed on this VCPU.
-
-* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
-    Holds the state of the firmware support to mitigate CVE-2018-3639, as
-    offered by KVM to the guest via a HVC call. The workaround is described
-    under SMCCC_ARCH_WORKAROUND_2 in [1]_.
-
-  Accepted values are:
-
-    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
-      A workaround is not
-      available. KVM does not offer firmware support for the workaround.
-    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
-      The workaround state is
-      unknown. KVM does not offer firmware support for the workaround.
-    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
-      The workaround is available,
-      and can be disabled by a vCPU. If
-      KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for
-      this vCPU.
-    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
-      The workaround is always active on this vCPU or it is not needed.
-
-.. [1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf
index 5b1ebad..8739120 100644 (file)
@@ -202,6 +202,10 @@ Shadow pages contain the following information:
     Is 1 if the MMU instance cannot use A/D bits.  EPT did not have A/D
     bits before Haswell; shadow EPT page tables also cannot use A/D bits
     if the L1 hypervisor does not enable them.
+  role.passthrough:
+    The page is not backed by a guest page table, but its first entry
+    points to one.  This is set if NPT uses 5-level page tables (host
+    CR4.LA57=1) and is shadowing L1's 4-level NPT (L1 CR4.LA57=1).
   gfn:
     Either the guest page table containing the translations shadowed by this
     page, or the base page frame for linear translations.  See role.direct.
index f806dfd..222ca09 100644 (file)
@@ -10830,6 +10830,8 @@ T:      git git://github.com/kvm-riscv/linux.git
 F:     arch/riscv/include/asm/kvm*
 F:     arch/riscv/include/uapi/asm/kvm*
 F:     arch/riscv/kvm/
+F:     tools/testing/selftests/kvm/*/riscv/
+F:     tools/testing/selftests/kvm/riscv/
 
 KERNEL VIRTUAL MACHINE for s390 (KVM/s390)
 M:     Christian Borntraeger <borntraeger@linux.ibm.com>
@@ -10844,9 +10846,12 @@ F:     Documentation/virt/kvm/s390*
 F:     arch/s390/include/asm/gmap.h
 F:     arch/s390/include/asm/kvm*
 F:     arch/s390/include/uapi/asm/kvm*
+F:     arch/s390/include/uapi/asm/uvdevice.h
 F:     arch/s390/kernel/uv.c
 F:     arch/s390/kvm/
 F:     arch/s390/mm/gmap.c
+F:     drivers/s390/char/uvdevice.c
+F:     tools/testing/selftests/drivers/s390x/uvdevice/
 F:     tools/testing/selftests/kvm/*/s390x/
 F:     tools/testing/selftests/kvm/s390x/
 
index 62217be..9f3e2c3 100644 (file)
 
 #define sev()          asm volatile("sev" : : : "memory")
 #define wfe()          asm volatile("wfe" : : : "memory")
+#define wfet(val)      asm volatile("msr s0_3_c1_c0_0, %0"     \
+                                    : : "r" (val) : "memory")
 #define wfi()          asm volatile("wfi" : : : "memory")
+#define wfit(val)      asm volatile("msr s0_3_c1_c0_1, %0"     \
+                                    : : "r" (val) : "memory")
 
 #define isb()          asm volatile("isb" : : : "memory")
 #define dmb(opt)       asm volatile("dmb " #opt : : : "memory")
index 92331c0..8aa0d27 100644 (file)
 
 #define APPLE_CPU_PART_M1_ICESTORM     0x022
 #define APPLE_CPU_PART_M1_FIRESTORM    0x023
+#define APPLE_CPU_PART_M1_ICESTORM_PRO 0x024
+#define APPLE_CPU_PART_M1_FIRESTORM_PRO        0x025
+#define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028
+#define APPLE_CPU_PART_M1_FIRESTORM_MAX        0x029
 
 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
 #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
 #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110)
 #define MIDR_APPLE_M1_ICESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM)
 #define MIDR_APPLE_M1_FIRESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM)
+#define MIDR_APPLE_M1_ICESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_PRO)
+#define MIDR_APPLE_M1_FIRESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_PRO)
+#define MIDR_APPLE_M1_ICESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_MAX)
+#define MIDR_APPLE_M1_FIRESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_MAX)
 
 /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */
 #define MIDR_FUJITSU_ERRATUM_010001            MIDR_FUJITSU_A64FX
index 8f236de..15b34fb 100644 (file)
 #define ESR_ELx_CV             (UL(1) << 24)
 #define ESR_ELx_COND_SHIFT     (20)
 #define ESR_ELx_COND_MASK      (UL(0xF) << ESR_ELx_COND_SHIFT)
-#define ESR_ELx_WFx_ISS_TI     (UL(1) << 0)
+#define ESR_ELx_WFx_ISS_RN     (UL(0x1F) << 5)
+#define ESR_ELx_WFx_ISS_RV     (UL(1) << 2)
+#define ESR_ELx_WFx_ISS_TI     (UL(3) << 0)
+#define ESR_ELx_WFx_ISS_WFxT   (UL(2) << 0)
 #define ESR_ELx_WFx_ISS_WFI    (UL(0) << 0)
 #define ESR_ELx_WFx_ISS_WFE    (UL(1) << 0)
 #define ESR_ELx_xVC_IMM_MASK   ((UL(1) << 16) - 1)
 #define DISR_EL1_ESR_MASK      (ESR_ELx_AET | ESR_ELx_EA | ESR_ELx_FSC)
 
 /* ESR value templates for specific events */
-#define ESR_ELx_WFx_MASK       (ESR_ELx_EC_MASK | ESR_ELx_WFx_ISS_TI)
+#define ESR_ELx_WFx_MASK       (ESR_ELx_EC_MASK |                      \
+                                (ESR_ELx_WFx_ISS_TI & ~ESR_ELx_WFx_ISS_WFxT))
 #define ESR_ELx_WFx_WFI_VAL    ((ESR_ELx_EC_WFx << ESR_ELx_EC_SHIFT) | \
                                 ESR_ELx_WFx_ISS_WFI)
 
index 9f0ce00..aa443d8 100644 (file)
 #define KERNEL_HWCAP_SME_B16F32                __khwcap2_feature(SME_B16F32)
 #define KERNEL_HWCAP_SME_F32F32                __khwcap2_feature(SME_F32F32)
 #define KERNEL_HWCAP_SME_FA64          __khwcap2_feature(SME_FA64)
+#define KERNEL_HWCAP_WFXT              __khwcap2_feature(WFXT)
 
 /*
  * This yields a mask that user programs can use to figure out what
index 13ae232..8aa8492 100644 (file)
  * FMO:                Override CPSR.F and enable signaling with VF
  * SWIO:       Turn set/way invalidates into set/way clean+invalidate
  * PTW:                Take a stage2 fault if a stage1 walk steps in device memory
+ * TID3:       Trap EL1 reads of group 3 ID registers
  */
 #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
                         HCR_BSU_IS | HCR_FB | HCR_TACR | \
                         HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
-                        HCR_FMO | HCR_IMO | HCR_PTW )
+                        HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 )
 #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
 #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA)
 #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC)
index d5b0386..2e277f2 100644 (file)
@@ -169,6 +169,7 @@ struct kvm_nvhe_init_params {
        unsigned long tcr_el2;
        unsigned long tpidr_el2;
        unsigned long stack_hyp_va;
+       unsigned long stack_pa;
        phys_addr_t pgd_pa;
        unsigned long hcr_el2;
        unsigned long vttbr;
index 0823317..0e66edd 100644 (file)
@@ -87,13 +87,6 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 
        if (vcpu_el1_is_32bit(vcpu))
                vcpu->arch.hcr_el2 &= ~HCR_RW;
-       else
-               /*
-                * TID3: trap feature register accesses that we virtualise.
-                * For now this is conditional, since no AArch32 feature regs
-                * are currently virtualised.
-                */
-               vcpu->arch.hcr_el2 |= HCR_TID3;
 
        if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
            vcpu_el1_is_32bit(vcpu))
index d5888de..47a1e25 100644 (file)
@@ -46,6 +46,7 @@
 #define KVM_REQ_RECORD_STEAL   KVM_ARCH_REQ(3)
 #define KVM_REQ_RELOAD_GICv4   KVM_ARCH_REQ(4)
 #define KVM_REQ_RELOAD_PMU     KVM_ARCH_REQ(5)
+#define KVM_REQ_SUSPEND                KVM_ARCH_REQ(6)
 
 #define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
                                     KVM_DIRTY_LOG_INITIALLY_SET)
@@ -101,15 +102,25 @@ struct kvm_s2_mmu {
 struct kvm_arch_memory_slot {
 };
 
+/**
+ * struct kvm_smccc_features: Descriptor of the hypercall services exposed to the guests
+ *
+ * @std_bmap: Bitmap of standard secure service calls
+ * @std_hyp_bmap: Bitmap of standard hypervisor service calls
+ * @vendor_hyp_bmap: Bitmap of vendor specific hypervisor service calls
+ */
+struct kvm_smccc_features {
+       unsigned long std_bmap;
+       unsigned long std_hyp_bmap;
+       unsigned long vendor_hyp_bmap;
+};
+
 struct kvm_arch {
        struct kvm_s2_mmu mmu;
 
        /* VTCR_EL2 value for this VM */
        u64    vtcr;
 
-       /* The maximum number of vCPUs depends on the used GIC model */
-       int max_vcpus;
-
        /* Interrupt controller */
        struct vgic_dist        vgic;
 
@@ -136,6 +147,8 @@ struct kvm_arch {
         */
 #define KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED             3
 #define KVM_ARCH_FLAG_EL1_32BIT                                4
+       /* PSCI SYSTEM_SUSPEND enabled for the guest */
+#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED           5
 
        unsigned long flags;
 
@@ -150,6 +163,9 @@ struct kvm_arch {
 
        u8 pfr0_csv2;
        u8 pfr0_csv3;
+
+       /* Hypercall features firmware registers' descriptor */
+       struct kvm_smccc_features smccc_feat;
 };
 
 struct kvm_vcpu_fault_info {
@@ -254,14 +270,8 @@ struct kvm_cpu_context {
        struct kvm_vcpu *__hyp_running_vcpu;
 };
 
-struct kvm_pmu_events {
-       u32 events_host;
-       u32 events_guest;
-};
-
 struct kvm_host_data {
        struct kvm_cpu_context host_ctxt;
-       struct kvm_pmu_events pmu_events;
 };
 
 struct kvm_host_psci_config {
@@ -368,8 +378,8 @@ struct kvm_vcpu_arch {
                u32     mdscr_el1;
        } guest_debug_preserved;
 
-       /* vcpu power-off state */
-       bool power_off;
+       /* vcpu power state */
+       struct kvm_mp_state mp_state;
 
        /* Don't run the guest (internal implementation need) */
        bool pause;
@@ -455,6 +465,7 @@ struct kvm_vcpu_arch {
 #define KVM_ARM64_FP_FOREIGN_FPSTATE   (1 << 14)
 #define KVM_ARM64_ON_UNSUPPORTED_CPU   (1 << 15) /* Physical CPU not in supported_cpus */
 #define KVM_ARM64_HOST_SME_ENABLED     (1 << 16) /* SME enabled for EL0 */
+#define KVM_ARM64_WFIT                 (1 << 17) /* WFIT instruction trapped */
 
 #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
                                 KVM_GUESTDBG_USE_SW_BP | \
@@ -687,10 +698,11 @@ int kvm_handle_cp14_64(struct kvm_vcpu *vcpu);
 int kvm_handle_cp15_32(struct kvm_vcpu *vcpu);
 int kvm_handle_cp15_64(struct kvm_vcpu *vcpu);
 int kvm_handle_sys_reg(struct kvm_vcpu *vcpu);
+int kvm_handle_cp10_id(struct kvm_vcpu *vcpu);
 
 void kvm_reset_sys_regs(struct kvm_vcpu *vcpu);
 
-void kvm_sys_reg_table_init(void);
+int kvm_sys_reg_table_init(void);
 
 /* MMIO helpers */
 void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
@@ -799,9 +811,6 @@ void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu);
 #ifdef CONFIG_KVM
 void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr);
 void kvm_clr_pmu_events(u32 clr);
-
-void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu);
-void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu);
 #else
 static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {}
 static inline void kvm_clr_pmu_events(u32 clr) {}
@@ -833,8 +842,6 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
 #define kvm_has_mte(kvm)                                       \
        (system_supports_mte() &&                               \
         test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags))
-#define kvm_vcpu_has_pmu(vcpu)                                 \
-       (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
 
 int kvm_trng_call(struct kvm_vcpu *vcpu);
 #ifdef CONFIG_KVM
@@ -845,4 +852,7 @@ void __init kvm_hyp_reserve(void);
 static inline void kvm_hyp_reserve(void) { }
 #endif
 
+void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu);
+bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu);
+
 #endif /* __ARM64_KVM_HOST_H__ */
index 74735a8..b208da3 100644 (file)
@@ -154,6 +154,9 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
 int kvm_share_hyp(void *from, void *to);
 void kvm_unshare_hyp(void *from, void *to);
 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
+int __create_hyp_mappings(unsigned long start, unsigned long size,
+                         unsigned long phys, enum kvm_pgtable_prot prot);
+int hyp_alloc_private_va_range(size_t size, unsigned long *haddr);
 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
                           void __iomem **kaddr,
                           void __iomem **haddr);
index b0256ce..4bb2cc8 100644 (file)
@@ -87,5 +87,6 @@
 #define HWCAP2_SME_B16F32      (1 << 28)
 #define HWCAP2_SME_F32F32      (1 << 29)
 #define HWCAP2_SME_FA64                (1 << 30)
+#define HWCAP2_WFXT            (1UL << 31)
 
 #endif /* _UAPI__ASM_HWCAP_H */
index ab58535..3bb1343 100644 (file)
@@ -334,6 +334,40 @@ struct kvm_arm_copy_mte_tags {
 #define KVM_ARM64_SVE_VLS_WORDS        \
        ((KVM_ARM64_SVE_VQ_MAX - KVM_ARM64_SVE_VQ_MIN) / 64 + 1)
 
+/* Bitmap feature firmware registers */
+#define KVM_REG_ARM_FW_FEAT_BMAP               (0x0016 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_FW_FEAT_BMAP_REG(r)                (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
+                                               KVM_REG_ARM_FW_FEAT_BMAP |      \
+                                               ((r) & 0xffff))
+
+#define KVM_REG_ARM_STD_BMAP                   KVM_REG_ARM_FW_FEAT_BMAP_REG(0)
+
+enum {
+       KVM_REG_ARM_STD_BIT_TRNG_V1_0   = 0,
+#ifdef __KERNEL__
+       KVM_REG_ARM_STD_BMAP_BIT_COUNT,
+#endif
+};
+
+#define KVM_REG_ARM_STD_HYP_BMAP               KVM_REG_ARM_FW_FEAT_BMAP_REG(1)
+
+enum {
+       KVM_REG_ARM_STD_HYP_BIT_PV_TIME = 0,
+#ifdef __KERNEL__
+       KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT,
+#endif
+};
+
+#define KVM_REG_ARM_VENDOR_HYP_BMAP            KVM_REG_ARM_FW_FEAT_BMAP_REG(2)
+
+enum {
+       KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT    = 0,
+       KVM_REG_ARM_VENDOR_HYP_BIT_PTP          = 1,
+#ifdef __KERNEL__
+       KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT,
+#endif
+};
+
 /* Device Control API: ARM VGIC */
 #define KVM_DEV_ARM_VGIC_GRP_ADDR      0
 #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1
index 4ccddf3..42ea2bd 100644 (file)
@@ -237,6 +237,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_GPA3_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_RPRES_SHIFT, 4, 0),
+       ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_WFXT_SHIFT, 4, 0),
        ARM64_FTR_END,
 };
 
@@ -2517,6 +2518,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
                .cpu_enable = fa64_kernel_enable,
        },
 #endif /* CONFIG_ARM64_SME */
+       {
+               .desc = "WFx with timeout",
+               .capability = ARM64_HAS_WFXT,
+               .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+               .sys_reg = SYS_ID_AA64ISAR2_EL1,
+               .sign = FTR_UNSIGNED,
+               .field_pos = ID_AA64ISAR2_WFXT_SHIFT,
+               .field_width = 4,
+               .matches = has_cpuid_feature,
+               .min_field_value = ID_AA64ISAR2_WFXT_SUPPORTED,
+       },
        {},
 };
 
@@ -2650,6 +2662,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
        HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_ECV_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV),
        HWCAP_CAP(SYS_ID_AA64MMFR1_EL1, ID_AA64MMFR1_AFP_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AFP),
        HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_RPRES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RPRES),
+       HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_WFXT_SHIFT, 4, FTR_UNSIGNED, ID_AA64ISAR2_WFXT_SUPPORTED, CAP_HWCAP, KERNEL_HWCAP_WFXT),
 #ifdef CONFIG_ARM64_SME
        HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SME_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_SME, CAP_HWCAP, KERNEL_HWCAP_SME),
        HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_FA64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_FA64, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
index 8a8136a..8eff0a3 100644 (file)
@@ -106,6 +106,7 @@ static const char *const hwcap_str[] = {
        [KERNEL_HWCAP_SME_B16F32]       = "smeb16f32",
        [KERNEL_HWCAP_SME_F32F32]       = "smef32f32",
        [KERNEL_HWCAP_SME_FA64]         = "smefa64",
+       [KERNEL_HWCAP_WFXT]             = "wfxt",
 };
 
 #ifdef CONFIG_COMPAT
index 261644b..aa127ae 100644 (file)
@@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/
 kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
         inject_fault.o va_layout.o handle_exit.o \
         guest.o debug.o reset.o sys_regs.o \
-        vgic-sys-reg-v3.o fpsimd.o pmu.o pkvm.o \
+        vgic-sys-reg-v3.o fpsimd.o pkvm.o \
         arch_timer.o trng.o vmid.o \
         vgic/vgic.o vgic/vgic-init.o \
         vgic/vgic-irqfd.o vgic/vgic-v2.o \
@@ -22,7 +22,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
         vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
         vgic/vgic-its.o vgic/vgic-debug.o
 
-kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o
+kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
 
 always-y := hyp_constants.h hyp-constants.s
 
index 6e542e2..4e39ace 100644 (file)
@@ -208,18 +208,16 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
-static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
+static u64 kvm_counter_compute_delta(struct arch_timer_context *timer_ctx,
+                                    u64 val)
 {
-       u64 cval, now;
-
-       cval = timer_get_cval(timer_ctx);
-       now = kvm_phys_timer_read() - timer_get_offset(timer_ctx);
+       u64 now = kvm_phys_timer_read() - timer_get_offset(timer_ctx);
 
-       if (now < cval) {
+       if (now < val) {
                u64 ns;
 
                ns = cyclecounter_cyc2ns(timecounter->cc,
-                                        cval - now,
+                                        val - now,
                                         timecounter->mask,
                                         &timecounter->frac);
                return ns;
@@ -228,6 +226,11 @@ static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
        return 0;
 }
 
+static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
+{
+       return kvm_counter_compute_delta(timer_ctx, timer_get_cval(timer_ctx));
+}
+
 static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
 {
        WARN_ON(timer_ctx && timer_ctx->loaded);
@@ -236,6 +239,20 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
                  (ARCH_TIMER_CTRL_IT_MASK | ARCH_TIMER_CTRL_ENABLE)) == ARCH_TIMER_CTRL_ENABLE);
 }
 
+static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu)
+{
+       return (cpus_have_final_cap(ARM64_HAS_WFXT) &&
+               (vcpu->arch.flags & KVM_ARM64_WFIT));
+}
+
+static u64 wfit_delay_ns(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_context *ctx = vcpu_vtimer(vcpu);
+       u64 val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));
+
+       return kvm_counter_compute_delta(ctx, val);
+}
+
 /*
  * Returns the earliest expiration time in ns among guest timers.
  * Note that it will return 0 if none of timers can fire.
@@ -253,6 +270,9 @@ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
                        min_delta = min(min_delta, kvm_timer_compute_delta(ctx));
        }
 
+       if (vcpu_has_wfit_active(vcpu))
+               min_delta = min(min_delta, wfit_delay_ns(vcpu));
+
        /* If none of timers can fire, then return 0 */
        if (min_delta == ULLONG_MAX)
                return 0;
@@ -350,15 +370,9 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
        return cval <= now;
 }
 
-bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-       struct timer_map map;
-
-       get_timer_map(vcpu, &map);
-
-       return kvm_timer_should_fire(map.direct_vtimer) ||
-              kvm_timer_should_fire(map.direct_ptimer) ||
-              kvm_timer_should_fire(map.emul_ptimer);
+       return vcpu_has_wfit_active(vcpu) && wfit_delay_ns(vcpu) == 0;
 }
 
 /*
@@ -484,7 +498,8 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
         */
        if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
            !kvm_timer_irq_can_fire(map.direct_ptimer) &&
-           !kvm_timer_irq_can_fire(map.emul_ptimer))
+           !kvm_timer_irq_can_fire(map.emul_ptimer) &&
+           !vcpu_has_wfit_active(vcpu))
                return;
 
        /*
index cedc3ba..400bb0f 100644 (file)
@@ -97,6 +97,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                }
                mutex_unlock(&kvm->lock);
                break;
+       case KVM_CAP_ARM_SYSTEM_SUSPEND:
+               r = 0;
+               set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
+               break;
        default:
                r = -EINVAL;
                break;
@@ -153,9 +157,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm_vgic_early_init(kvm);
 
        /* The maximum number of VCPUs is limited by the host's GIC model */
-       kvm->arch.max_vcpus = kvm_arm_default_max_vcpus();
+       kvm->max_vcpus = kvm_arm_default_max_vcpus();
 
        set_default_spectre(kvm);
+       kvm_arm_init_hypercalls(kvm);
 
        return ret;
 out_free_stage2_pgd:
@@ -210,6 +215,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_SET_GUEST_DEBUG:
        case KVM_CAP_VCPU_ATTRIBUTES:
        case KVM_CAP_PTP_KVM:
+       case KVM_CAP_ARM_SYSTEM_SUSPEND:
                r = 1;
                break;
        case KVM_CAP_SET_GUEST_DEBUG2:
@@ -230,7 +236,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_MAX_VCPUS:
        case KVM_CAP_MAX_VCPU_ID:
                if (kvm)
-                       r = kvm->arch.max_vcpus;
+                       r = kvm->max_vcpus;
                else
                        r = kvm_arm_default_max_vcpus();
                break;
@@ -306,7 +312,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
        if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
                return -EBUSY;
 
-       if (id >= kvm->arch.max_vcpus)
+       if (id >= kvm->max_vcpus)
                return -EINVAL;
 
        return 0;
@@ -356,11 +362,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_arm_vcpu_destroy(vcpu);
 }
 
-int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
-{
-       return kvm_timer_is_pending(vcpu);
-}
-
 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
 
@@ -432,20 +433,34 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
        vcpu->cpu = -1;
 }
 
-static void vcpu_power_off(struct kvm_vcpu *vcpu)
+void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.power_off = true;
+       vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
        kvm_make_request(KVM_REQ_SLEEP, vcpu);
        kvm_vcpu_kick(vcpu);
 }
 
+bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED;
+}
+
+static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED;
+       kvm_make_request(KVM_REQ_SUSPEND, vcpu);
+       kvm_vcpu_kick(vcpu);
+}
+
+static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED;
+}
+
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
-       if (vcpu->arch.power_off)
-               mp_state->mp_state = KVM_MP_STATE_STOPPED;
-       else
-               mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+       *mp_state = vcpu->arch.mp_state;
 
        return 0;
 }
@@ -457,10 +472,13 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 
        switch (mp_state->mp_state) {
        case KVM_MP_STATE_RUNNABLE:
-               vcpu->arch.power_off = false;
+               vcpu->arch.mp_state = *mp_state;
                break;
        case KVM_MP_STATE_STOPPED:
-               vcpu_power_off(vcpu);
+               kvm_arm_vcpu_power_off(vcpu);
+               break;
+       case KVM_MP_STATE_SUSPENDED:
+               kvm_arm_vcpu_suspend(vcpu);
                break;
        default:
                ret = -EINVAL;
@@ -480,7 +498,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
        bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
        return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
-               && !v->arch.power_off && !v->arch.pause);
+               && !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
 }
 
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
@@ -592,15 +610,15 @@ void kvm_arm_resume_guest(struct kvm *kvm)
        }
 }
 
-static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu)
 {
        struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
 
        rcuwait_wait_event(wait,
-                          (!vcpu->arch.power_off) &&(!vcpu->arch.pause),
+                          (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause),
                           TASK_INTERRUPTIBLE);
 
-       if (vcpu->arch.power_off || vcpu->arch.pause) {
+       if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) {
                /* Awaken to handle a signal, request we sleep again later. */
                kvm_make_request(KVM_REQ_SLEEP, vcpu);
        }
@@ -639,6 +657,7 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
        preempt_enable();
 
        kvm_vcpu_halt(vcpu);
+       vcpu->arch.flags &= ~KVM_ARM64_WFIT;
        kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 
        preempt_disable();
@@ -646,11 +665,53 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
        preempt_enable();
 }
 
-static void check_vcpu_requests(struct kvm_vcpu *vcpu)
+static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
+{
+       if (!kvm_arm_vcpu_suspended(vcpu))
+               return 1;
+
+       kvm_vcpu_wfi(vcpu);
+
+       /*
+        * The suspend state is sticky; we do not leave it until userspace
+        * explicitly marks the vCPU as runnable. Request that we suspend again
+        * later.
+        */
+       kvm_make_request(KVM_REQ_SUSPEND, vcpu);
+
+       /*
+        * Check to make sure the vCPU is actually runnable. If so, exit to
+        * userspace informing it of the wakeup condition.
+        */
+       if (kvm_arch_vcpu_runnable(vcpu)) {
+               memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
+               vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP;
+               vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+               return 0;
+       }
+
+       /*
+        * Otherwise, we were unblocked to process a different event, such as a
+        * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to
+        * process the event.
+        */
+       return 1;
+}
+
+/**
+ * check_vcpu_requests - check and handle pending vCPU requests
+ * @vcpu:      the VCPU pointer
+ *
+ * Return: 1 if we should enter the guest
+ *        0 if we should exit to userspace
+ *        < 0 if we should exit to userspace, where the return value indicates
+ *        an error
+ */
+static int check_vcpu_requests(struct kvm_vcpu *vcpu)
 {
        if (kvm_request_pending(vcpu)) {
                if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
-                       vcpu_req_sleep(vcpu);
+                       kvm_vcpu_sleep(vcpu);
 
                if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
                        kvm_reset_vcpu(vcpu);
@@ -675,7 +736,12 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
                if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
                        kvm_pmu_handle_pmcr(vcpu,
                                            __vcpu_sys_reg(vcpu, PMCR_EL0));
+
+               if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
+                       return kvm_vcpu_suspend(vcpu);
        }
+
+       return 1;
 }
 
 static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
@@ -792,7 +858,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                if (!ret)
                        ret = 1;
 
-               check_vcpu_requests(vcpu);
+               if (ret > 0)
+                       ret = check_vcpu_requests(vcpu);
 
                /*
                 * Preparing the interrupts to be injected also
@@ -816,6 +883,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
                kvm_vgic_flush_hwstate(vcpu);
 
+               kvm_pmu_update_vcpu_events(vcpu);
+
                /*
                 * Ensure we set mode to IN_GUEST_MODE after we disable
                 * interrupts and before the final VCPU requests check.
@@ -1125,9 +1194,9 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
         * Handle the "start in power-off" case.
         */
        if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
-               vcpu_power_off(vcpu);
+               kvm_arm_vcpu_power_off(vcpu);
        else
-               vcpu->arch.power_off = false;
+               vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;
 
        return 0;
 }
@@ -1485,7 +1554,6 @@ static void cpu_prepare_hyp_mode(int cpu)
        tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
        params->tcr_el2 = tcr;
 
-       params->stack_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stack_page, cpu) + PAGE_SIZE);
        params->pgd_pa = kvm_mmu_get_httbr();
        if (is_protected_kvm_enabled())
                params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
@@ -1763,8 +1831,6 @@ static int init_subsystems(void)
 
        kvm_register_perf_callbacks(NULL);
 
-       kvm_sys_reg_table_init();
-
 out:
        if (err || !is_protected_kvm_enabled())
                on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
@@ -1935,14 +2001,46 @@ static int init_hyp_mode(void)
         * Map the Hyp stack pages
         */
        for_each_possible_cpu(cpu) {
+               struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
                char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
-               err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
-                                         PAGE_HYP);
+               unsigned long hyp_addr;
+
+               /*
+                * Allocate a contiguous HYP private VA range for the stack
+                * and guard page. The allocation is also aligned based on
+                * the order of its size.
+                */
+               err = hyp_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr);
+               if (err) {
+                       kvm_err("Cannot allocate hyp stack guard page\n");
+                       goto out_err;
+               }
 
+               /*
+                * Since the stack grows downwards, map the stack to the page
+                * at the higher address and leave the lower guard page
+                * unbacked.
+                *
+                * Any valid stack address now has the PAGE_SHIFT bit as 1
+                * and addresses corresponding to the guard page have the
+                * PAGE_SHIFT bit as 0 - this is used for overflow detection.
+                */
+               err = __create_hyp_mappings(hyp_addr + PAGE_SIZE, PAGE_SIZE,
+                                           __pa(stack_page), PAGE_HYP);
                if (err) {
                        kvm_err("Cannot map hyp stack\n");
                        goto out_err;
                }
+
+               /*
+                * Save the stack PA in nvhe_init_params. This will be needed
+                * to recreate the stack mapping in protected nVHE mode.
+                * __hyp_pa() won't do the right thing there, since the stack
+                * has been mapped in the flexible private VA space.
+                */
+               params->stack_pa = __pa(stack_page);
+
+               params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE);
        }
 
        for_each_possible_cpu(cpu) {
@@ -2091,6 +2189,12 @@ int kvm_arch_init(void *opaque)
                return -ENODEV;
        }
 
+       err = kvm_sys_reg_table_init();
+       if (err) {
+               kvm_info("Error initializing system register tables\n");
+               return err;
+       }
+
        in_hyp_mode = is_kernel_in_hyp_mode();
 
        if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
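
On the userspace side, the new capability and mp_state are driven through the existing KVM_ENABLE_CAP and KVM_SET_MP_STATE ioctls. The fragment below is a speculative VMM-side sketch rather than code from any real VMM: vm_fd, vcpu_fd and run are assumed to come from the usual KVM setup sequence, and KVM_CAP_ARM_SYSTEM_SUSPEND, KVM_SYSTEM_EVENT_SUSPEND/WAKEUP and KVM_MP_STATE_SUSPENDED need a uapi header that already contains this series.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* vm_fd/vcpu_fd/run assumed from KVM_CREATE_VM/KVM_CREATE_VCPU and the run mmap. */
int enable_system_suspend(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_SYSTEM_SUSPEND };

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

void handle_system_event(int vcpu_fd, struct kvm_run *run)
{
	struct kvm_mp_state mp;

	if (run->exit_reason != KVM_EXIT_SYSTEM_EVENT)
		return;

	switch (run->system_event.type) {
	case KVM_SYSTEM_EVENT_SUSPEND:
		/* Honour the request: park the vCPU until a wakeup event. */
		mp.mp_state = KVM_MP_STATE_SUSPENDED;
		ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);
		break;
	case KVM_SYSTEM_EVENT_WAKEUP:
		/* A wakeup arrived while suspended: make it runnable again. */
		mp.mp_state = KVM_MP_STATE_RUNNABLE;
		ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);
		break;
	default:
		break;
	}
}
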
index 7e15b03..8c60719 100644 (file)
@@ -18,7 +18,7 @@
 #include <linux/string.h>
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
-#include <kvm/arm_psci.h>
+#include <kvm/arm_hypercalls.h>
 #include <asm/cputype.h>
 #include <linux/uaccess.h>
 #include <asm/fpsimd.h>
@@ -756,7 +756,9 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 
        switch (reg->id & KVM_REG_ARM_COPROC_MASK) {
        case KVM_REG_ARM_CORE:  return get_core_reg(vcpu, reg);
-       case KVM_REG_ARM_FW:    return kvm_arm_get_fw_reg(vcpu, reg);
+       case KVM_REG_ARM_FW:
+       case KVM_REG_ARM_FW_FEAT_BMAP:
+               return kvm_arm_get_fw_reg(vcpu, reg);
        case KVM_REG_ARM64_SVE: return get_sve_reg(vcpu, reg);
        }
 
@@ -774,7 +776,9 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 
        switch (reg->id & KVM_REG_ARM_COPROC_MASK) {
        case KVM_REG_ARM_CORE:  return set_core_reg(vcpu, reg);
-       case KVM_REG_ARM_FW:    return kvm_arm_set_fw_reg(vcpu, reg);
+       case KVM_REG_ARM_FW:
+       case KVM_REG_ARM_FW_FEAT_BMAP:
+               return kvm_arm_set_fw_reg(vcpu, reg);
        case KVM_REG_ARM64_SVE: return set_sve_reg(vcpu, reg);
        }
 
index 0b82929..f66c014 100644 (file)
@@ -80,24 +80,51 @@ static int handle_no_fpsimd(struct kvm_vcpu *vcpu)
  *
  * @vcpu:      the vcpu pointer
  *
- * WFE: Yield the CPU and come back to this vcpu when the scheduler
+ * WFE[T]: Yield the CPU and come back to this vcpu when the scheduler
  * decides to.
  * WFI: Simply call kvm_vcpu_halt(), which will halt execution of
  * world-switches and schedule other host processes until there is an
  * incoming IRQ or FIQ to the VM.
+ * WFIT: Same as WFI, with a timed wakeup implemented as a background timer
+ *
+ * WF{I,E}T can immediately return if the deadline has already expired.
  */
 static int kvm_handle_wfx(struct kvm_vcpu *vcpu)
 {
-       if (kvm_vcpu_get_esr(vcpu) & ESR_ELx_WFx_ISS_WFE) {
+       u64 esr = kvm_vcpu_get_esr(vcpu);
+
+       if (esr & ESR_ELx_WFx_ISS_WFE) {
                trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true);
                vcpu->stat.wfe_exit_stat++;
-               kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu));
        } else {
                trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false);
                vcpu->stat.wfi_exit_stat++;
-               kvm_vcpu_wfi(vcpu);
        }
 
+       if (esr & ESR_ELx_WFx_ISS_WFxT) {
+               if (esr & ESR_ELx_WFx_ISS_RV) {
+                       u64 val, now;
+
+                       now = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_TIMER_CNT);
+                       val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));
+
+                       if (now >= val)
+                               goto out;
+               } else {
+                       /* Treat WFxT as WFx if RN is invalid */
+                       esr &= ~ESR_ELx_WFx_ISS_WFxT;
+               }
+       }
+
+       if (esr & ESR_ELx_WFx_ISS_WFE) {
+               kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu));
+       } else {
+               if (esr & ESR_ELx_WFx_ISS_WFxT)
+                       vcpu->arch.flags |= KVM_ARM64_WFIT;
+
+               kvm_vcpu_wfi(vcpu);
+       }
+out:
        kvm_incr_pc(vcpu);
 
        return 1;
@@ -169,6 +196,7 @@ static exit_handle_fn arm_exit_handlers[] = {
        [ESR_ELx_EC_CP15_64]    = kvm_handle_cp15_64,
        [ESR_ELx_EC_CP14_MR]    = kvm_handle_cp14_32,
        [ESR_ELx_EC_CP14_LS]    = kvm_handle_cp14_load_store,
+       [ESR_ELx_EC_CP10_ID]    = kvm_handle_cp10_id,
        [ESR_ELx_EC_CP14_64]    = kvm_handle_cp14_64,
        [ESR_ELx_EC_HVC32]      = handle_hvc,
        [ESR_ELx_EC_SMC32]      = handle_smc,
@@ -297,13 +325,8 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
        u64 elr_in_kimg = __phys_to_kimg(elr_phys);
        u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt;
        u64 mode = spsr & PSR_MODE_MASK;
+       u64 panic_addr = elr_virt + hyp_offset;
 
-       /*
-        * The nVHE hyp symbols are not included by kallsyms to avoid issues
-        * with aliasing. That means that the symbols cannot be printed with the
-        * "%pS" format specifier, so fall back to the vmlinux address if
-        * there's no better option.
-        */
        if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
                kvm_err("Invalid host exception to nVHE hyp!\n");
        } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
@@ -323,9 +346,11 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
                if (file)
                        kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
                else
-                       kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset);
+                       kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr,
+                                       (void *)panic_addr);
        } else {
-               kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset);
+               kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr,
+                               (void *)panic_addr);
        }
 
        /*
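
The WFIT path in kvm_handle_wfx() above comes down to one comparison between the current counter and the Rt deadline: if the deadline has passed, the instruction completes immediately; otherwise the vCPU blocks with a wakeup armed for the remainder. A toy model of that decision, with plain integers standing in for CNTVCT_EL0 and the trapped register value:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * "now" and "deadline" stand in for the values the real code reads via
 * kvm_arm_timer_get_reg() and vcpu_get_reg().
 */
static bool wfit_should_block(uint64_t now, uint64_t deadline)
{
	/* Deadline already reached: complete the instruction immediately. */
	if (now >= deadline)
		return false;

	/* Otherwise block, with a wakeup armed for the remaining time. */
	return true;
}

int main(void)
{
	printf("expired : block=%d\n", wfit_should_block(1000, 500));
	printf("pending : block=%d\n", wfit_should_block(1000, 2500));
	return 0;
}
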
index 2d08510..42d8eb9 100644 (file)
@@ -19,8 +19,10 @@ int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back);
 int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
 int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
 int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot);
-unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
-                                           enum kvm_pgtable_prot prot);
+int __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
+                                 enum kvm_pgtable_prot prot,
+                                 unsigned long *haddr);
+int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr);
 
 static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size,
                                     unsigned long *start, unsigned long *end)
index 727c979..ea6a397 100644 (file)
@@ -80,7 +80,7 @@ SYM_FUNC_START(__hyp_do_panic)
        mov     lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
                      PSR_MODE_EL1h)
        msr     spsr_el2, lr
-       ldr     lr, =nvhe_hyp_panic_handler
+       adr_l   lr, nvhe_hyp_panic_handler
        hyp_kimg_va lr, x6
        msr     elr_el2, lr
 
@@ -125,13 +125,11 @@ alternative_else_nop_endif
        add     sp, sp, #16
        /*
         * Compute the idmap address of __kvm_handle_stub_hvc and
-        * jump there. Since we use kimage_voffset, do not use the
-        * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
-        * (by loading it from the constant pool).
+        * jump there.
         *
         * Preserve x0-x4, which may contain stub parameters.
         */
-       ldr     x5, =__kvm_handle_stub_hvc
+       adr_l   x5, __kvm_handle_stub_hvc
        hyp_pa  x5, x6
        br      x5
 SYM_FUNC_END(__host_hvc)
@@ -153,6 +151,18 @@ SYM_FUNC_END(__host_hvc)
 
 .macro invalid_host_el2_vect
        .align 7
+
+       /*
+        * Test whether the SP has overflowed, without corrupting a GPR.
+        * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit
+        * of SP should always be 1.
+        */
+       add     sp, sp, x0                      // sp' = sp + x0
+       sub     x0, sp, x0                      // x0' = sp' - x0 = (sp + x0) - x0 = sp
+       tbz     x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@
+       sub     x0, sp, x0                      // x0'' = sp' - x0' = (sp + x0) - sp = x0
+       sub     sp, sp, x0                      // sp'' = sp' - x0 = (sp + x0) - x0 = sp
+
        /* If a guest is loaded, panic out of it. */
        stp     x0, x1, [sp, #-16]!
        get_loaded_vcpu x0, x1
@@ -165,6 +175,18 @@ SYM_FUNC_END(__host_hvc)
         * been partially clobbered by __host_enter.
         */
        b       hyp_panic
+
+.L__hyp_sp_overflow\@:
+       /*
+        * Reset SP to the top of the stack, to allow handling the hyp_panic.
+        * This corrupts the stack but is ok, since we won't be attempting
+        * any unwinding here.
+        */
+       ldr_this_cpu    x0, kvm_init_params + NVHE_INIT_STACK_HYP_VA, x1
+       mov     sp, x0
+
+       b       hyp_panic_bad_stack
+       ASM_BUG()
 .endm
 
 .macro invalid_host_el1_vect
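
The overflow check added to invalid_host_el2_vect relies on an add/sub sequence that recovers both SP and x0 without touching a scratch register. The small C program below replays the same arithmetic (PAGE_SHIFT assumed to be 12, 64-bit values) to show that the sequence is an identity on (sp, x0) and that the tested bit is bit PAGE_SHIFT of the original SP; in the macro itself the branch to the overflow label is of course taken before the restore steps.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* 4K pages assumed, as in the layout comments */

/* Replays the add/sub sequence from invalid_host_el2_vect in plain C. */
static void check_sequence(uint64_t sp, uint64_t x0)
{
	const uint64_t orig_sp = sp, orig_x0 = x0;
	int overflowed;

	sp = sp + x0;				/* sp'  = sp + x0 */
	x0 = sp - x0;				/* x0'  = sp      */
	overflowed = !((x0 >> PAGE_SHIFT) & 1);	/* tbz x0, #PAGE_SHIFT */
	x0 = sp - x0;				/* x0'' = x0      */
	sp = sp - x0;				/* sp'' = sp      */

	assert(sp == orig_sp && x0 == orig_x0);
	printf("sp=%#llx x0=%#llx overflow=%d\n",
	       (unsigned long long)sp, (unsigned long long)x0, overflowed);
}

int main(void)
{
	check_sequence(0xffff800008001f80ULL, 0x1234);	/* stack page */
	check_sequence(0xffff800008000f80ULL, 0x1234);	/* guard page */
	return 0;
}
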
index 5e2197d..3cea4b6 100644 (file)
@@ -160,7 +160,23 @@ static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ct
        DECLARE_REG(size_t, size, host_ctxt, 2);
        DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3);
 
-       cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot);
+       /*
+        * __pkvm_create_private_mapping() populates a pointer with the
+        * hypervisor start address of the allocation.
+        *
+        * However, the handle___pkvm_create_private_mapping() hypercall crosses the
+        * EL1/EL2 boundary so the pointer would not be valid in this context.
+        *
+        * Instead pass the allocation address as the return value (or return
+        * ERR_PTR() on failure).
+        */
+       unsigned long haddr;
+       int err = __pkvm_create_private_mapping(phys, size, prot, &haddr);
+
+       if (err)
+               haddr = (unsigned long)ERR_PTR(err);
+
+       cpu_reg(host_ctxt, 1) = haddr;
 }
 
 static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
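
Because the hypercall returns through a single register, the separate error and address outputs of __pkvm_create_private_mapping() are folded back into one value using the usual ERR_PTR()/IS_ERR_VALUE() encoding. A userspace re-implementation of that encoding, with stand-in helpers and a made-up address:

#include <stdio.h>

/*
 * Userspace stand-ins for the kernel's ERR_PTR()/IS_ERR_VALUE() helpers,
 * showing how an errno and a hyp VA share one register-sized return value.
 */
#define MAX_ERRNO 4095

static unsigned long encode(int err, unsigned long addr)
{
	return err ? (unsigned long)(long)err : addr;
}

static int is_err_value(unsigned long val)
{
	return val >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	unsigned long ok = encode(0, 0x40001000UL);	/* made-up hyp VA */
	unsigned long bad = encode(-12, 0);		/* -ENOMEM        */

	printf("ok : err=%d addr=%#lx\n", is_err_value(ok), ok);
	printf("bad: err=%d errno=%ld\n", is_err_value(bad), (long)bad);
	return 0;
}
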
index cdbe8e2..96193cb 100644 (file)
@@ -37,36 +37,60 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size,
        return err;
 }
 
-unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
-                                           enum kvm_pgtable_prot prot)
+/**
+ * pkvm_alloc_private_va_range - Allocates a private VA range.
+ * @size:      The size of the VA range to reserve.
+ * @haddr:     The hypervisor virtual start address of the allocation.
+ *
+ * The private virtual address (VA) range is allocated above __io_map_base
+ * and aligned based on the order of @size.
+ *
+ * Return: 0 on success or negative error code on failure.
+ */
+int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr)
 {
-       unsigned long addr;
-       int err;
+       unsigned long base, addr;
+       int ret = 0;
 
        hyp_spin_lock(&pkvm_pgd_lock);
 
-       size = PAGE_ALIGN(size + offset_in_page(phys));
-       addr = __io_map_base;
-       __io_map_base += size;
+       /* Align the allocation based on the order of its size */
+       addr = ALIGN(__io_map_base, PAGE_SIZE << get_order(size));
 
-       /* Are we overflowing on the vmemmap ? */
-       if (__io_map_base > __hyp_vmemmap) {
-               __io_map_base -= size;
-               addr = (unsigned long)ERR_PTR(-ENOMEM);
-               goto out;
-       }
+       /* The allocated size is always a multiple of PAGE_SIZE */
+       base = addr + PAGE_ALIGN(size);
 
-       err = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, size, phys, prot);
-       if (err) {
-               addr = (unsigned long)ERR_PTR(err);
-               goto out;
+       /* Are we overflowing on the vmemmap? */
+       if (!addr || base > __hyp_vmemmap)
+               ret = -ENOMEM;
+       else {
+               __io_map_base = base;
+               *haddr = addr;
        }
 
-       addr = addr + offset_in_page(phys);
-out:
        hyp_spin_unlock(&pkvm_pgd_lock);
 
-       return addr;
+       return ret;
+}
+
+int __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
+                                 enum kvm_pgtable_prot prot,
+                                 unsigned long *haddr)
+{
+       unsigned long addr;
+       int err;
+
+       size = PAGE_ALIGN(size + offset_in_page(phys));
+       err = pkvm_alloc_private_va_range(size, &addr);
+       if (err)
+               return err;
+
+       err = __pkvm_create_mappings(addr, size, phys, prot);
+       if (err)
+               return err;
+
+       *haddr = addr + offset_in_page(phys);
+       return 0;
 }
 
 int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot)
@@ -146,7 +170,8 @@ int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot)
 int hyp_map_vectors(void)
 {
        phys_addr_t phys;
-       void *bp_base;
+       unsigned long bp_base;
+       int ret;
 
        if (!kvm_system_needs_idmapped_vectors()) {
                __hyp_bp_vect_base = __bp_harden_hyp_vecs;
@@ -154,13 +179,12 @@ int hyp_map_vectors(void)
        }
 
        phys = __hyp_pa(__bp_harden_hyp_vecs);
-       bp_base = (void *)__pkvm_create_private_mapping(phys,
-                                                       __BP_HARDEN_HYP_VECS_SZ,
-                                                       PAGE_HYP_EXEC);
-       if (IS_ERR_OR_NULL(bp_base))
-               return PTR_ERR(bp_base);
+       ret = __pkvm_create_private_mapping(phys, __BP_HARDEN_HYP_VECS_SZ,
+                                           PAGE_HYP_EXEC, &bp_base);
+       if (ret)
+               return ret;
 
-       __hyp_bp_vect_base = bp_base;
+       __hyp_bp_vect_base = (void *)bp_base;
 
        return 0;
 }
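
pkvm_alloc_private_va_range() above is a bump allocator: round the cursor up to the order-of-size alignment, refuse to cross the vmemmap, then publish the new cursor. The toy below mimics that arithmetic with invented addresses, a local get_order() and a 64-bit build assumed; it only illustrates why a two-page request (stack plus guard page) comes back two-page aligned.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12		/* assumed 4K pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define PAGE_ALIGN(x)	ALIGN_UP(x, PAGE_SIZE)

/* Simplified get_order(): log2 of the number of pages, rounded up. */
static unsigned int get_order(unsigned long size)
{
	unsigned int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

/* Invented cursor and limit; only the arithmetic mirrors the kernel code. */
static unsigned long io_map_base = 0x100003000UL;
static const unsigned long hyp_vmemmap = 0x100080000UL;

static int alloc_private_va_range(unsigned long size, unsigned long *haddr)
{
	unsigned long addr = ALIGN_UP(io_map_base, PAGE_SIZE << get_order(size));
	unsigned long base = addr + PAGE_ALIGN(size);

	if (!addr || base > hyp_vmemmap)
		return -1;

	io_map_base = base;
	*haddr = addr;
	return 0;
}

int main(void)
{
	unsigned long va;

	if (!alloc_private_va_range(2 * PAGE_SIZE, &va))
		printf("stack+guard range at %#lx\n", va);
	return 0;
}
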
index 27af337..e8d4ea2 100644 (file)
@@ -99,17 +99,42 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
                return ret;
 
        for (i = 0; i < hyp_nr_cpus; i++) {
+               struct kvm_nvhe_init_params *params = per_cpu_ptr(&kvm_init_params, i);
+               unsigned long hyp_addr;
+
                start = (void *)kern_hyp_va(per_cpu_base[i]);
                end = start + PAGE_ALIGN(hyp_percpu_size);
                ret = pkvm_create_mappings(start, end, PAGE_HYP);
                if (ret)
                        return ret;
 
-               end = (void *)per_cpu_ptr(&kvm_init_params, i)->stack_hyp_va;
-               start = end - PAGE_SIZE;
-               ret = pkvm_create_mappings(start, end, PAGE_HYP);
+               /*
+                * Allocate a contiguous HYP private VA range for the stack
+                * and guard page. The allocation is also aligned based on
+                * the order of its size.
+                */
+               ret = pkvm_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr);
+               if (ret)
+                       return ret;
+
+               /*
+                * Since the stack grows downwards, map the stack to the page
+                * at the higher address and leave the lower guard page
+                * unbacked.
+                *
+                * Any valid stack address now has the PAGE_SHIFT bit as 1
+                * and addresses corresponding to the guard page have the
+                * PAGE_SHIFT bit as 0 - this is used for overflow detection.
+                */
+               hyp_spin_lock(&pkvm_pgd_lock);
+               ret = kvm_pgtable_hyp_map(&pkvm_pgtable, hyp_addr + PAGE_SIZE,
+                                       PAGE_SIZE, params->stack_pa, PAGE_HYP);
+               hyp_spin_unlock(&pkvm_pgd_lock);
                if (ret)
                        return ret;
+
+               /* Update stack_hyp_va to the end of the stack's private VA range */
+               params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE);
        }
 
        /*
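
The guard-page scheme described in the comments above can be checked with plain address arithmetic: only the top page of the order-aligned two-page range is backed, so bit PAGE_SHIFT of any in-bounds SP is 1, and the first access past the bottom of the stack lands on an address with that bit clear. A small demonstration, assuming 4K pages, a 64-bit build and an arbitrary base address:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long range = 0x100004000UL;		/* 2-page aligned   */
	unsigned long stack_top = range + 2 * PAGE_SIZE;/* stack_hyp_va     */
	unsigned long sp;

	/* Walk SP downwards until it falls off the backed page. */
	for (sp = stack_top - 16; sp >= range + PAGE_SIZE - 32; sp -= 512) {
		int on_guard = !((sp >> PAGE_SHIFT) & 1);

		printf("sp=%#lx %s\n", sp,
		       on_guard ? "guard page (overflow!)" : "valid stack");
	}
	return 0;
}
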
index caace61..6db801d 100644 (file)
@@ -150,16 +150,13 @@ static void __hyp_vgic_restore_state(struct kvm_vcpu *vcpu)
        }
 }
 
-/**
+/*
  * Disable host events, enable guest events
  */
-static bool __pmu_switch_to_guest(struct kvm_cpu_context *host_ctxt)
+#ifdef CONFIG_HW_PERF_EVENTS
+static bool __pmu_switch_to_guest(struct kvm_vcpu *vcpu)
 {
-       struct kvm_host_data *host;
-       struct kvm_pmu_events *pmu;
-
-       host = container_of(host_ctxt, struct kvm_host_data, host_ctxt);
-       pmu = &host->pmu_events;
+       struct kvm_pmu_events *pmu = &vcpu->arch.pmu.events;
 
        if (pmu->events_host)
                write_sysreg(pmu->events_host, pmcntenclr_el0);
@@ -170,16 +167,12 @@ static bool __pmu_switch_to_guest(struct kvm_cpu_context *host_ctxt)
        return (pmu->events_host || pmu->events_guest);
 }
 
-/**
+/*
  * Disable guest events, enable host events
  */
-static void __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt)
+static void __pmu_switch_to_host(struct kvm_vcpu *vcpu)
 {
-       struct kvm_host_data *host;
-       struct kvm_pmu_events *pmu;
-
-       host = container_of(host_ctxt, struct kvm_host_data, host_ctxt);
-       pmu = &host->pmu_events;
+       struct kvm_pmu_events *pmu = &vcpu->arch.pmu.events;
 
        if (pmu->events_guest)
                write_sysreg(pmu->events_guest, pmcntenclr_el0);
@@ -187,8 +180,12 @@ static void __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt)
        if (pmu->events_host)
                write_sysreg(pmu->events_host, pmcntenset_el0);
 }
+#else
+#define __pmu_switch_to_guest(v)       ({ false; })
+#define __pmu_switch_to_host(v)                do {} while (0)
+#endif
 
-/**
+/*
  * Handler for protected VM MSR, MRS or System instruction execution in AArch64.
  *
  * Returns true if the hypervisor has handled the exit, and control should go
@@ -205,23 +202,6 @@ static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code)
                kvm_handle_pvm_sysreg(vcpu, exit_code));
 }
 
-/**
- * Handler for protected floating-point and Advanced SIMD accesses.
- *
- * Returns true if the hypervisor has handled the exit, and control should go
- * back to the guest, or false if it hasn't.
- */
-static bool kvm_handle_pvm_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
-{
-       /* Linux guests assume support for floating-point and Advanced SIMD. */
-       BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_FP),
-                               PVM_ID_AA64PFR0_ALLOW));
-       BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD),
-                               PVM_ID_AA64PFR0_ALLOW));
-
-       return kvm_hyp_handle_fpsimd(vcpu, exit_code);
-}
-
 static const exit_handler_fn hyp_exit_handlers[] = {
        [0 ... ESR_ELx_EC_MAX]          = NULL,
        [ESR_ELx_EC_CP15_32]            = kvm_hyp_handle_cp15_32,
@@ -237,7 +217,7 @@ static const exit_handler_fn pvm_exit_handlers[] = {
        [0 ... ESR_ELx_EC_MAX]          = NULL,
        [ESR_ELx_EC_SYS64]              = kvm_handle_pvm_sys64,
        [ESR_ELx_EC_SVE]                = kvm_handle_pvm_restricted,
-       [ESR_ELx_EC_FP_ASIMD]           = kvm_handle_pvm_fpsimd,
+       [ESR_ELx_EC_FP_ASIMD]           = kvm_hyp_handle_fpsimd,
        [ESR_ELx_EC_IABT_LOW]           = kvm_hyp_handle_iabt_low,
        [ESR_ELx_EC_DABT_LOW]           = kvm_hyp_handle_dabt_low,
        [ESR_ELx_EC_PAC]                = kvm_hyp_handle_ptrauth,
@@ -304,7 +284,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
        host_ctxt->__hyp_running_vcpu = vcpu;
        guest_ctxt = &vcpu->arch.ctxt;
 
-       pmu_switch_needed = __pmu_switch_to_guest(host_ctxt);
+       pmu_switch_needed = __pmu_switch_to_guest(vcpu);
 
        __sysreg_save_state_nvhe(host_ctxt);
        /*
@@ -366,7 +346,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
        __debug_restore_host_buffers_nvhe(vcpu);
 
        if (pmu_switch_needed)
-               __pmu_switch_to_host(host_ctxt);
+               __pmu_switch_to_host(vcpu);
 
        /* Returning to host will clear PSR.I, remask PMR if needed */
        if (system_uses_irq_prio_masking())
@@ -377,7 +357,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
        return exit_code;
 }
 
-void __noreturn hyp_panic(void)
+asmlinkage void __noreturn hyp_panic(void)
 {
        u64 spsr = read_sysreg_el2(SYS_SPSR);
        u64 elr = read_sysreg_el2(SYS_ELR);
@@ -399,6 +379,11 @@ void __noreturn hyp_panic(void)
        unreachable();
 }
 
+asmlinkage void __noreturn hyp_panic_bad_stack(void)
+{
+       hyp_panic();
+}
+
 asmlinkage void kvm_unexpected_el2_exception(void)
 {
        return __kvm_unexpected_el2_exception();
index 619f94f..b6d86e4 100644 (file)
@@ -90,9 +90,6 @@ static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu)
        u64 set_mask = 0;
        u64 allow_mask = PVM_ID_AA64PFR0_ALLOW;
 
-       if (!vcpu_has_sve(vcpu))
-               allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_SVE);
-
        set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val,
                PVM_ID_AA64PFR0_RESTRICT_UNSIGNED);
 
index 202b8c4..c9f401f 100644 (file)
@@ -9,6 +9,13 @@
 #include <kvm/arm_hypercalls.h>
 #include <kvm/arm_psci.h>
 
+#define KVM_ARM_SMCCC_STD_FEATURES                             \
+       GENMASK(KVM_REG_ARM_STD_BMAP_BIT_COUNT - 1, 0)
+#define KVM_ARM_SMCCC_STD_HYP_FEATURES                         \
+       GENMASK(KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT - 1, 0)
+#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES                      \
+       GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0)
+
 static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
 {
        struct system_time_snapshot systime_snapshot;
@@ -58,13 +65,73 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
        val[3] = lower_32_bits(cycles);
 }
 
+static bool kvm_hvc_call_default_allowed(u32 func_id)
+{
+       switch (func_id) {
+       /*
+        * List of function-ids that are not gated with the bitmapped
+        * feature firmware registers, and are to be allowed for
+        * servicing the call by default.
+        */
+       case ARM_SMCCC_VERSION_FUNC_ID:
+       case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
+               return true;
+       default:
+               /* PSCI 0.2 and up is in the 0:0x1f range */
+               if (ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD &&
+                   ARM_SMCCC_FUNC_NUM(func_id) <= 0x1f)
+                       return true;
+
+               /*
+                * KVM's PSCI 0.1 doesn't comply with SMCCC, and has
+                * its own function-id base and range
+                */
+               if (func_id >= KVM_PSCI_FN(0) && func_id <= KVM_PSCI_FN(3))
+                       return true;
+
+               return false;
+       }
+}
+
+static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id)
+{
+       struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat;
+
+       switch (func_id) {
+       case ARM_SMCCC_TRNG_VERSION:
+       case ARM_SMCCC_TRNG_FEATURES:
+       case ARM_SMCCC_TRNG_GET_UUID:
+       case ARM_SMCCC_TRNG_RND32:
+       case ARM_SMCCC_TRNG_RND64:
+               return test_bit(KVM_REG_ARM_STD_BIT_TRNG_V1_0,
+                               &smccc_feat->std_bmap);
+       case ARM_SMCCC_HV_PV_TIME_FEATURES:
+       case ARM_SMCCC_HV_PV_TIME_ST:
+               return test_bit(KVM_REG_ARM_STD_HYP_BIT_PV_TIME,
+                               &smccc_feat->std_hyp_bmap);
+       case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
+       case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
+               return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT,
+                               &smccc_feat->vendor_hyp_bmap);
+       case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
+               return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_PTP,
+                               &smccc_feat->vendor_hyp_bmap);
+       default:
+               return kvm_hvc_call_default_allowed(func_id);
+       }
+}
+
 int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 {
+       struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat;
        u32 func_id = smccc_get_function(vcpu);
        u64 val[4] = {SMCCC_RET_NOT_SUPPORTED};
        u32 feature;
        gpa_t gpa;
 
+       if (!kvm_hvc_call_allowed(vcpu, func_id))
+               goto out;
+
        switch (func_id) {
        case ARM_SMCCC_VERSION_FUNC_ID:
                val[0] = ARM_SMCCC_VERSION_1_1;
@@ -120,7 +187,9 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
                        }
                        break;
                case ARM_SMCCC_HV_PV_TIME_FEATURES:
-                       val[0] = SMCCC_RET_SUCCESS;
+                       if (test_bit(KVM_REG_ARM_STD_HYP_BIT_PV_TIME,
+                                    &smccc_feat->std_hyp_bmap))
+                               val[0] = SMCCC_RET_SUCCESS;
                        break;
                }
                break;
@@ -139,8 +208,7 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
                val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3;
                break;
        case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
-               val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
-               val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP);
+               val[0] = smccc_feat->vendor_hyp_bmap;
                break;
        case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
                kvm_ptp_get_time(vcpu, val);
@@ -155,6 +223,259 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
                return kvm_psci_call(vcpu);
        }
 
+out:
        smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
        return 1;
 }
+
+static const u64 kvm_arm_fw_reg_ids[] = {
+       KVM_REG_ARM_PSCI_VERSION,
+       KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1,
+       KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2,
+       KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3,
+       KVM_REG_ARM_STD_BMAP,
+       KVM_REG_ARM_STD_HYP_BMAP,
+       KVM_REG_ARM_VENDOR_HYP_BMAP,
+};
+
+void kvm_arm_init_hypercalls(struct kvm *kvm)
+{
+       struct kvm_smccc_features *smccc_feat = &kvm->arch.smccc_feat;
+
+       smccc_feat->std_bmap = KVM_ARM_SMCCC_STD_FEATURES;
+       smccc_feat->std_hyp_bmap = KVM_ARM_SMCCC_STD_HYP_FEATURES;
+       smccc_feat->vendor_hyp_bmap = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES;
+}
+
+int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
+{
+       return ARRAY_SIZE(kvm_arm_fw_reg_ids);
+}
+
+int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(kvm_arm_fw_reg_ids); i++) {
+               if (put_user(kvm_arm_fw_reg_ids[i], uindices++))
+                       return -EFAULT;
+       }
+
+       return 0;
+}
+
+#define KVM_REG_FEATURE_LEVEL_MASK     GENMASK(3, 0)
+
+/*
+ * Convert the workaround level into an easy-to-compare number, where higher
+ * values mean better protection.
+ */
+static int get_kernel_wa_level(u64 regid)
+{
+       switch (regid) {
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+               switch (arm64_get_spectre_v2_state()) {
+               case SPECTRE_VULNERABLE:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
+               case SPECTRE_MITIGATED:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL;
+               case SPECTRE_UNAFFECTED:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED;
+               }
+               return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+               switch (arm64_get_spectre_v4_state()) {
+               case SPECTRE_MITIGATED:
+                       /*
+                        * As with hypercall discovery, we always pretend
+                        * there is no FW mitigation when SSBS is present.
+                        */
+                       if (cpus_have_final_cap(ARM64_SSBS))
+                               return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
+                       fallthrough;
+               case SPECTRE_UNAFFECTED:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
+               case SPECTRE_VULNERABLE:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
+               }
+               break;
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3:
+               switch (arm64_get_spectre_bhb_state()) {
+               case SPECTRE_VULNERABLE:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL;
+               case SPECTRE_MITIGATED:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL;
+               case SPECTRE_UNAFFECTED:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED;
+               }
+               return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL;
+       }
+
+       return -EINVAL;
+}
+
+int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+       struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat;
+       void __user *uaddr = (void __user *)(long)reg->addr;
+       u64 val;
+
+       switch (reg->id) {
+       case KVM_REG_ARM_PSCI_VERSION:
+               val = kvm_psci_version(vcpu);
+               break;
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3:
+               val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
+               break;
+       case KVM_REG_ARM_STD_BMAP:
+               val = READ_ONCE(smccc_feat->std_bmap);
+               break;
+       case KVM_REG_ARM_STD_HYP_BMAP:
+               val = READ_ONCE(smccc_feat->std_hyp_bmap);
+               break;
+       case KVM_REG_ARM_VENDOR_HYP_BMAP:
+               val = READ_ONCE(smccc_feat->vendor_hyp_bmap);
+               break;
+       default:
+               return -ENOENT;
+       }
+
+       if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)))
+               return -EFAULT;
+
+       return 0;
+}
+
+static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val)
+{
+       int ret = 0;
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_smccc_features *smccc_feat = &kvm->arch.smccc_feat;
+       unsigned long *fw_reg_bmap, fw_reg_features;
+
+       switch (reg_id) {
+       case KVM_REG_ARM_STD_BMAP:
+               fw_reg_bmap = &smccc_feat->std_bmap;
+               fw_reg_features = KVM_ARM_SMCCC_STD_FEATURES;
+               break;
+       case KVM_REG_ARM_STD_HYP_BMAP:
+               fw_reg_bmap = &smccc_feat->std_hyp_bmap;
+               fw_reg_features = KVM_ARM_SMCCC_STD_HYP_FEATURES;
+               break;
+       case KVM_REG_ARM_VENDOR_HYP_BMAP:
+               fw_reg_bmap = &smccc_feat->vendor_hyp_bmap;
+               fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES;
+               break;
+       default:
+               return -ENOENT;
+       }
+
+       /* Check for unsupported bit */
+       if (val & ~fw_reg_features)
+               return -EINVAL;
+
+       mutex_lock(&kvm->lock);
+
+       if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags) &&
+           val != *fw_reg_bmap) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       WRITE_ONCE(*fw_reg_bmap, val);
+out:
+       mutex_unlock(&kvm->lock);
+       return ret;
+}
+
+int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+       void __user *uaddr = (void __user *)(long)reg->addr;
+       u64 val;
+       int wa_level;
+
+       if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)))
+               return -EFAULT;
+
+       switch (reg->id) {
+       case KVM_REG_ARM_PSCI_VERSION:
+       {
+               bool wants_02;
+
+               wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features);
+
+               switch (val) {
+               case KVM_ARM_PSCI_0_1:
+                       if (wants_02)
+                               return -EINVAL;
+                       vcpu->kvm->arch.psci_version = val;
+                       return 0;
+               case KVM_ARM_PSCI_0_2:
+               case KVM_ARM_PSCI_1_0:
+               case KVM_ARM_PSCI_1_1:
+                       if (!wants_02)
+                               return -EINVAL;
+                       vcpu->kvm->arch.psci_version = val;
+                       return 0;
+               }
+               break;
+       }
+
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3:
+               if (val & ~KVM_REG_FEATURE_LEVEL_MASK)
+                       return -EINVAL;
+
+               if (get_kernel_wa_level(reg->id) < val)
+                       return -EINVAL;
+
+               return 0;
+
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+               if (val & ~(KVM_REG_FEATURE_LEVEL_MASK |
+                           KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED))
+                       return -EINVAL;
+
+               /* The enabled bit must not be set unless the level is AVAIL. */
+               if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) &&
+                   (val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL)
+                       return -EINVAL;
+
+               /*
+                * Map all the possible incoming states to the only two we
+                * really want to deal with.
+                */
+               switch (val & KVM_REG_FEATURE_LEVEL_MASK) {
+               case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
+               case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
+                       wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
+                       break;
+               case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
+               case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
+                       wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
+                       break;
+               default:
+                       return -EINVAL;
+               }
+
+               /*
+                * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the
+                * other way around.
+                */
+               if (get_kernel_wa_level(reg->id) < wa_level)
+                       return -EINVAL;
+
+               return 0;
+       case KVM_REG_ARM_STD_BMAP:
+       case KVM_REG_ARM_STD_HYP_BMAP:
+       case KVM_REG_ARM_VENDOR_HYP_BMAP:
+               return kvm_arm_set_fw_reg_bmap(vcpu, reg->id, val);
+       default:
+               return -ENOENT;
+       }
+
+       return -EINVAL;
+}
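
The firmware-register plumbing above gates each SMCCC function on a per-VM feature bitmap and lets userspace clear, but never invent, bits in that bitmap before the VM runs. The sketch below models only that pattern, with a made-up single-bit "standard" bitmap and a 64-bit unsigned long assumed; the real masks are built from the KVM_REG_ARM_*_BMAP_BIT_COUNT enums, and the real setter also takes kvm->lock and honours KVM_ARCH_FLAG_HAS_RAN_ONCE.

#include <stdbool.h>
#include <stdio.h>

#define GENMASK(h, l)	(((~0UL) << (l)) & (~0UL >> (63 - (h))))

enum { STD_BIT_TRNG = 0, STD_BIT_COUNT };
#define STD_FEATURES	GENMASK(STD_BIT_COUNT - 1, 0)

static unsigned long std_bmap = STD_FEATURES;	/* everything on by default */

/* Gate an (invented) TRNG-class call on the bitmap; others pass through. */
static bool call_allowed(int func_is_trng)
{
	if (func_is_trng)
		return std_bmap & (1UL << STD_BIT_TRNG);
	return true;
}

/* Reject attempts to set bits the host does not advertise. */
static int set_std_bmap(unsigned long val)
{
	if (val & ~STD_FEATURES)
		return -1;
	std_bmap = val;
	return 0;
}

int main(void)
{
	printf("TRNG allowed: %d\n", call_allowed(1));
	set_std_bmap(0);				/* userspace hides TRNG */
	printf("TRNG allowed: %d\n", call_allowed(1));
	printf("bogus bitmap rejected: %d\n", set_std_bmap(1UL << 5));
	return 0;
}
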
index 5400fc0..f5651a0 100644 (file)
@@ -258,8 +258,8 @@ static bool kvm_host_owns_hyp_mappings(void)
        return true;
 }
 
-static int __create_hyp_mappings(unsigned long start, unsigned long size,
-                                unsigned long phys, enum kvm_pgtable_prot prot)
+int __create_hyp_mappings(unsigned long start, unsigned long size,
+                         unsigned long phys, enum kvm_pgtable_prot prot)
 {
        int err;
 
@@ -457,23 +457,22 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
        return 0;
 }
 
-static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
-                                       unsigned long *haddr,
-                                       enum kvm_pgtable_prot prot)
+
+/**
+ * hyp_alloc_private_va_range - Allocates a private VA range.
+ * @size:      The size of the VA range to reserve.
+ * @haddr:     The hypervisor virtual start address of the allocation.
+ *
+ * The private virtual address (VA) range is allocated below io_map_base
+ * and aligned based on the order of @size.
+ *
+ * Return: 0 on success or negative error code on failure.
+ */
+int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
 {
        unsigned long base;
        int ret = 0;
 
-       if (!kvm_host_owns_hyp_mappings()) {
-               base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
-                                        phys_addr, size, prot);
-               if (IS_ERR_OR_NULL((void *)base))
-                       return PTR_ERR((void *)base);
-               *haddr = base;
-
-               return 0;
-       }
-
        mutex_lock(&kvm_hyp_pgd_mutex);
 
        /*
@@ -484,8 +483,10 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
         *
         * The allocated size is always a multiple of PAGE_SIZE.
         */
-       size = PAGE_ALIGN(size + offset_in_page(phys_addr));
-       base = io_map_base - size;
+       base = io_map_base - PAGE_ALIGN(size);
+
+       /* Align the allocation based on the order of its size */
+       base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
 
        /*
         * Verify that BIT(VA_BITS - 1) hasn't been flipped by
@@ -495,19 +496,40 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
        if ((base ^ io_map_base) & BIT(VA_BITS - 1))
                ret = -ENOMEM;
        else
-               io_map_base = base;
+               *haddr = io_map_base = base;
 
        mutex_unlock(&kvm_hyp_pgd_mutex);
 
+       return ret;
+}
+
+static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
+                                       unsigned long *haddr,
+                                       enum kvm_pgtable_prot prot)
+{
+       unsigned long addr;
+       int ret = 0;
+
+       if (!kvm_host_owns_hyp_mappings()) {
+               addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
+                                        phys_addr, size, prot);
+               if (IS_ERR_VALUE(addr))
+                       return addr;
+               *haddr = addr;
+
+               return 0;
+       }
+
+       size = PAGE_ALIGN(size + offset_in_page(phys_addr));
+       ret = hyp_alloc_private_va_range(size, &addr);
        if (ret)
-               goto out;
+               return ret;
 
-       ret = __create_hyp_mappings(base, size, phys_addr, prot);
+       ret = __create_hyp_mappings(addr, size, phys_addr, prot);
        if (ret)
-               goto out;
+               return ret;
 
-       *haddr = base + offset_in_page(phys_addr);
-out:
+       *haddr = addr + offset_in_page(phys_addr);
        return ret;
 }
 
index 3dc990a..11c43be 100644 (file)
@@ -774,8 +774,7 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
 {
        struct arm_pmu_entry *entry;
 
-       if (pmu->pmuver == 0 || pmu->pmuver == ID_AA64DFR0_PMUVER_IMP_DEF ||
-           is_protected_kvm_enabled())
+       if (pmu->pmuver == 0 || pmu->pmuver == ID_AA64DFR0_PMUVER_IMP_DEF)
                return;
 
        mutex_lock(&arm_pmus_lock);
index 03a6c1f..7887133 100644 (file)
@@ -5,7 +5,8 @@
  */
 #include <linux/kvm_host.h>
 #include <linux/perf_event.h>
-#include <asm/kvm_hyp.h>
+
+static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events);
 
 /*
  * Given the perf event attributes and system type, determine
@@ -25,21 +26,26 @@ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr)
        return (attr->exclude_host != attr->exclude_guest);
 }
 
+struct kvm_pmu_events *kvm_get_pmu_events(void)
+{
+       return this_cpu_ptr(&kvm_pmu_events);
+}
+
 /*
  * Add events to track that we may want to switch at guest entry/exit
  * time.
  */
 void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
 {
-       struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
+       struct kvm_pmu_events *pmu = kvm_get_pmu_events();
 
-       if (!kvm_arm_support_pmu_v3() || !ctx || !kvm_pmu_switch_needed(attr))
+       if (!kvm_arm_support_pmu_v3() || !pmu || !kvm_pmu_switch_needed(attr))
                return;
 
        if (!attr->exclude_host)
-               ctx->pmu_events.events_host |= set;
+               pmu->events_host |= set;
        if (!attr->exclude_guest)
-               ctx->pmu_events.events_guest |= set;
+               pmu->events_guest |= set;
 }
 
 /*
@@ -47,13 +53,13 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
  */
 void kvm_clr_pmu_events(u32 clr)
 {
-       struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
+       struct kvm_pmu_events *pmu = kvm_get_pmu_events();
 
-       if (!kvm_arm_support_pmu_v3() || !ctx)
+       if (!kvm_arm_support_pmu_v3() || !pmu)
                return;
 
-       ctx->pmu_events.events_host &= ~clr;
-       ctx->pmu_events.events_guest &= ~clr;
+       pmu->events_host &= ~clr;
+       pmu->events_guest &= ~clr;
 }
 
 #define PMEVTYPER_READ_CASE(idx)                               \
@@ -169,16 +175,16 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events)
  */
 void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
 {
-       struct kvm_host_data *host;
+       struct kvm_pmu_events *pmu;
        u32 events_guest, events_host;
 
        if (!kvm_arm_support_pmu_v3() || !has_vhe())
                return;
 
        preempt_disable();
-       host = this_cpu_ptr_hyp_sym(kvm_host_data);
-       events_guest = host->pmu_events.events_guest;
-       events_host = host->pmu_events.events_host;
+       pmu = kvm_get_pmu_events();
+       events_guest = pmu->events_guest;
+       events_host = pmu->events_host;
 
        kvm_vcpu_pmu_enable_el0(events_guest);
        kvm_vcpu_pmu_disable_el0(events_host);
@@ -190,15 +196,15 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
  */
 void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
 {
-       struct kvm_host_data *host;
+       struct kvm_pmu_events *pmu;
        u32 events_guest, events_host;
 
        if (!kvm_arm_support_pmu_v3() || !has_vhe())
                return;
 
-       host = this_cpu_ptr_hyp_sym(kvm_host_data);
-       events_guest = host->pmu_events.events_guest;
-       events_host = host->pmu_events.events_host;
+       pmu = kvm_get_pmu_events();
+       events_guest = pmu->events_guest;
+       events_host = pmu->events_host;
 
        kvm_vcpu_pmu_enable_el0(events_host);
        kvm_vcpu_pmu_disable_el0(events_guest);
index 708d80e..7fbc4c1 100644 (file)
@@ -51,13 +51,6 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
        return PSCI_RET_SUCCESS;
 }
 
-static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.power_off = true;
-       kvm_make_request(KVM_REQ_SLEEP, vcpu);
-       kvm_vcpu_kick(vcpu);
-}
-
 static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu,
                                           unsigned long affinity)
 {
@@ -83,7 +76,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
         */
        if (!vcpu)
                return PSCI_RET_INVALID_PARAMS;
-       if (!vcpu->arch.power_off) {
+       if (!kvm_arm_vcpu_stopped(vcpu)) {
                if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
                        return PSCI_RET_ALREADY_ON;
                else
@@ -107,12 +100,12 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
        kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);
 
        /*
-        * Make sure the reset request is observed if the change to
-        * power_off is observed.
+        * Make sure the reset request is observed if the RUNNABLE mp_state is
+        * observed.
         */
        smp_wmb();
 
-       vcpu->arch.power_off = false;
+       vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;
        kvm_vcpu_wake_up(vcpu);
 
        return PSCI_RET_SUCCESS;
@@ -150,7 +143,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
                mpidr = kvm_vcpu_get_mpidr_aff(tmp);
                if ((mpidr & target_affinity_mask) == target_affinity) {
                        matching_cpus++;
-                       if (!tmp->arch.power_off)
+                       if (!kvm_arm_vcpu_stopped(tmp))
                                return PSCI_0_2_AFFINITY_LEVEL_ON;
                }
        }
@@ -176,7 +169,7 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags)
         * re-initialized.
         */
        kvm_for_each_vcpu(i, tmp, vcpu->kvm)
-               tmp->arch.power_off = true;
+               tmp->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
        kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
 
        memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
@@ -202,6 +195,15 @@ static void kvm_psci_system_reset2(struct kvm_vcpu *vcpu)
                                 KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2);
 }
 
+static void kvm_psci_system_suspend(struct kvm_vcpu *vcpu)
+{
+       struct kvm_run *run = vcpu->run;
+
+       memset(&run->system_event, 0, sizeof(run->system_event));
+       run->system_event.type = KVM_SYSTEM_EVENT_SUSPEND;
+       run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+}
+
 static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
 {
        int i;
@@ -245,7 +247,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
                val = kvm_psci_vcpu_suspend(vcpu);
                break;
        case PSCI_0_2_FN_CPU_OFF:
-               kvm_psci_vcpu_off(vcpu);
+               kvm_arm_vcpu_power_off(vcpu);
                val = PSCI_RET_SUCCESS;
                break;
        case PSCI_0_2_FN_CPU_ON:
@@ -305,9 +307,10 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
 
 static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
 {
+       unsigned long val = PSCI_RET_NOT_SUPPORTED;
        u32 psci_fn = smccc_get_function(vcpu);
+       struct kvm *kvm = vcpu->kvm;
        u32 arg;
-       unsigned long val;
        int ret = 1;
 
        switch(psci_fn) {
@@ -320,6 +323,8 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
                if (val)
                        break;
 
+               val = PSCI_RET_NOT_SUPPORTED;
+
                switch(arg) {
                case PSCI_0_2_FN_PSCI_VERSION:
                case PSCI_0_2_FN_CPU_SUSPEND:
@@ -336,18 +341,32 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
                case ARM_SMCCC_VERSION_FUNC_ID:
                        val = 0;
                        break;
+               case PSCI_1_0_FN_SYSTEM_SUSPEND:
+               case PSCI_1_0_FN64_SYSTEM_SUSPEND:
+                       if (test_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags))
+                               val = 0;
+                       break;
                case PSCI_1_1_FN_SYSTEM_RESET2:
                case PSCI_1_1_FN64_SYSTEM_RESET2:
-                       if (minor >= 1) {
+                       if (minor >= 1)
                                val = 0;
-                               break;
-                       }
-                       fallthrough;
-               default:
-                       val = PSCI_RET_NOT_SUPPORTED;
                        break;
                }
                break;
+       case PSCI_1_0_FN_SYSTEM_SUSPEND:
+               kvm_psci_narrow_to_32bit(vcpu);
+               fallthrough;
+       case PSCI_1_0_FN64_SYSTEM_SUSPEND:
+               /*
+                * Return directly to userspace without changing the vCPU's
+                * registers. Userspace depends on reading the SMCCC parameters
+                * to implement SYSTEM_SUSPEND.
+                */
+               if (test_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags)) {
+                       kvm_psci_system_suspend(vcpu);
+                       return 0;
+               }
+               break;
        case PSCI_1_1_FN_SYSTEM_RESET2:
                kvm_psci_narrow_to_32bit(vcpu);
                fallthrough;
@@ -365,7 +384,7 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
                        val = PSCI_RET_INVALID_PARAMS;
                        break;
                }
-               fallthrough;
+               break;
        default:
                return kvm_psci_0_2_call(vcpu);
        }
@@ -382,7 +401,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
 
        switch (psci_fn) {
        case KVM_PSCI_FN_CPU_OFF:
-               kvm_psci_vcpu_off(vcpu);
+               kvm_arm_vcpu_power_off(vcpu);
                val = PSCI_RET_SUCCESS;
                break;
        case KVM_PSCI_FN_CPU_ON:
@@ -437,186 +456,3 @@ int kvm_psci_call(struct kvm_vcpu *vcpu)
                return -EINVAL;
        }
 }
-
-int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
-{
-       return 4;               /* PSCI version and three workaround registers */
-}
-
-int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
-{
-       if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices++))
-               return -EFAULT;
-
-       if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, uindices++))
-               return -EFAULT;
-
-       if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++))
-               return -EFAULT;
-
-       if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3, uindices++))
-               return -EFAULT;
-
-       return 0;
-}
-
-#define KVM_REG_FEATURE_LEVEL_WIDTH    4
-#define KVM_REG_FEATURE_LEVEL_MASK     (BIT(KVM_REG_FEATURE_LEVEL_WIDTH) - 1)
-
-/*
- * Convert the workaround level into an easy-to-compare number, where higher
- * values mean better protection.
- */
-static int get_kernel_wa_level(u64 regid)
-{
-       switch (regid) {
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
-               switch (arm64_get_spectre_v2_state()) {
-               case SPECTRE_VULNERABLE:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
-               case SPECTRE_MITIGATED:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL;
-               case SPECTRE_UNAFFECTED:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED;
-               }
-               return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
-               switch (arm64_get_spectre_v4_state()) {
-               case SPECTRE_MITIGATED:
-                       /*
-                        * As for the hypercall discovery, we pretend we
-                        * don't have any FW mitigation if SSBS is there at
-                        * all times.
-                        */
-                       if (cpus_have_final_cap(ARM64_SSBS))
-                               return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
-                       fallthrough;
-               case SPECTRE_UNAFFECTED:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
-               case SPECTRE_VULNERABLE:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
-               }
-               break;
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3:
-               switch (arm64_get_spectre_bhb_state()) {
-               case SPECTRE_VULNERABLE:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL;
-               case SPECTRE_MITIGATED:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL;
-               case SPECTRE_UNAFFECTED:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED;
-               }
-               return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL;
-       }
-
-       return -EINVAL;
-}
-
-int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
-{
-       void __user *uaddr = (void __user *)(long)reg->addr;
-       u64 val;
-
-       switch (reg->id) {
-       case KVM_REG_ARM_PSCI_VERSION:
-               val = kvm_psci_version(vcpu);
-               break;
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3:
-               val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
-               break;
-       default:
-               return -ENOENT;
-       }
-
-       if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)))
-               return -EFAULT;
-
-       return 0;
-}
-
-int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
-{
-       void __user *uaddr = (void __user *)(long)reg->addr;
-       u64 val;
-       int wa_level;
-
-       if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)))
-               return -EFAULT;
-
-       switch (reg->id) {
-       case KVM_REG_ARM_PSCI_VERSION:
-       {
-               bool wants_02;
-
-               wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features);
-
-               switch (val) {
-               case KVM_ARM_PSCI_0_1:
-                       if (wants_02)
-                               return -EINVAL;
-                       vcpu->kvm->arch.psci_version = val;
-                       return 0;
-               case KVM_ARM_PSCI_0_2:
-               case KVM_ARM_PSCI_1_0:
-               case KVM_ARM_PSCI_1_1:
-                       if (!wants_02)
-                               return -EINVAL;
-                       vcpu->kvm->arch.psci_version = val;
-                       return 0;
-               }
-               break;
-       }
-
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3:
-               if (val & ~KVM_REG_FEATURE_LEVEL_MASK)
-                       return -EINVAL;
-
-               if (get_kernel_wa_level(reg->id) < val)
-                       return -EINVAL;
-
-               return 0;
-
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
-               if (val & ~(KVM_REG_FEATURE_LEVEL_MASK |
-                           KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED))
-                       return -EINVAL;
-
-               /* The enabled bit must not be set unless the level is AVAIL. */
-               if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) &&
-                   (val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL)
-                       return -EINVAL;
-
-               /*
-                * Map all the possible incoming states to the only two we
-                * really want to deal with.
-                */
-               switch (val & KVM_REG_FEATURE_LEVEL_MASK) {
-               case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
-               case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
-                       wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
-                       break;
-               case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
-               case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
-                       wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
-                       break;
-               default:
-                       return -EINVAL;
-               }
-
-               /*
-                * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the
-                * other way around.
-                */
-               if (get_kernel_wa_level(reg->id) < wa_level)
-                       return -EINVAL;
-
-               return 0;
-       default:
-               return -ENOENT;
-       }
-
-       return -EINVAL;
-}
index 18b403b..c06c047 100644 (file)
@@ -1145,6 +1145,8 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
                if (!vcpu_has_ptrauth(vcpu))
                        val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_APA3) |
                                 ARM64_FEATURE_MASK(ID_AA64ISAR2_GPA3));
+               if (!cpus_have_final_cap(ARM64_HAS_WFXT))
+                       val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_WFXT);
                break;
        case SYS_ID_AA64DFR0_EL1:
                /* Limit debug to ARMv8.0 */
@@ -2020,20 +2022,22 @@ static const struct sys_reg_desc cp14_64_regs[] = {
        { Op1( 0), CRm( 2), .access = trap_raz_wi },
 };
 
+#define CP15_PMU_SYS_REG(_map, _Op1, _CRn, _CRm, _Op2)                 \
+       AA32(_map),                                                     \
+       Op1(_Op1), CRn(_CRn), CRm(_CRm), Op2(_Op2),                     \
+       .visibility = pmu_visibility
+
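/*
 * For reference only (not part of the patch): with the helper above, a
 * table entry such as
 *
 *	{ CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 0), .access = access_pmcr }
 *
 * expands to
 *
 *	{ AA32(DIRECT), Op1(0), CRn(9), CRm(12), Op2(0),
 *	  .visibility = pmu_visibility, .access = access_pmcr }
 *
 * so every cp15 PMU entry picks up pmu_visibility without repeating it.
 */
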
 /* Macro to expand the PMEVCNTRn register */
 #define PMU_PMEVCNTR(n)                                                        \
-       /* PMEVCNTRn */                                                 \
-       { Op1(0), CRn(0b1110),                                          \
-         CRm((0b1000 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)),         \
-         access_pmu_evcntr }
+       { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110,                           \
+         (0b1000 | (((n) >> 3) & 0x3)), ((n) & 0x7)),                  \
+         .access = access_pmu_evcntr }
 
 /* Macro to expand the PMEVTYPERn register */
 #define PMU_PMEVTYPER(n)                                               \
-       /* PMEVTYPERn */                                                \
-       { Op1(0), CRn(0b1110),                                          \
-         CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)),         \
-         access_pmu_evtyper }
-
+       { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110,                           \
+         (0b1100 | (((n) >> 3) & 0x3)), ((n) & 0x7)),                  \
+         .access = access_pmu_evtyper }
 /*
  * Trapped cp15 registers. TTBR0/TTBR1 get a double encoding,
  * depending on the way they are accessed (as a 32bit or a 64bit
@@ -2073,25 +2077,25 @@ static const struct sys_reg_desc cp15_regs[] = {
        { Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw },
 
        /* PMU */
-       { Op1( 0), CRn( 9), CRm(12), Op2( 0), access_pmcr },
-       { Op1( 0), CRn( 9), CRm(12), Op2( 1), access_pmcnten },
-       { Op1( 0), CRn( 9), CRm(12), Op2( 2), access_pmcnten },
-       { Op1( 0), CRn( 9), CRm(12), Op2( 3), access_pmovs },
-       { Op1( 0), CRn( 9), CRm(12), Op2( 4), access_pmswinc },
-       { Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmselr },
-       { AA32(LO), Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmceid },
-       { AA32(LO), Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmceid },
-       { Op1( 0), CRn( 9), CRm(13), Op2( 0), access_pmu_evcntr },
-       { Op1( 0), CRn( 9), CRm(13), Op2( 1), access_pmu_evtyper },
-       { Op1( 0), CRn( 9), CRm(13), Op2( 2), access_pmu_evcntr },
-       { Op1( 0), CRn( 9), CRm(14), Op2( 0), access_pmuserenr },
-       { Op1( 0), CRn( 9), CRm(14), Op2( 1), access_pminten },
-       { Op1( 0), CRn( 9), CRm(14), Op2( 2), access_pminten },
-       { Op1( 0), CRn( 9), CRm(14), Op2( 3), access_pmovs },
-       { AA32(HI), Op1( 0), CRn( 9), CRm(14), Op2( 4), access_pmceid },
-       { AA32(HI), Op1( 0), CRn( 9), CRm(14), Op2( 5), access_pmceid },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 0), .access = access_pmcr },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 1), .access = access_pmcnten },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 2), .access = access_pmcnten },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 3), .access = access_pmovs },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 4), .access = access_pmswinc },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 5), .access = access_pmselr },
+       { CP15_PMU_SYS_REG(LO,     0, 9, 12, 6), .access = access_pmceid },
+       { CP15_PMU_SYS_REG(LO,     0, 9, 12, 7), .access = access_pmceid },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 0), .access = access_pmu_evcntr },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 1), .access = access_pmu_evtyper },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 2), .access = access_pmu_evcntr },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 0), .access = access_pmuserenr },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 1), .access = access_pminten },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 2), .access = access_pminten },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 3), .access = access_pmovs },
+       { CP15_PMU_SYS_REG(HI,     0, 9, 14, 4), .access = access_pmceid },
+       { CP15_PMU_SYS_REG(HI,     0, 9, 14, 5), .access = access_pmceid },
        /* PMMIR */
-       { Op1( 0), CRn( 9), CRm(14), Op2( 6), trap_raz_wi },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 6), .access = trap_raz_wi },
 
        /* PRRR/MAIR0 */
        { AA32(LO), Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, MAIR_EL1 },
@@ -2176,7 +2180,7 @@ static const struct sys_reg_desc cp15_regs[] = {
        PMU_PMEVTYPER(29),
        PMU_PMEVTYPER(30),
        /* PMCCFILTR */
-       { Op1(0), CRn(14), CRm(15), Op2(7), access_pmu_evtyper },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 14, 15, 7), .access = access_pmu_evtyper },
 
        { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr },
        { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr },
@@ -2185,7 +2189,7 @@ static const struct sys_reg_desc cp15_regs[] = {
 
 static const struct sys_reg_desc cp15_64_regs[] = {
        { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 },
-       { Op1( 0), CRn( 0), CRm( 9), Op2( 0), access_pmu_evcntr },
+       { CP15_PMU_SYS_REG(DIRECT, 0, 0, 9, 0), .access = access_pmu_evcntr },
        { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */
        { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 },
        { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */
@@ -2193,25 +2197,24 @@ static const struct sys_reg_desc cp15_64_regs[] = {
        { SYS_DESC(SYS_AARCH32_CNTP_CVAL),    access_arch_timer },
 };
 
-static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
-                             bool is_32)
+static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
+                              bool is_32)
 {
        unsigned int i;
 
        for (i = 0; i < n; i++) {
                if (!is_32 && table[i].reg && !table[i].reset) {
-                       kvm_err("sys_reg table %p entry %d has lacks reset\n",
-                               table, i);
-                       return 1;
+                       kvm_err("sys_reg table %pS entry %d lacks reset\n", &table[i], i);
+                       return false;
                }
 
                if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) {
-                       kvm_err("sys_reg table %p out of order (%d)\n", table, i - 1);
-                       return 1;
+                       kvm_err("sys_reg table %pS entry %d out of order\n", &table[i - 1], i - 1);
+                       return false;
                }
        }
 
-       return 0;
+       return true;
 }
 
 int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu)
@@ -2252,27 +2255,27 @@ static void perform_access(struct kvm_vcpu *vcpu,
  * @table: array of trap descriptors
  * @num: size of the trap descriptor array
  *
- * Return 0 if the access has been handled, and -1 if not.
+ * Return true if the access has been handled, false if not.
  */
-static int emulate_cp(struct kvm_vcpu *vcpu,
-                     struct sys_reg_params *params,
-                     const struct sys_reg_desc *table,
-                     size_t num)
+static bool emulate_cp(struct kvm_vcpu *vcpu,
+                      struct sys_reg_params *params,
+                      const struct sys_reg_desc *table,
+                      size_t num)
 {
        const struct sys_reg_desc *r;
 
        if (!table)
-               return -1;      /* Not handled */
+               return false;   /* Not handled */
 
        r = find_reg(params, table, num);
 
        if (r) {
                perform_access(vcpu, params, r);
-               return 0;
+               return true;
        }
 
        /* Not handled */
-       return -1;
+       return false;
 }
 
 static void unhandled_cp_access(struct kvm_vcpu *vcpu,
@@ -2336,7 +2339,7 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu,
         * potential register operation in the case of a read and return
         * with success.
         */
-       if (!emulate_cp(vcpu, &params, global, nr_global)) {
+       if (emulate_cp(vcpu, &params, global, nr_global)) {
                /* Split up the value between registers for the read side */
                if (!params.is_write) {
                        vcpu_set_reg(vcpu, Rt, lower_32_bits(params.regval));
@@ -2350,34 +2353,144 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu,
        return 1;
 }
 
+static bool emulate_sys_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params);
+
+/*
+ * The CP10 ID registers are architecturally mapped to AArch64 feature
+ * registers. Abuse that fact so we can rely on the AArch64 handler for accesses
+ * from AArch32.
+ */
+static bool kvm_esr_cp10_id_to_sys64(u64 esr, struct sys_reg_params *params)
+{
+       u8 reg_id = (esr >> 10) & 0xf;
+       bool valid;
+
+       params->is_write = ((esr & 1) == 0);
+       params->Op0 = 3;
+       params->Op1 = 0;
+       params->CRn = 0;
+       params->CRm = 3;
+
+       /* CP10 ID registers are read-only */
+       valid = !params->is_write;
+
+       switch (reg_id) {
+       /* MVFR0 */
+       case 0b0111:
+               params->Op2 = 0;
+               break;
+       /* MVFR1 */
+       case 0b0110:
+               params->Op2 = 1;
+               break;
+       /* MVFR2 */
+       case 0b0101:
+               params->Op2 = 2;
+               break;
+       default:
+               valid = false;
+       }
+
+       if (valid)
+               return true;
+
+       kvm_pr_unimpl("Unhandled cp10 register %s: %u\n",
+                     params->is_write ? "write" : "read", reg_id);
+       return false;
+}
+
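/*
 * Worked example (not part of the patch): a guest VMRS read of MVFR1 traps
 * with reg_id 0b0110 in ESR[13:10]. The helper above then fills in Op0=3,
 * Op1=0, CRn=0, CRm=3, Op2=1, which is exactly the AArch64 encoding of
 * MVFR1_EL1, so the normal sys_reg emulation path can service the read.
 */
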
+/**
+ * kvm_handle_cp10_id() - Handles a VMRS trap on guest access to a 'Media and
+ *                       VFP Register' from AArch32.
+ * @vcpu: The vCPU pointer
+ *
+ * MVFR{0-2} are architecturally mapped to the AArch64 MVFR{0-2}_EL1 registers.
+ * Work out the correct AArch64 system register encoding and reroute to the
+ * AArch64 system register emulation.
+ */
+int kvm_handle_cp10_id(struct kvm_vcpu *vcpu)
+{
+       int Rt = kvm_vcpu_sys_get_rt(vcpu);
+       u64 esr = kvm_vcpu_get_esr(vcpu);
+       struct sys_reg_params params;
+
+       /* UNDEF on any unhandled register access */
+       if (!kvm_esr_cp10_id_to_sys64(esr, &params)) {
+               kvm_inject_undefined(vcpu);
+               return 1;
+       }
+
+       if (emulate_sys_reg(vcpu, &params))
+               vcpu_set_reg(vcpu, Rt, params.regval);
+
+       return 1;
+}
+
+/**
+ * kvm_emulate_cp15_id_reg() - Handles an MRC trap on a guest CP15 access where
+ *                            CRn=0, which corresponds to the AArch32 feature
+ *                            registers.
+ * @vcpu: the vCPU pointer
+ * @params: the system register access parameters.
+ *
+ * Our cp15 system register tables do not enumerate the AArch32 feature
+ * registers. Conveniently, our AArch64 table does, and the AArch32 system
+ * register encoding can be trivially remapped into the AArch64 for the feature
+ * registers: Append op0=3, leaving op1, CRn, CRm, and op2 the same.
+ *
+ * According to DDI0487G.b G7.3.1, paragraph "Behavior of VMSAv8-32 32-bit
+ * System registers with (coproc=0b1111, CRn==c0)", read accesses from this
+ * range are either UNKNOWN or RES0. Rerouting remains architectural as we
+ * treat undefined registers in this range as RAZ.
+ */
+static int kvm_emulate_cp15_id_reg(struct kvm_vcpu *vcpu,
+                                  struct sys_reg_params *params)
+{
+       int Rt = kvm_vcpu_sys_get_rt(vcpu);
+
+       /* Treat impossible writes to RO registers as UNDEFINED */
+       if (params->is_write) {
+               unhandled_cp_access(vcpu, params);
+               return 1;
+       }
+
+       params->Op0 = 3;
+
+       /*
+        * All registers where CRm > 3 are known to be UNKNOWN/RAZ from AArch32.
+        * Avoid conflicting with future expansion of AArch64 feature registers
+        * and simply treat them as RAZ here.
+        */
+       if (params->CRm > 3)
+               params->regval = 0;
+       else if (!emulate_sys_reg(vcpu, params))
+               return 1;
+
+       vcpu_set_reg(vcpu, Rt, params->regval);
+       return 1;
+}
+
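/*
 * Worked example (not part of the patch): an MRC to ID_ISAR0 arrives here
 * as { Op1=0, CRn=0, CRm=2, Op2=0 }. CRm is non-zero and <= 3, so instead
 * of being treated as RAZ the access is rewritten with Op0=3, giving the
 * encoding of ID_ISAR0_EL1, and the AArch64 read_id_reg() path supplies
 * the (possibly sanitised) value.
 */
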
 /**
  * kvm_handle_cp_32 -- handles a mrc/mcr trap on a guest CP14/CP15 access
  * @vcpu: The VCPU pointer
  * @run:  The kvm_run struct
  */
 static int kvm_handle_cp_32(struct kvm_vcpu *vcpu,
+                           struct sys_reg_params *params,
                            const struct sys_reg_desc *global,
                            size_t nr_global)
 {
-       struct sys_reg_params params;
-       u64 esr = kvm_vcpu_get_esr(vcpu);
        int Rt  = kvm_vcpu_sys_get_rt(vcpu);
 
-       params.CRm = (esr >> 1) & 0xf;
-       params.regval = vcpu_get_reg(vcpu, Rt);
-       params.is_write = ((esr & 1) == 0);
-       params.CRn = (esr >> 10) & 0xf;
-       params.Op0 = 0;
-       params.Op1 = (esr >> 14) & 0x7;
-       params.Op2 = (esr >> 17) & 0x7;
+       params->regval = vcpu_get_reg(vcpu, Rt);
 
-       if (!emulate_cp(vcpu, &params, global, nr_global)) {
-               if (!params.is_write)
-                       vcpu_set_reg(vcpu, Rt, params.regval);
+       if (emulate_cp(vcpu, params, global, nr_global)) {
+               if (!params->is_write)
+                       vcpu_set_reg(vcpu, Rt, params->regval);
                return 1;
        }
 
-       unhandled_cp_access(vcpu, &params);
+       unhandled_cp_access(vcpu, params);
        return 1;
 }
 
@@ -2388,7 +2501,20 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu)
 
 int kvm_handle_cp15_32(struct kvm_vcpu *vcpu)
 {
-       return kvm_handle_cp_32(vcpu, cp15_regs, ARRAY_SIZE(cp15_regs));
+       struct sys_reg_params params;
+
+       params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu));
+
+       /*
+        * Certain AArch32 ID registers are handled by rerouting to the AArch64
+        * system register table. Registers in the ID range where CRm=0 are
+        * excluded from this scheme as they do not trivially map into AArch64
+        * system register encodings.
+        */
+       if (params.Op1 == 0 && params.CRn == 0 && params.CRm)
+               return kvm_emulate_cp15_id_reg(vcpu, &params);
+
+       return kvm_handle_cp_32(vcpu, &params, cp15_regs, ARRAY_SIZE(cp15_regs));
 }
 
 int kvm_handle_cp14_64(struct kvm_vcpu *vcpu)
@@ -2398,7 +2524,11 @@ int kvm_handle_cp14_64(struct kvm_vcpu *vcpu)
 
 int kvm_handle_cp14_32(struct kvm_vcpu *vcpu)
 {
-       return kvm_handle_cp_32(vcpu, cp14_regs, ARRAY_SIZE(cp14_regs));
+       struct sys_reg_params params;
+
+       params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu));
+
+       return kvm_handle_cp_32(vcpu, &params, cp14_regs, ARRAY_SIZE(cp14_regs));
 }
 
 static bool is_imp_def_sys_reg(struct sys_reg_params *params)
@@ -2407,7 +2537,14 @@ static bool is_imp_def_sys_reg(struct sys_reg_params *params)
        return params->Op0 == 3 && (params->CRn & 0b1011) == 0b1011;
 }
 
-static int emulate_sys_reg(struct kvm_vcpu *vcpu,
+/**
+ * emulate_sys_reg - Emulate a guest access to an AArch64 system register
+ * @vcpu: The VCPU pointer
+ * @params: Decoded system register parameters
+ *
+ * Return: true if the system register access was successful, false otherwise.
+ */
+static bool emulate_sys_reg(struct kvm_vcpu *vcpu,
                           struct sys_reg_params *params)
 {
        const struct sys_reg_desc *r;
@@ -2416,7 +2553,10 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu,
 
        if (likely(r)) {
                perform_access(vcpu, params, r);
-       } else if (is_imp_def_sys_reg(params)) {
+               return true;
+       }
+
+       if (is_imp_def_sys_reg(params)) {
                kvm_inject_undefined(vcpu);
        } else {
                print_sys_reg_msg(params,
@@ -2424,7 +2564,7 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu,
                                  *vcpu_pc(vcpu), *vcpu_cpsr(vcpu));
                kvm_inject_undefined(vcpu);
        }
-       return 1;
+       return false;
 }
 
 /**
@@ -2452,18 +2592,18 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu)
        struct sys_reg_params params;
        unsigned long esr = kvm_vcpu_get_esr(vcpu);
        int Rt = kvm_vcpu_sys_get_rt(vcpu);
-       int ret;
 
        trace_kvm_handle_sys_reg(esr);
 
        params = esr_sys64_to_params(esr);
        params.regval = vcpu_get_reg(vcpu, Rt);
 
-       ret = emulate_sys_reg(vcpu, &params);
+       if (!emulate_sys_reg(vcpu, &params))
+               return 1;
 
        if (!params.is_write)
                vcpu_set_reg(vcpu, Rt, params.regval);
-       return ret;
+       return 1;
 }
 
 /******************************************************************************
@@ -2866,18 +3006,22 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
        return write_demux_regids(uindices);
 }
 
-void kvm_sys_reg_table_init(void)
+int kvm_sys_reg_table_init(void)
 {
+       bool valid = true;
        unsigned int i;
        struct sys_reg_desc clidr;
 
        /* Make sure tables are unique and in order. */
-       BUG_ON(check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false));
-       BUG_ON(check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true));
-       BUG_ON(check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true));
-       BUG_ON(check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true));
-       BUG_ON(check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true));
-       BUG_ON(check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false));
+       valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false);
+       valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true);
+       valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true);
+       valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true);
+       valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true);
+       valid &= check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false);
+
+       if (!valid)
+               return -EINVAL;
 
        /* We abuse the reset function to overwrite the table itself. */
        for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++)
@@ -2900,4 +3044,6 @@ void kvm_sys_reg_table_init(void)
                        break;
        /* Clear all higher bits. */
        cache_levels &= (1 << (i*3))-1;
+
+       return 0;
 }
index cc0cc95..aee8ea0 100644 (file)
@@ -35,12 +35,19 @@ struct sys_reg_params {
                                  .Op2 = ((esr) >> 17) & 0x7,                  \
                                  .is_write = !((esr) & 1) })
 
+#define esr_cp1x_32_to_params(esr)                                             \
+       ((struct sys_reg_params){ .Op1 = ((esr) >> 14) & 0x7,                   \
+                                 .CRn = ((esr) >> 10) & 0xf,                   \
+                                 .CRm = ((esr) >> 1) & 0xf,                    \
+                                 .Op2 = ((esr) >> 17) & 0x7,                   \
+                                 .is_write = !((esr) & 1) })
+
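/*
 * Decoding sketch, not part of the patch: for a 32-bit MCR/MRC trap the ISS
 * encodes Op1 in ESR[16:14], CRn in ESR[13:10], CRm in ESR[4:1], Op2 in
 * ESR[19:17] and the direction in ESR[0], so
 *
 *	struct sys_reg_params p = esr_cp1x_32_to_params(esr);
 *
 * yields a fully decoded access (Op0 is left at 0), replacing the per-field
 * extraction that kvm_handle_cp_32() used to do by hand.
 */
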
 struct sys_reg_desc {
        /* Sysreg string for debug */
        const char *name;
 
        enum {
-               AA32_ZEROHIGH,
+               AA32_DIRECT,
                AA32_LO,
                AA32_HI,
        } aarch32_map;
index fc00304..f6d4f40 100644 (file)
@@ -98,11 +98,11 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
        ret = 0;
 
        if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
-               kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
+               kvm->max_vcpus = VGIC_V2_MAX_CPUS;
        else
-               kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS;
+               kvm->max_vcpus = VGIC_V3_MAX_CPUS;
 
-       if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) {
+       if (atomic_read(&kvm->online_vcpus) > kvm->max_vcpus) {
                ret = -E2BIG;
                goto out_unlock;
        }
@@ -319,7 +319,12 @@ int vgic_init(struct kvm *kvm)
 
        vgic_debug_init(kvm);
 
-       dist->implementation_rev = 2;
+       /*
+        * If userspace didn't set the GIC implementation revision,
+        * default to the latest and greatest. You know you want it.
+        */
+       if (!dist->implementation_rev)
+               dist->implementation_rev = KVM_VGIC_IMP_REV_LATEST;
        dist->initialized = true;
 
 out:
index 2e13402..9d3299a 100644 (file)
@@ -683,7 +683,7 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
        if (!vcpu)
                return E_ITS_INT_UNMAPPED_INTERRUPT;
 
-       if (!vcpu->arch.vgic_cpu.lpis_enabled)
+       if (!vgic_lpis_enabled(vcpu))
                return -EBUSY;
 
        vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq);
@@ -894,6 +894,18 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
        return update_affinity(ite->irq, vcpu);
 }
 
+static bool __is_visible_gfn_locked(struct vgic_its *its, gpa_t gpa)
+{
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       int idx;
+       bool ret;
+
+       idx = srcu_read_lock(&its->dev->kvm->srcu);
+       ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
+       srcu_read_unlock(&its->dev->kvm->srcu, idx);
+       return ret;
+}
+
 /*
  * Check whether an ID can be stored into the corresponding guest table.
  * For a direct table this is pretty easy, but gets a bit nasty for
@@ -908,9 +920,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
        u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
        phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
        int esz = GITS_BASER_ENTRY_SIZE(baser);
-       int index, idx;
-       gfn_t gfn;
-       bool ret;
+       int index;
 
        switch (type) {
        case GITS_BASER_TYPE_DEVICE:
@@ -933,12 +943,11 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
                        return false;
 
                addr = base + id * esz;
-               gfn = addr >> PAGE_SHIFT;
 
                if (eaddr)
                        *eaddr = addr;
 
-               goto out;
+               return __is_visible_gfn_locked(its, addr);
        }
 
        /* calculate and check the index into the 1st level */
@@ -964,27 +973,42 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
        /* Find the address of the actual entry */
        index = id % (SZ_64K / esz);
        indirect_ptr += index * esz;
-       gfn = indirect_ptr >> PAGE_SHIFT;
 
        if (eaddr)
                *eaddr = indirect_ptr;
 
-out:
-       idx = srcu_read_lock(&its->dev->kvm->srcu);
-       ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
-       srcu_read_unlock(&its->dev->kvm->srcu, idx);
-       return ret;
+       return __is_visible_gfn_locked(its, indirect_ptr);
 }
 
+/*
+ * Check whether an event ID can be stored in the corresponding Interrupt
+ * Translation Table, which starts at device->itt_addr.
+ */
+static bool vgic_its_check_event_id(struct vgic_its *its, struct its_device *device,
+               u32 event_id)
+{
+       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+       int ite_esz = abi->ite_esz;
+       gpa_t gpa;
+
+       /* max table size is: BIT_ULL(device->num_eventid_bits) * ite_esz */
+       if (event_id >= BIT_ULL(device->num_eventid_bits))
+               return false;
+
+       gpa = device->itt_addr + event_id * ite_esz;
+       return __is_visible_gfn_locked(its, gpa);
+}
+
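/*
 * Sizing example, not part of the patch: with num_eventid_bits = 16 and
 * ite_esz = 8, event IDs 0..65535 are in range and the candidate ITE sits
 * at itt_addr + event_id * 8, somewhere in a 512KiB window whose backing
 * page must be covered by a memslot for the check above to pass.
 */
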
+/*
+ * Add a new collection into the ITS collection table.
+ * Returns 0 on success, and a negative error value for generic errors.
+ */
 static int vgic_its_alloc_collection(struct vgic_its *its,
                                     struct its_collection **colp,
                                     u32 coll_id)
 {
        struct its_collection *collection;
 
-       if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL))
-               return E_ITS_MAPC_COLLECTION_OOR;
-
        collection = kzalloc(sizeof(*collection), GFP_KERNEL_ACCOUNT);
        if (!collection)
                return -ENOMEM;
@@ -1061,7 +1085,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
        if (!device)
                return E_ITS_MAPTI_UNMAPPED_DEVICE;
 
-       if (event_id >= BIT_ULL(device->num_eventid_bits))
+       if (!vgic_its_check_event_id(its, device, event_id))
                return E_ITS_MAPTI_ID_OOR;
 
        if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
@@ -1078,7 +1102,12 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
 
        collection = find_collection(its, coll_id);
        if (!collection) {
-               int ret = vgic_its_alloc_collection(its, &collection, coll_id);
+               int ret;
+
+               if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL))
+                       return E_ITS_MAPC_COLLECTION_OOR;
+
+               ret = vgic_its_alloc_collection(its, &collection, coll_id);
                if (ret)
                        return ret;
                new_coll = collection;
@@ -1233,6 +1262,10 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
                if (!collection) {
                        int ret;
 
+                       if (!vgic_its_check_id(its, its->baser_coll_table,
+                                               coll_id, NULL))
+                               return E_ITS_MAPC_COLLECTION_OOR;
+
                        ret = vgic_its_alloc_collection(its, &collection,
                                                        coll_id);
                        if (ret)
@@ -1272,6 +1305,11 @@ static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
        return 0;
 }
 
+int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq)
+{
+       return update_lpi_config(kvm, irq, NULL, true);
+}
+
 /*
  * The INV command syncs the configuration bits from the memory table.
  * Must be called with the its_lock mutex held.
@@ -1288,7 +1326,41 @@ static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
        if (!ite)
                return E_ITS_INV_UNMAPPED_INTERRUPT;
 
-       return update_lpi_config(kvm, ite->irq, NULL, true);
+       return vgic_its_inv_lpi(kvm, ite->irq);
+}
+
+/**
+ * vgic_its_invall - invalidate all LPIs targeting a given vcpu
+ * @vcpu: the vcpu for which the RD is targeted by an invalidation
+ *
+ * Contrary to the INVALL command, this targets a RD instead of a
+ * collection, and we don't need to hold the its_lock, since no ITS is
+ * involved here.
+ */
+int vgic_its_invall(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       int irq_count, i = 0;
+       u32 *intids;
+
+       irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids);
+       if (irq_count < 0)
+               return irq_count;
+
+       for (i = 0; i < irq_count; i++) {
+               struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intids[i]);
+               if (!irq)
+                       continue;
+               update_lpi_config(kvm, irq, vcpu, false);
+               vgic_put_irq(kvm, irq);
+       }
+
+       kfree(intids);
+
+       if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm)
+               its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe);
+
+       return 0;
 }
 
 /*
@@ -1305,32 +1377,13 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
        u32 coll_id = its_cmd_get_collection(its_cmd);
        struct its_collection *collection;
        struct kvm_vcpu *vcpu;
-       struct vgic_irq *irq;
-       u32 *intids;
-       int irq_count, i;
 
        collection = find_collection(its, coll_id);
        if (!its_is_collection_mapped(collection))
                return E_ITS_INVALL_UNMAPPED_COLLECTION;
 
        vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-       irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids);
-       if (irq_count < 0)
-               return irq_count;
-
-       for (i = 0; i < irq_count; i++) {
-               irq = vgic_get_irq(kvm, NULL, intids[i]);
-               if (!irq)
-                       continue;
-               update_lpi_config(kvm, irq, vcpu, false);
-               vgic_put_irq(kvm, irq);
-       }
-
-       kfree(intids);
-
-       if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm)
-               its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe);
+       vgic_its_invall(vcpu);
 
        return 0;
 }
@@ -2175,6 +2228,9 @@ static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id,
        if (!collection)
                return -EINVAL;
 
+       if (!vgic_its_check_event_id(its, dev, event_id))
+               return -EINVAL;
+
        ite = vgic_its_alloc_ite(dev, collection, event_id);
        if (IS_ERR(ite))
                return PTR_ERR(ite);
@@ -2183,8 +2239,10 @@ static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id,
                vcpu = kvm_get_vcpu(kvm, collection->target_addr);
 
        irq = vgic_add_lpi(kvm, lpi_id, vcpu);
-       if (IS_ERR(irq))
+       if (IS_ERR(irq)) {
+               its_free_ite(kvm, ite);
                return PTR_ERR(irq);
+       }
        ite->irq = irq;
 
        return offset;
@@ -2296,6 +2354,7 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
                                void *ptr, void *opaque)
 {
        struct its_device *dev;
+       u64 baser = its->baser_device_table;
        gpa_t itt_addr;
        u8 num_eventid_bits;
        u64 entry = *(u64 *)ptr;
@@ -2316,6 +2375,9 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
        /* dte entry is valid */
        offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT;
 
+       if (!vgic_its_check_id(its, baser, id, NULL))
+               return -EINVAL;
+
        dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits);
        if (IS_ERR(dev))
                return PTR_ERR(dev);
@@ -2445,6 +2507,9 @@ static int vgic_its_restore_device_tables(struct vgic_its *its)
        if (ret > 0)
                ret = 0;
 
+       if (ret < 0)
+               vgic_its_free_device_list(its->dev->kvm, its);
+
        return ret;
 }
 
@@ -2461,6 +2526,11 @@ static int vgic_its_save_cte(struct vgic_its *its,
        return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz);
 }
 
+/*
+ * Restore a collection entry into the ITS collection table.
+ * Return +1 on success, 0 if the entry was invalid (which should be
+ * interpreted as end-of-table), and a negative error value for generic errors.
+ */
 static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
 {
        struct its_collection *collection;
@@ -2487,6 +2557,10 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
        collection = find_collection(its, coll_id);
        if (collection)
                return -EEXIST;
+
+       if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL))
+               return -EINVAL;
+
        ret = vgic_its_alloc_collection(its, &collection, coll_id);
        if (ret)
                return ret;
@@ -2566,6 +2640,9 @@ static int vgic_its_restore_collection_table(struct vgic_its *its)
        if (ret > 0)
                return 0;
 
+       if (ret < 0)
+               vgic_its_free_collection_list(its->dev->kvm, its);
+
        return ret;
 }
 
@@ -2597,7 +2674,10 @@ static int vgic_its_restore_tables_v0(struct vgic_its *its)
        if (ret)
                return ret;
 
-       return vgic_its_restore_device_tables(its);
+       ret = vgic_its_restore_device_tables(its);
+       if (ret)
+               vgic_its_free_collection_list(its->dev->kvm, its);
+       return ret;
 }
 
 static int vgic_its_commit_v0(struct vgic_its *its)
index 12e4c22..77a67e9 100644 (file)
@@ -73,9 +73,13 @@ static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu,
                                           gpa_t addr, unsigned int len,
                                           unsigned long val)
 {
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       u32 reg;
+
        switch (addr & 0x0c) {
        case GIC_DIST_IIDR:
-               if (val != vgic_mmio_read_v2_misc(vcpu, addr, len))
+               reg = vgic_mmio_read_v2_misc(vcpu, addr, len);
+               if ((reg ^ val) & ~GICD_IIDR_REVISION_MASK)
                        return -EINVAL;
 
                /*
@@ -87,8 +91,16 @@ static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu,
                 * migration from old kernels to new kernels with legacy
                 * userspace.
                 */
-               vcpu->kvm->arch.vgic.v2_groups_user_writable = true;
-               return 0;
+               reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg);
+               switch (reg) {
+               case KVM_VGIC_IMP_REV_2:
+               case KVM_VGIC_IMP_REV_3:
+                       vcpu->kvm->arch.vgic.v2_groups_user_writable = true;
+                       dist->implementation_rev = reg;
+                       return 0;
+               default:
+                       return -EINVAL;
+               }
        }
 
        vgic_mmio_write_v2_misc(vcpu, addr, len, val);
index 58e40b4..f7aa7bc 100644 (file)
@@ -155,13 +155,27 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
                                           unsigned long val)
 {
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       u32 reg;
 
        switch (addr & 0x0c) {
        case GICD_TYPER2:
-       case GICD_IIDR:
                if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
                        return -EINVAL;
                return 0;
+       case GICD_IIDR:
+               reg = vgic_mmio_read_v3_misc(vcpu, addr, len);
+               if ((reg ^ val) & ~GICD_IIDR_REVISION_MASK)
+                       return -EINVAL;
+
+               reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg);
+               switch (reg) {
+               case KVM_VGIC_IMP_REV_2:
+               case KVM_VGIC_IMP_REV_3:
+                       dist->implementation_rev = reg;
+                       return 0;
+               default:
+                       return -EINVAL;
+               }
        case GICD_CTLR:
                /* Not a GICv4.1? No HW SGIs */
                if (!kvm_vgic_global_state.has_gicv4_1)
@@ -221,34 +235,58 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
        vgic_put_irq(vcpu->kvm, irq);
 }
 
+bool vgic_lpis_enabled(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       return atomic_read(&vgic_cpu->ctlr) == GICR_CTLR_ENABLE_LPIS;
+}
+
 static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
                                             gpa_t addr, unsigned int len)
 {
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       unsigned long val;
 
-       return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
-}
+       val = atomic_read(&vgic_cpu->ctlr);
+       if (vgic_get_implementation_rev(vcpu) >= KVM_VGIC_IMP_REV_3)
+               val |= GICR_CTLR_IR | GICR_CTLR_CES;
 
+       return val;
+}
 
 static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
                                     gpa_t addr, unsigned int len,
                                     unsigned long val)
 {
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       bool was_enabled = vgic_cpu->lpis_enabled;
+       u32 ctlr;
 
        if (!vgic_has_its(vcpu->kvm))
                return;
 
-       vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+       if (!(val & GICR_CTLR_ENABLE_LPIS)) {
+               /*
+                * Don't disable if RWP is set, as there already an
+                * ongoing disable. Funky guest...
+                */
+               ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr,
+                                             GICR_CTLR_ENABLE_LPIS,
+                                             GICR_CTLR_RWP);
+               if (ctlr != GICR_CTLR_ENABLE_LPIS)
+                       return;
 
-       if (was_enabled && !vgic_cpu->lpis_enabled) {
                vgic_flush_pending_lpis(vcpu);
                vgic_its_invalidate_cache(vcpu->kvm);
-       }
+               atomic_set_release(&vgic_cpu->ctlr, 0);
+       } else {
+               ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, 0,
+                                             GICR_CTLR_ENABLE_LPIS);
+               if (ctlr != 0)
+                       return;
 
-       if (!was_enabled && vgic_cpu->lpis_enabled)
                vgic_enable_lpis(vcpu);
+       }
 }
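
/*
 * State sketch, not part of the patch: GICR_CTLR.EnableLPIs now moves
 * through three values of vgic_cpu->ctlr,
 *
 *	0 --enable--> GICR_CTLR_ENABLE_LPIS --disable--> GICR_CTLR_RWP --> 0
 *
 * Each cmpxchg above only succeeds from the expected state, so writes that
 * race with an in-flight disable are simply dropped, and
 * vgic_mmio_read_v3r_ctlr() reports RWP until the flush completes.
 */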
 
 static bool vgic_mmio_vcpu_rdist_is_last(struct kvm_vcpu *vcpu)
@@ -478,11 +516,10 @@ static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
                                     unsigned long val)
 {
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        u64 old_propbaser, propbaser;
 
        /* Storing a value with LPIs already enabled is undefined */
-       if (vgic_cpu->lpis_enabled)
+       if (vgic_lpis_enabled(vcpu))
                return;
 
        do {
@@ -513,7 +550,7 @@ static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
        u64 old_pendbaser, pendbaser;
 
        /* Storing a value with LPIs already enabled is undefined */
-       if (vgic_cpu->lpis_enabled)
+       if (vgic_lpis_enabled(vcpu))
                return;
 
        do {
@@ -525,6 +562,63 @@ static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
                           pendbaser) != old_pendbaser);
 }
 
+static unsigned long vgic_mmio_read_sync(struct kvm_vcpu *vcpu,
+                                        gpa_t addr, unsigned int len)
+{
+       return !!atomic_read(&vcpu->arch.vgic_cpu.syncr_busy);
+}
+
+static void vgic_set_rdist_busy(struct kvm_vcpu *vcpu, bool busy)
+{
+       if (busy) {
+               atomic_inc(&vcpu->arch.vgic_cpu.syncr_busy);
+               smp_mb__after_atomic();
+       } else {
+               smp_mb__before_atomic();
+               atomic_dec(&vcpu->arch.vgic_cpu.syncr_busy);
+       }
+}
+
+static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu,
+                                  gpa_t addr, unsigned int len,
+                                  unsigned long val)
+{
+       struct vgic_irq *irq;
+
+       /*
+        * If the guest wrote only to the upper 32bit part of the
+        * register, drop the write on the floor, as it is only for
+        * vPEs (which we don't support for obvious reasons).
+        *
+        * Also discard the access if LPIs are not enabled.
+        */
+       if ((addr & 4) || !vgic_lpis_enabled(vcpu))
+               return;
+
+       vgic_set_rdist_busy(vcpu, true);
+
+       irq = vgic_get_irq(vcpu->kvm, NULL, lower_32_bits(val));
+       if (irq) {
+               vgic_its_inv_lpi(vcpu->kvm, irq);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       vgic_set_rdist_busy(vcpu, false);
+}
+
+static void vgic_mmio_write_invall(struct kvm_vcpu *vcpu,
+                                  gpa_t addr, unsigned int len,
+                                  unsigned long val)
+{
+       /* See vgic_mmio_write_invlpi() for the early return rationale */
+       if ((addr & 4) || !vgic_lpis_enabled(vcpu))
+               return;
+
+       vgic_set_rdist_busy(vcpu, true);
+       vgic_its_invall(vcpu);
+       vgic_set_rdist_busy(vcpu, false);
+}
+
 /*
  * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
  * redistributors, while SPIs are covered by registers in the distributor
@@ -630,6 +724,15 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = {
        REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
                vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_INVLPIR,
+               vgic_mmio_read_raz, vgic_mmio_write_invlpi, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_INVALLR,
+               vgic_mmio_read_raz, vgic_mmio_write_invall, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_SYNCR,
+               vgic_mmio_read_sync, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
                vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
                VGIC_ACCESS_32bit),
index b549af8..826ff6f 100644 (file)
@@ -612,6 +612,10 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
 static const struct midr_range broken_seis[] = {
        MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM),
        MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM),
+       MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO),
+       MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO),
+       MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX),
+       MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX),
        {},
 };
 
index 3fd6c86..4c6bdd3 100644 (file)
 #define DEBUG_SPINLOCK_BUG_ON(p)
 #endif
 
+static inline u32 vgic_get_implementation_rev(struct kvm_vcpu *vcpu)
+{
+       return vcpu->kvm->arch.vgic.implementation_rev;
+}
+
 /* Requires the irq_lock to be held by the caller. */
 static inline bool irq_is_pending(struct vgic_irq *irq)
 {
@@ -308,6 +313,7 @@ static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size)
                (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE);
 }
 
+bool vgic_lpis_enabled(struct kvm_vcpu *vcpu);
 int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr);
 int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
                         u32 devid, u32 eventid, struct vgic_irq **irq);
@@ -317,6 +323,10 @@ void vgic_lpi_translation_cache_init(struct kvm *kvm);
 void vgic_lpi_translation_cache_destroy(struct kvm *kvm);
 void vgic_its_invalidate_cache(struct kvm *kvm);
 
+/* GICv4.1 MMIO interface */
+int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq);
+int vgic_its_invall(struct kvm_vcpu *vcpu);
+
 bool vgic_supports_direct_msis(struct kvm *kvm);
 int vgic_v4_init(struct kvm *kvm);
 void vgic_v4_teardown(struct kvm *kvm);
index 1688af0..5b78901 100644 (file)
@@ -27,7 +27,17 @@ void __delay(unsigned long cycles)
 {
        cycles_t start = get_cycles();
 
-       if (arch_timer_evtstrm_available()) {
+       if (cpus_have_const_cap(ARM64_HAS_WFXT)) {
+               u64 end = start + cycles;
+
+               /*
+                * Start with WFIT. If an interrupt makes us resume
+                * early, use a WFET loop to complete the delay.
+                */
+               wfit(end);
+               while ((get_cycles() - start) < cycles)
+                       wfet(end);
+       } else if (arch_timer_evtstrm_available()) {
                const cycles_t timer_evt_period =
                        USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US);
 
index e52b289..507b203 100644 (file)
@@ -38,6 +38,7 @@ HAS_STAGE2_FWB
 HAS_SYSREG_GIC_CPUIF
 HAS_TLB_RANGE
 HAS_VIRT_HOST_EXTN
+HAS_WFXT
 HW_DBM
 KVM_PROTECTED_MODE
 MISMATCHED_CACHE_TYPE
index e935f27..cc40521 100644 (file)
 #define HGATP_MODE_SV32X4      _AC(1, UL)
 #define HGATP_MODE_SV39X4      _AC(8, UL)
 #define HGATP_MODE_SV48X4      _AC(9, UL)
+#define HGATP_MODE_SV57X4      _AC(10, UL)
 
 #define HGATP32_MODE_SHIFT     31
 #define HGATP32_VMID_SHIFT     22
index cd4bbce..319c8ae 100644 (file)
 #include <linux/types.h>
 #include <linux/kvm.h>
 #include <linux/kvm_types.h>
+#include <linux/spinlock.h>
 #include <asm/csr.h>
 #include <asm/kvm_vcpu_fp.h>
 #include <asm/kvm_vcpu_timer.h>
 
-#define KVM_MAX_VCPUS                  \
-       ((HGATP_VMID_MASK >> HGATP_VMID_SHIFT) + 1)
+#define KVM_MAX_VCPUS                  1024
 
 #define KVM_HALT_POLL_NS_DEFAULT       500000
 
        KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_VCPU_RESET             KVM_ARCH_REQ(1)
 #define KVM_REQ_UPDATE_HGATP           KVM_ARCH_REQ(2)
+#define KVM_REQ_FENCE_I                        \
+       KVM_ARCH_REQ_FLAGS(3, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_HFENCE_GVMA_VMID_ALL   KVM_REQ_TLB_FLUSH
+#define KVM_REQ_HFENCE_VVMA_ALL                \
+       KVM_ARCH_REQ_FLAGS(4, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_HFENCE                 \
+       KVM_ARCH_REQ_FLAGS(5, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+
+enum kvm_riscv_hfence_type {
+       KVM_RISCV_HFENCE_UNKNOWN = 0,
+       KVM_RISCV_HFENCE_GVMA_VMID_GPA,
+       KVM_RISCV_HFENCE_VVMA_ASID_GVA,
+       KVM_RISCV_HFENCE_VVMA_ASID_ALL,
+       KVM_RISCV_HFENCE_VVMA_GVA,
+};
+
+struct kvm_riscv_hfence {
+       enum kvm_riscv_hfence_type type;
+       unsigned long asid;
+       unsigned long order;
+       gpa_t addr;
+       gpa_t size;
+};
+
+#define KVM_RISCV_VCPU_MAX_HFENCE      64
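
/*
 * Minimal sketch, not from this patch: one plausible producer side of the
 * bounded HFENCE ring added to struct kvm_vcpu_arch later in this header
 * (hfence_lock/hfence_head/hfence_tail/hfence_queue), assuming a zeroed
 * type (KVM_RISCV_HFENCE_UNKNOWN) marks a free slot that the consumer
 * resets after processing. The real enqueue/dequeue helpers live elsewhere
 * in the series; this only illustrates the locking and wrap-around intended
 * by KVM_RISCV_VCPU_MAX_HFENCE.
 */
static bool hfence_enqueue_sketch(struct kvm_vcpu_arch *arch,
				  const struct kvm_riscv_hfence *req)
{
	bool queued = false;

	spin_lock(&arch->hfence_lock);
	if (arch->hfence_queue[arch->hfence_tail].type ==
	    KVM_RISCV_HFENCE_UNKNOWN) {
		/* Free slot: copy the request and advance the tail. */
		arch->hfence_queue[arch->hfence_tail] = *req;
		arch->hfence_tail = (arch->hfence_tail + 1) %
				    KVM_RISCV_VCPU_MAX_HFENCE;
		queued = true;
	}
	spin_unlock(&arch->hfence_lock);

	return queued;
}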
 
 struct kvm_vm_stat {
        struct kvm_vm_stat_generic generic;
@@ -54,10 +79,10 @@ struct kvm_vmid {
 };
 
 struct kvm_arch {
-       /* stage2 vmid */
+       /* G-stage vmid */
        struct kvm_vmid vmid;
 
-       /* stage2 page table */
+       /* G-stage page table */
        pgd_t *pgd;
        phys_addr_t pgd_phys;
 
@@ -141,6 +166,9 @@ struct kvm_vcpu_arch {
        /* VCPU ran at least once */
        bool ran_atleast_once;
 
+       /* Last Host CPU on which Guest VCPU exited */
+       int last_exit_cpu;
+
        /* ISA feature bits (similar to MISA) */
        unsigned long isa;
 
@@ -179,6 +207,12 @@ struct kvm_vcpu_arch {
        /* VCPU Timer */
        struct kvm_vcpu_timer timer;
 
+       /* HFENCE request queue */
+       spinlock_t hfence_lock;
+       unsigned long hfence_head;
+       unsigned long hfence_tail;
+       struct kvm_riscv_hfence hfence_queue[KVM_RISCV_VCPU_MAX_HFENCE];
+
        /* MMIO instruction details */
        struct kvm_mmio_decode mmio_decode;
 
@@ -201,27 +235,71 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 
-void __kvm_riscv_hfence_gvma_vmid_gpa(unsigned long gpa_divby_4,
-                                     unsigned long vmid);
-void __kvm_riscv_hfence_gvma_vmid(unsigned long vmid);
-void __kvm_riscv_hfence_gvma_gpa(unsigned long gpa_divby_4);
-void __kvm_riscv_hfence_gvma_all(void);
-
-int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
+#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER         12
+
+void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid,
+                                         gpa_t gpa, gpa_t gpsz,
+                                         unsigned long order);
+void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid);
+void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz,
+                                    unsigned long order);
+void kvm_riscv_local_hfence_gvma_all(void);
+void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid,
+                                         unsigned long asid,
+                                         unsigned long gva,
+                                         unsigned long gvsz,
+                                         unsigned long order);
+void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid,
+                                         unsigned long asid);
+void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid,
+                                    unsigned long gva, unsigned long gvsz,
+                                    unsigned long order);
+void kvm_riscv_local_hfence_vvma_all(unsigned long vmid);
+
+void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu);
+
+void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu);
+void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu);
+void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu);
+void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu);
+
+void kvm_riscv_fence_i(struct kvm *kvm,
+                      unsigned long hbase, unsigned long hmask);
+void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm,
+                                   unsigned long hbase, unsigned long hmask,
+                                   gpa_t gpa, gpa_t gpsz,
+                                   unsigned long order);
+void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm,
+                                   unsigned long hbase, unsigned long hmask);
+void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm,
+                                   unsigned long hbase, unsigned long hmask,
+                                   unsigned long gva, unsigned long gvsz,
+                                   unsigned long order, unsigned long asid);
+void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm,
+                                   unsigned long hbase, unsigned long hmask,
+                                   unsigned long asid);
+void kvm_riscv_hfence_vvma_gva(struct kvm *kvm,
+                              unsigned long hbase, unsigned long hmask,
+                              unsigned long gva, unsigned long gvsz,
+                              unsigned long order);
+void kvm_riscv_hfence_vvma_all(struct kvm *kvm,
+                              unsigned long hbase, unsigned long hmask);
+
+int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
                         struct kvm_memory_slot *memslot,
                         gpa_t gpa, unsigned long hva, bool is_write);
-int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm);
-void kvm_riscv_stage2_free_pgd(struct kvm *kvm);
-void kvm_riscv_stage2_update_hgatp(struct kvm_vcpu *vcpu);
-void kvm_riscv_stage2_mode_detect(void);
-unsigned long kvm_riscv_stage2_mode(void);
-int kvm_riscv_stage2_gpa_bits(void);
-
-void kvm_riscv_stage2_vmid_detect(void);
-unsigned long kvm_riscv_stage2_vmid_bits(void);
-int kvm_riscv_stage2_vmid_init(struct kvm *kvm);
-bool kvm_riscv_stage2_vmid_ver_changed(struct kvm_vmid *vmid);
-void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu);
+int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm);
+void kvm_riscv_gstage_free_pgd(struct kvm *kvm);
+void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu);
+void kvm_riscv_gstage_mode_detect(void);
+unsigned long kvm_riscv_gstage_mode(void);
+int kvm_riscv_gstage_gpa_bits(void);
+
+void kvm_riscv_gstage_vmid_detect(void);
+unsigned long kvm_riscv_gstage_vmid_bits(void);
+int kvm_riscv_gstage_vmid_init(struct kvm *kvm);
+bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid);
+void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu);
 
 void __kvm_riscv_unpriv_trap(void);
 
index f808ad1..6119368 100644 (file)
@@ -82,6 +82,23 @@ struct kvm_riscv_timer {
        __u64 state;
 };
 
+/*
+ * ISA extension IDs specific to KVM. These are not the same as the host ISA
+ * extension IDs, which are internal to the host and should not be exposed
+ * to the guest. These IDs should always be contiguous so that the mapping
+ * in the KVM implementation stays simple.
+ */
+enum KVM_RISCV_ISA_EXT_ID {
+       KVM_RISCV_ISA_EXT_A = 0,
+       KVM_RISCV_ISA_EXT_C,
+       KVM_RISCV_ISA_EXT_D,
+       KVM_RISCV_ISA_EXT_F,
+       KVM_RISCV_ISA_EXT_H,
+       KVM_RISCV_ISA_EXT_I,
+       KVM_RISCV_ISA_EXT_M,
+       KVM_RISCV_ISA_EXT_MAX,
+};
+
 /* Possible states for kvm_riscv_timer */
 #define KVM_RISCV_TIMER_STATE_OFF      0
 #define KVM_RISCV_TIMER_STATE_ON       1
@@ -123,6 +140,9 @@ struct kvm_riscv_timer {
 #define KVM_REG_RISCV_FP_D_REG(name)   \
                (offsetof(struct __riscv_d_ext_state, name) / sizeof(__u64))
 
+/* ISA Extension registers are mapped as type 7 */
+#define KVM_REG_RISCV_ISA_EXT          (0x07 << KVM_REG_RISCV_TYPE_SHIFT)
+
 #endif
 
 #endif /* __LINUX_KVM_RISCV_H */
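For context, a minimal userspace sketch (not part of the patch) of how the new ISA_EXT register type is consumed through the ONE_REG interface. It assumes an RV64 host, an already-created VCPU file descriptor, and the standard <linux/kvm.h> uapi definitions; the helper name is hypothetical.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: returns 1 if the given KVM_RISCV_ISA_EXT_* ID is
 * available to the guest, 0 if it is not, and -1 on ioctl failure. */
static int riscv_isa_ext_available(int vcpu_fd, uint64_t ext_id)
{
	uint64_t val = 0;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_RISCV | KVM_REG_SIZE_U64 |
			KVM_REG_RISCV_ISA_EXT | ext_id,
		.addr = (uint64_t)(unsigned long)&val,
	};

	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
		return -1;
	return val ? 1 : 0;
}

For example, riscv_isa_ext_available(vcpu_fd, KVM_RISCV_ISA_EXT_C) reports whether the compressed extension is exposed to the guest; writing 0 to the same register with KVM_SET_ONE_REG before the VCPU first runs disables a single-letter base extension, as implemented by the vcpu.c changes later in this diff.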
index 2e5ca43..1549205 100644 (file)
@@ -89,13 +89,13 @@ int kvm_arch_init(void *opaque)
                return -ENODEV;
        }
 
-       kvm_riscv_stage2_mode_detect();
+       kvm_riscv_gstage_mode_detect();
 
-       kvm_riscv_stage2_vmid_detect();
+       kvm_riscv_gstage_vmid_detect();
 
        kvm_info("hypervisor extension available\n");
 
-       switch (kvm_riscv_stage2_mode()) {
+       switch (kvm_riscv_gstage_mode()) {
        case HGATP_MODE_SV32X4:
                str = "Sv32x4";
                break;
@@ -105,12 +105,15 @@ int kvm_arch_init(void *opaque)
        case HGATP_MODE_SV48X4:
                str = "Sv48x4";
                break;
+       case HGATP_MODE_SV57X4:
+               str = "Sv57x4";
+               break;
        default:
                return -ENODEV;
        }
        kvm_info("using %s G-stage page table format\n", str);
 
-       kvm_info("VMID %ld bits available\n", kvm_riscv_stage2_vmid_bits());
+       kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits());
 
        return 0;
 }
index f80a34f..1c00695 100644 (file)
 #include <asm/csr.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
-#include <asm/sbi.h>
 
 #ifdef CONFIG_64BIT
-static unsigned long stage2_mode = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
-static unsigned long stage2_pgd_levels = 3;
-#define stage2_index_bits      9
+static unsigned long gstage_mode = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
+static unsigned long gstage_pgd_levels = 3;
+#define gstage_index_bits      9
 #else
-static unsigned long stage2_mode = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
-static unsigned long stage2_pgd_levels = 2;
-#define stage2_index_bits      10
+static unsigned long gstage_mode = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
+static unsigned long gstage_pgd_levels = 2;
+#define gstage_index_bits      10
 #endif
 
-#define stage2_pgd_xbits       2
-#define stage2_pgd_size        (1UL << (HGATP_PAGE_SHIFT + stage2_pgd_xbits))
-#define stage2_gpa_bits        (HGATP_PAGE_SHIFT + \
-                        (stage2_pgd_levels * stage2_index_bits) + \
-                        stage2_pgd_xbits)
-#define stage2_gpa_size        ((gpa_t)(1ULL << stage2_gpa_bits))
+#define gstage_pgd_xbits       2
+#define gstage_pgd_size        (1UL << (HGATP_PAGE_SHIFT + gstage_pgd_xbits))
+#define gstage_gpa_bits        (HGATP_PAGE_SHIFT + \
+                        (gstage_pgd_levels * gstage_index_bits) + \
+                        gstage_pgd_xbits)
+#define gstage_gpa_size        ((gpa_t)(1ULL << gstage_gpa_bits))
 
-#define stage2_pte_leaf(__ptep)        \
+#define gstage_pte_leaf(__ptep)        \
        (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
 
-static inline unsigned long stage2_pte_index(gpa_t addr, u32 level)
+static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
 {
        unsigned long mask;
-       unsigned long shift = HGATP_PAGE_SHIFT + (stage2_index_bits * level);
+       unsigned long shift = HGATP_PAGE_SHIFT + (gstage_index_bits * level);
 
-       if (level == (stage2_pgd_levels - 1))
-               mask = (PTRS_PER_PTE * (1UL << stage2_pgd_xbits)) - 1;
+       if (level == (gstage_pgd_levels - 1))
+               mask = (PTRS_PER_PTE * (1UL << gstage_pgd_xbits)) - 1;
        else
                mask = PTRS_PER_PTE - 1;
 
        return (addr >> shift) & mask;
 }
 
-static inline unsigned long stage2_pte_page_vaddr(pte_t pte)
+static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
 {
        return (unsigned long)pfn_to_virt(pte_val(pte) >> _PAGE_PFN_SHIFT);
 }
 
-static int stage2_page_size_to_level(unsigned long page_size, u32 *out_level)
+static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
 {
        u32 i;
        unsigned long psz = 1UL << 12;
 
-       for (i = 0; i < stage2_pgd_levels; i++) {
-               if (page_size == (psz << (i * stage2_index_bits))) {
+       for (i = 0; i < gstage_pgd_levels; i++) {
+               if (page_size == (psz << (i * gstage_index_bits))) {
                        *out_level = i;
                        return 0;
                }
@@ -73,27 +72,39 @@ static int stage2_page_size_to_level(unsigned long page_size, u32 *out_level)
        return -EINVAL;
 }
 
-static int stage2_level_to_page_size(u32 level, unsigned long *out_pgsize)
+static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
 {
-       if (stage2_pgd_levels < level)
+       if (gstage_pgd_levels < level)
                return -EINVAL;
 
-       *out_pgsize = 1UL << (12 + (level * stage2_index_bits));
+       *out_pgorder = 12 + (level * gstage_index_bits);
+       return 0;
+}
+
+static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
+{
+       int rc;
+       unsigned long page_order = PAGE_SHIFT;
+
+       rc = gstage_level_to_page_order(level, &page_order);
+       if (rc)
+               return rc;
 
+       *out_pgsize = BIT(page_order);
        return 0;
 }
 
-static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr,
+static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr,
                                  pte_t **ptepp, u32 *ptep_level)
 {
        pte_t *ptep;
-       u32 current_level = stage2_pgd_levels - 1;
+       u32 current_level = gstage_pgd_levels - 1;
 
        *ptep_level = current_level;
        ptep = (pte_t *)kvm->arch.pgd;
-       ptep = &ptep[stage2_pte_index(addr, current_level)];
+       ptep = &ptep[gstage_pte_index(addr, current_level)];
        while (ptep && pte_val(*ptep)) {
-               if (stage2_pte_leaf(ptep)) {
+               if (gstage_pte_leaf(ptep)) {
                        *ptep_level = current_level;
                        *ptepp = ptep;
                        return true;
@@ -102,8 +113,8 @@ static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr,
                if (current_level) {
                        current_level--;
                        *ptep_level = current_level;
-                       ptep = (pte_t *)stage2_pte_page_vaddr(*ptep);
-                       ptep = &ptep[stage2_pte_index(addr, current_level)];
+                       ptep = (pte_t *)gstage_pte_page_vaddr(*ptep);
+                       ptep = &ptep[gstage_pte_index(addr, current_level)];
                } else {
                        ptep = NULL;
                }
@@ -112,38 +123,30 @@ static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr,
        return false;
 }
 
-static void stage2_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr)
+static void gstage_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr)
 {
-       unsigned long size = PAGE_SIZE;
-       struct kvm_vmid *vmid = &kvm->arch.vmid;
+       unsigned long order = PAGE_SHIFT;
 
-       if (stage2_level_to_page_size(level, &size))
+       if (gstage_level_to_page_order(level, &order))
                return;
-       addr &= ~(size - 1);
+       addr &= ~(BIT(order) - 1);
 
-       /*
-        * TODO: Instead of cpu_online_mask, we should only target CPUs
-        * where the Guest/VM is running.
-        */
-       preempt_disable();
-       sbi_remote_hfence_gvma_vmid(cpu_online_mask, addr, size,
-                                   READ_ONCE(vmid->vmid));
-       preempt_enable();
+       kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, addr, BIT(order), order);
 }
 
-static int stage2_set_pte(struct kvm *kvm, u32 level,
+static int gstage_set_pte(struct kvm *kvm, u32 level,
                           struct kvm_mmu_memory_cache *pcache,
                           gpa_t addr, const pte_t *new_pte)
 {
-       u32 current_level = stage2_pgd_levels - 1;
+       u32 current_level = gstage_pgd_levels - 1;
        pte_t *next_ptep = (pte_t *)kvm->arch.pgd;
-       pte_t *ptep = &next_ptep[stage2_pte_index(addr, current_level)];
+       pte_t *ptep = &next_ptep[gstage_pte_index(addr, current_level)];
 
        if (current_level < level)
                return -EINVAL;
 
        while (current_level != level) {
-               if (stage2_pte_leaf(ptep))
+               if (gstage_pte_leaf(ptep))
                        return -EEXIST;
 
                if (!pte_val(*ptep)) {
@@ -155,23 +158,23 @@ static int stage2_set_pte(struct kvm *kvm, u32 level,
                        *ptep = pfn_pte(PFN_DOWN(__pa(next_ptep)),
                                        __pgprot(_PAGE_TABLE));
                } else {
-                       if (stage2_pte_leaf(ptep))
+                       if (gstage_pte_leaf(ptep))
                                return -EEXIST;
-                       next_ptep = (pte_t *)stage2_pte_page_vaddr(*ptep);
+                       next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep);
                }
 
                current_level--;
-               ptep = &next_ptep[stage2_pte_index(addr, current_level)];
+               ptep = &next_ptep[gstage_pte_index(addr, current_level)];
        }
 
        *ptep = *new_pte;
-       if (stage2_pte_leaf(ptep))
-               stage2_remote_tlb_flush(kvm, current_level, addr);
+       if (gstage_pte_leaf(ptep))
+               gstage_remote_tlb_flush(kvm, current_level, addr);
 
        return 0;
 }
 
-static int stage2_map_page(struct kvm *kvm,
+static int gstage_map_page(struct kvm *kvm,
                           struct kvm_mmu_memory_cache *pcache,
                           gpa_t gpa, phys_addr_t hpa,
                           unsigned long page_size,
@@ -182,7 +185,7 @@ static int stage2_map_page(struct kvm *kvm,
        pte_t new_pte;
        pgprot_t prot;
 
-       ret = stage2_page_size_to_level(page_size, &level);
+       ret = gstage_page_size_to_level(page_size, &level);
        if (ret)
                return ret;
 
@@ -193,9 +196,9 @@ static int stage2_map_page(struct kvm *kvm,
         *    PTE so that software can update these bits.
         *
         * We support both options mentioned above. To achieve this, we
-        * always set 'A' and 'D' PTE bits at time of creating stage2
+        * always set 'A' and 'D' PTE bits at time of creating G-stage
         * mapping. To support KVM dirty page logging with both options
-        * mentioned above, we will write-protect stage2 PTEs to track
+        * mentioned above, we will write-protect G-stage PTEs to track
         * dirty pages.
         */
 
@@ -213,24 +216,24 @@ static int stage2_map_page(struct kvm *kvm,
        new_pte = pfn_pte(PFN_DOWN(hpa), prot);
        new_pte = pte_mkdirty(new_pte);
 
-       return stage2_set_pte(kvm, level, pcache, gpa, &new_pte);
+       return gstage_set_pte(kvm, level, pcache, gpa, &new_pte);
 }
 
-enum stage2_op {
-       STAGE2_OP_NOP = 0,      /* Nothing */
-       STAGE2_OP_CLEAR,        /* Clear/Unmap */
-       STAGE2_OP_WP,           /* Write-protect */
+enum gstage_op {
+       GSTAGE_OP_NOP = 0,      /* Nothing */
+       GSTAGE_OP_CLEAR,        /* Clear/Unmap */
+       GSTAGE_OP_WP,           /* Write-protect */
 };
 
-static void stage2_op_pte(struct kvm *kvm, gpa_t addr,
-                         pte_t *ptep, u32 ptep_level, enum stage2_op op)
+static void gstage_op_pte(struct kvm *kvm, gpa_t addr,
+                         pte_t *ptep, u32 ptep_level, enum gstage_op op)
 {
        int i, ret;
        pte_t *next_ptep;
        u32 next_ptep_level;
        unsigned long next_page_size, page_size;
 
-       ret = stage2_level_to_page_size(ptep_level, &page_size);
+       ret = gstage_level_to_page_size(ptep_level, &page_size);
        if (ret)
                return;
 
@@ -239,31 +242,31 @@ static void stage2_op_pte(struct kvm *kvm, gpa_t addr,
        if (!pte_val(*ptep))
                return;
 
-       if (ptep_level && !stage2_pte_leaf(ptep)) {
-               next_ptep = (pte_t *)stage2_pte_page_vaddr(*ptep);
+       if (ptep_level && !gstage_pte_leaf(ptep)) {
+               next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep);
                next_ptep_level = ptep_level - 1;
-               ret = stage2_level_to_page_size(next_ptep_level,
+               ret = gstage_level_to_page_size(next_ptep_level,
                                                &next_page_size);
                if (ret)
                        return;
 
-               if (op == STAGE2_OP_CLEAR)
+               if (op == GSTAGE_OP_CLEAR)
                        set_pte(ptep, __pte(0));
                for (i = 0; i < PTRS_PER_PTE; i++)
-                       stage2_op_pte(kvm, addr + i * next_page_size,
+                       gstage_op_pte(kvm, addr + i * next_page_size,
                                        &next_ptep[i], next_ptep_level, op);
-               if (op == STAGE2_OP_CLEAR)
+               if (op == GSTAGE_OP_CLEAR)
                        put_page(virt_to_page(next_ptep));
        } else {
-               if (op == STAGE2_OP_CLEAR)
+               if (op == GSTAGE_OP_CLEAR)
                        set_pte(ptep, __pte(0));
-               else if (op == STAGE2_OP_WP)
+               else if (op == GSTAGE_OP_WP)
                        set_pte(ptep, __pte(pte_val(*ptep) & ~_PAGE_WRITE));
-               stage2_remote_tlb_flush(kvm, ptep_level, addr);
+               gstage_remote_tlb_flush(kvm, ptep_level, addr);
        }
 }
 
-static void stage2_unmap_range(struct kvm *kvm, gpa_t start,
+static void gstage_unmap_range(struct kvm *kvm, gpa_t start,
                               gpa_t size, bool may_block)
 {
        int ret;
@@ -274,9 +277,9 @@ static void stage2_unmap_range(struct kvm *kvm, gpa_t start,
        gpa_t addr = start, end = start + size;
 
        while (addr < end) {
-               found_leaf = stage2_get_leaf_entry(kvm, addr,
+               found_leaf = gstage_get_leaf_entry(kvm, addr,
                                                   &ptep, &ptep_level);
-               ret = stage2_level_to_page_size(ptep_level, &page_size);
+               ret = gstage_level_to_page_size(ptep_level, &page_size);
                if (ret)
                        break;
 
@@ -284,8 +287,8 @@ static void stage2_unmap_range(struct kvm *kvm, gpa_t start,
                        goto next;
 
                if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
-                       stage2_op_pte(kvm, addr, ptep,
-                                     ptep_level, STAGE2_OP_CLEAR);
+                       gstage_op_pte(kvm, addr, ptep,
+                                     ptep_level, GSTAGE_OP_CLEAR);
 
 next:
                addr += page_size;
@@ -299,7 +302,7 @@ next:
        }
 }
 
-static void stage2_wp_range(struct kvm *kvm, gpa_t start, gpa_t end)
+static void gstage_wp_range(struct kvm *kvm, gpa_t start, gpa_t end)
 {
        int ret;
        pte_t *ptep;
@@ -309,9 +312,9 @@ static void stage2_wp_range(struct kvm *kvm, gpa_t start, gpa_t end)
        unsigned long page_size;
 
        while (addr < end) {
-               found_leaf = stage2_get_leaf_entry(kvm, addr,
+               found_leaf = gstage_get_leaf_entry(kvm, addr,
                                                   &ptep, &ptep_level);
-               ret = stage2_level_to_page_size(ptep_level, &page_size);
+               ret = gstage_level_to_page_size(ptep_level, &page_size);
                if (ret)
                        break;
 
@@ -319,15 +322,15 @@ static void stage2_wp_range(struct kvm *kvm, gpa_t start, gpa_t end)
                        goto next;
 
                if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
-                       stage2_op_pte(kvm, addr, ptep,
-                                     ptep_level, STAGE2_OP_WP);
+                       gstage_op_pte(kvm, addr, ptep,
+                                     ptep_level, GSTAGE_OP_WP);
 
 next:
                addr += page_size;
        }
 }
 
-static void stage2_wp_memory_region(struct kvm *kvm, int slot)
+static void gstage_wp_memory_region(struct kvm *kvm, int slot)
 {
        struct kvm_memslots *slots = kvm_memslots(kvm);
        struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
@@ -335,12 +338,12 @@ static void stage2_wp_memory_region(struct kvm *kvm, int slot)
        phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
 
        spin_lock(&kvm->mmu_lock);
-       stage2_wp_range(kvm, start, end);
+       gstage_wp_range(kvm, start, end);
        spin_unlock(&kvm->mmu_lock);
        kvm_flush_remote_tlbs(kvm);
 }
 
-static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
+static int gstage_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
                          unsigned long size, bool writable)
 {
        pte_t pte;
@@ -361,12 +364,12 @@ static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
                if (!writable)
                        pte = pte_wrprotect(pte);
 
-               ret = kvm_mmu_topup_memory_cache(&pcache, stage2_pgd_levels);
+               ret = kvm_mmu_topup_memory_cache(&pcache, gstage_pgd_levels);
                if (ret)
                        goto out;
 
                spin_lock(&kvm->mmu_lock);
-               ret = stage2_set_pte(kvm, 0, &pcache, addr, &pte);
+               ret = gstage_set_pte(kvm, 0, &pcache, addr, &pte);
                spin_unlock(&kvm->mmu_lock);
                if (ret)
                        goto out;
@@ -388,7 +391,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
        phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
        phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
 
-       stage2_wp_range(kvm, start, end);
+       gstage_wp_range(kvm, start, end);
 }
 
 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
@@ -411,7 +414,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
-       kvm_riscv_stage2_free_pgd(kvm);
+       kvm_riscv_gstage_free_pgd(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -421,7 +424,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
        phys_addr_t size = slot->npages << PAGE_SHIFT;
 
        spin_lock(&kvm->mmu_lock);
-       stage2_unmap_range(kvm, gpa, size, false);
+       gstage_unmap_range(kvm, gpa, size, false);
        spin_unlock(&kvm->mmu_lock);
 }
 
@@ -436,7 +439,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
         * the memory slot is write protected.
         */
        if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES)
-               stage2_wp_memory_region(kvm, new->id);
+               gstage_wp_memory_region(kvm, new->id);
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -458,7 +461,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
         * space addressable by the KVM guest GPA space.
         */
        if ((new->base_gfn + new->npages) >=
-           (stage2_gpa_size >> PAGE_SHIFT))
+           (gstage_gpa_size >> PAGE_SHIFT))
                return -EFAULT;
 
        hva = new->userspace_addr;
@@ -514,7 +517,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                goto out;
                        }
 
-                       ret = stage2_ioremap(kvm, gpa, pa,
+                       ret = gstage_ioremap(kvm, gpa, pa,
                                             vm_end - vm_start, writable);
                        if (ret)
                                break;
@@ -527,7 +530,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 
        spin_lock(&kvm->mmu_lock);
        if (ret)
-               stage2_unmap_range(kvm, base_gpa, size, false);
+               gstage_unmap_range(kvm, base_gpa, size, false);
        spin_unlock(&kvm->mmu_lock);
 
 out:
@@ -540,7 +543,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
        if (!kvm->arch.pgd)
                return false;
 
-       stage2_unmap_range(kvm, range->start << PAGE_SHIFT,
+       gstage_unmap_range(kvm, range->start << PAGE_SHIFT,
                           (range->end - range->start) << PAGE_SHIFT,
                           range->may_block);
        return false;
@@ -556,10 +559,10 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 
        WARN_ON(range->end - range->start != 1);
 
-       ret = stage2_map_page(kvm, NULL, range->start << PAGE_SHIFT,
+       ret = gstage_map_page(kvm, NULL, range->start << PAGE_SHIFT,
                              __pfn_to_phys(pfn), PAGE_SIZE, true, true);
        if (ret) {
-               kvm_debug("Failed to map stage2 page (error %d)\n", ret);
+               kvm_debug("Failed to map G-stage page (error %d)\n", ret);
                return true;
        }
 
@@ -577,7 +580,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 
        WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE);
 
-       if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
+       if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
                                   &ptep, &ptep_level))
                return false;
 
@@ -595,14 +598,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 
        WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE);
 
-       if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
+       if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
                                   &ptep, &ptep_level))
                return false;
 
        return pte_young(*ptep);
 }
 
-int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
+int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
                         struct kvm_memory_slot *memslot,
                         gpa_t gpa, unsigned long hva, bool is_write)
 {
@@ -648,9 +651,9 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
        }
 
        /* We need minimum second+third level pages */
-       ret = kvm_mmu_topup_memory_cache(pcache, stage2_pgd_levels);
+       ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels);
        if (ret) {
-               kvm_err("Failed to topup stage2 cache\n");
+               kvm_err("Failed to topup G-stage cache\n");
                return ret;
        }
 
@@ -680,15 +683,15 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
        if (writeable) {
                kvm_set_pfn_dirty(hfn);
                mark_page_dirty(kvm, gfn);
-               ret = stage2_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
+               ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
                                      vma_pagesize, false, true);
        } else {
-               ret = stage2_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
+               ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
                                      vma_pagesize, true, true);
        }
 
        if (ret)
-               kvm_err("Failed to map in stage2\n");
+               kvm_err("Failed to map in G-stage\n");
 
 out_unlock:
        spin_unlock(&kvm->mmu_lock);
@@ -697,7 +700,7 @@ out_unlock:
        return ret;
 }
 
-int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm)
+int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm)
 {
        struct page *pgd_page;
 
@@ -707,7 +710,7 @@ int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm)
        }
 
        pgd_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
-                               get_order(stage2_pgd_size));
+                               get_order(gstage_pgd_size));
        if (!pgd_page)
                return -ENOMEM;
        kvm->arch.pgd = page_to_virt(pgd_page);
@@ -716,13 +719,13 @@ int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm)
        return 0;
 }
 
-void kvm_riscv_stage2_free_pgd(struct kvm *kvm)
+void kvm_riscv_gstage_free_pgd(struct kvm *kvm)
 {
        void *pgd = NULL;
 
        spin_lock(&kvm->mmu_lock);
        if (kvm->arch.pgd) {
-               stage2_unmap_range(kvm, 0UL, stage2_gpa_size, false);
+               gstage_unmap_range(kvm, 0UL, gstage_gpa_size, false);
                pgd = READ_ONCE(kvm->arch.pgd);
                kvm->arch.pgd = NULL;
                kvm->arch.pgd_phys = 0;
@@ -730,12 +733,12 @@ void kvm_riscv_stage2_free_pgd(struct kvm *kvm)
        spin_unlock(&kvm->mmu_lock);
 
        if (pgd)
-               free_pages((unsigned long)pgd, get_order(stage2_pgd_size));
+               free_pages((unsigned long)pgd, get_order(gstage_pgd_size));
 }
 
-void kvm_riscv_stage2_update_hgatp(struct kvm_vcpu *vcpu)
+void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu)
 {
-       unsigned long hgatp = stage2_mode;
+       unsigned long hgatp = gstage_mode;
        struct kvm_arch *k = &vcpu->kvm->arch;
 
        hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) &
@@ -744,31 +747,40 @@ void kvm_riscv_stage2_update_hgatp(struct kvm_vcpu *vcpu)
 
        csr_write(CSR_HGATP, hgatp);
 
-       if (!kvm_riscv_stage2_vmid_bits())
-               __kvm_riscv_hfence_gvma_all();
+       if (!kvm_riscv_gstage_vmid_bits())
+               kvm_riscv_local_hfence_gvma_all();
 }
 
-void kvm_riscv_stage2_mode_detect(void)
+void kvm_riscv_gstage_mode_detect(void)
 {
 #ifdef CONFIG_64BIT
-       /* Try Sv48x4 stage2 mode */
+       /* Try Sv57x4 G-stage mode */
+       csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
+       if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
+               gstage_mode = (HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
+               gstage_pgd_levels = 5;
+               goto skip_sv48x4_test;
+       }
+
+       /* Try Sv48x4 G-stage mode */
        csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
        if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
-               stage2_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
-               stage2_pgd_levels = 4;
+               gstage_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
+               gstage_pgd_levels = 4;
        }
-       csr_write(CSR_HGATP, 0);
+skip_sv48x4_test:
 
-       __kvm_riscv_hfence_gvma_all();
+       csr_write(CSR_HGATP, 0);
+       kvm_riscv_local_hfence_gvma_all();
 #endif
 }
 
-unsigned long kvm_riscv_stage2_mode(void)
+unsigned long kvm_riscv_gstage_mode(void)
 {
-       return stage2_mode >> HGATP_MODE_SHIFT;
+       return gstage_mode >> HGATP_MODE_SHIFT;
 }
 
-int kvm_riscv_stage2_gpa_bits(void)
+int kvm_riscv_gstage_gpa_bits(void)
 {
-       return stage2_gpa_bits;
+       return gstage_gpa_bits;
 }
diff --git a/arch/riscv/kvm/tlb.S b/arch/riscv/kvm/tlb.S
deleted file mode 100644 (file)
index 899f75d..0000000
+++ /dev/null
@@ -1,74 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2019 Western Digital Corporation or its affiliates.
- *
- * Authors:
- *     Anup Patel <anup.patel@wdc.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-       .text
-       .altmacro
-       .option norelax
-
-       /*
-        * Instruction encoding of hfence.gvma is:
-        * HFENCE.GVMA rs1, rs2
-        * HFENCE.GVMA zero, rs2
-        * HFENCE.GVMA rs1
-        * HFENCE.GVMA
-        *
-        * rs1!=zero and rs2!=zero ==> HFENCE.GVMA rs1, rs2
-        * rs1==zero and rs2!=zero ==> HFENCE.GVMA zero, rs2
-        * rs1!=zero and rs2==zero ==> HFENCE.GVMA rs1
-        * rs1==zero and rs2==zero ==> HFENCE.GVMA
-        *
-        * Instruction encoding of HFENCE.GVMA is:
-        * 0110001 rs2(5) rs1(5) 000 00000 1110011
-        */
-
-ENTRY(__kvm_riscv_hfence_gvma_vmid_gpa)
-       /*
-        * rs1 = a0 (GPA >> 2)
-        * rs2 = a1 (VMID)
-        * HFENCE.GVMA a0, a1
-        * 0110001 01011 01010 000 00000 1110011
-        */
-       .word 0x62b50073
-       ret
-ENDPROC(__kvm_riscv_hfence_gvma_vmid_gpa)
-
-ENTRY(__kvm_riscv_hfence_gvma_vmid)
-       /*
-        * rs1 = zero
-        * rs2 = a0 (VMID)
-        * HFENCE.GVMA zero, a0
-        * 0110001 01010 00000 000 00000 1110011
-        */
-       .word 0x62a00073
-       ret
-ENDPROC(__kvm_riscv_hfence_gvma_vmid)
-
-ENTRY(__kvm_riscv_hfence_gvma_gpa)
-       /*
-        * rs1 = a0 (GPA >> 2)
-        * rs2 = zero
-        * HFENCE.GVMA a0
-        * 0110001 00000 01010 000 00000 1110011
-        */
-       .word 0x62050073
-       ret
-ENDPROC(__kvm_riscv_hfence_gvma_gpa)
-
-ENTRY(__kvm_riscv_hfence_gvma_all)
-       /*
-        * rs1 = zero
-        * rs2 = zero
-        * HFENCE.GVMA
-        * 0110001 00000 00000 000 00000 1110011
-        */
-       .word 0x62000073
-       ret
-ENDPROC(__kvm_riscv_hfence_gvma_all)
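The hand-assembled .word opcodes above (reused as inline asm in the new tlb.c below) follow directly from the documented encoding: funct7, rs2, rs1, funct3=000, rd=00000, opcode=1110011 (SYSTEM). A standalone sketch, not part of the patch, that recomputes the HFENCE.GVMA a0, a1 word from those fields:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t funct7 = 0x31;	/* 0110001 selects HFENCE.GVMA */
	uint32_t rs2    = 11;	/* a1 = x11 (VMID) */
	uint32_t rs1    = 10;	/* a0 = x10 (GPA >> 2) */
	uint32_t opcode = 0x73;	/* 1110011 (SYSTEM) */

	/* funct3 and rd are zero, so they contribute no set bits */
	uint32_t insn = (funct7 << 25) | (rs2 << 20) | (rs1 << 15) | opcode;

	printf("0x%08x\n", insn);	/* prints 0x62b50073 */
	return 0;
}

Dropping rs2 gives 0x62050073, dropping both registers gives 0x62000073, and switching funct7 to 0010001 (0x11) yields the corresponding HFENCE.VVMA words (0x22b50073 and so on), matching every .word used in this patch.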
diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c
new file mode 100644 (file)
index 0000000..1a76d0b
--- /dev/null
@@ -0,0 +1,461 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#include <linux/bitmap.h>
+#include <linux/cpumask.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/kvm_host.h>
+#include <asm/cacheflush.h>
+#include <asm/csr.h>
+
+/*
+ * Instruction encoding of hfence.gvma is:
+ * HFENCE.GVMA rs1, rs2
+ * HFENCE.GVMA zero, rs2
+ * HFENCE.GVMA rs1
+ * HFENCE.GVMA
+ *
+ * rs1!=zero and rs2!=zero ==> HFENCE.GVMA rs1, rs2
+ * rs1==zero and rs2!=zero ==> HFENCE.GVMA zero, rs2
+ * rs1!=zero and rs2==zero ==> HFENCE.GVMA rs1
+ * rs1==zero and rs2==zero ==> HFENCE.GVMA
+ *
+ * Instruction encoding of HFENCE.GVMA is:
+ * 0110001 rs2(5) rs1(5) 000 00000 1110011
+ */
+
+void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid,
+                                         gpa_t gpa, gpa_t gpsz,
+                                         unsigned long order)
+{
+       gpa_t pos;
+
+       if (PTRS_PER_PTE < (gpsz >> order)) {
+               kvm_riscv_local_hfence_gvma_vmid_all(vmid);
+               return;
+       }
+
+       for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) {
+               /*
+                * rs1 = a0 (GPA >> 2)
+                * rs2 = a1 (VMID)
+                * HFENCE.GVMA a0, a1
+                * 0110001 01011 01010 000 00000 1110011
+                */
+               asm volatile ("srli a0, %0, 2\n"
+                             "add a1, %1, zero\n"
+                             ".word 0x62b50073\n"
+                             :: "r" (pos), "r" (vmid)
+                             : "a0", "a1", "memory");
+       }
+}
+
+void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid)
+{
+       /*
+        * rs1 = zero
+        * rs2 = a0 (VMID)
+        * HFENCE.GVMA zero, a0
+        * 0110001 01010 00000 000 00000 1110011
+        */
+       asm volatile ("add a0, %0, zero\n"
+                     ".word 0x62a00073\n"
+                     :: "r" (vmid) : "a0", "memory");
+}
+
+void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz,
+                                    unsigned long order)
+{
+       gpa_t pos;
+
+       if (PTRS_PER_PTE < (gpsz >> order)) {
+               kvm_riscv_local_hfence_gvma_all();
+               return;
+       }
+
+       for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) {
+               /*
+                * rs1 = a0 (GPA >> 2)
+                * rs2 = zero
+                * HFENCE.GVMA a0
+                * 0110001 00000 01010 000 00000 1110011
+                */
+               asm volatile ("srli a0, %0, 2\n"
+                             ".word 0x62050073\n"
+                             :: "r" (pos) : "a0", "memory");
+       }
+}
+
+void kvm_riscv_local_hfence_gvma_all(void)
+{
+       /*
+        * rs1 = zero
+        * rs2 = zero
+        * HFENCE.GVMA
+        * 0110001 00000 00000 000 00000 1110011
+        */
+       asm volatile (".word 0x62000073" ::: "memory");
+}
+
+/*
+ * Instruction encoding of hfence.vvma is:
+ * HFENCE.VVMA rs1, rs2
+ * HFENCE.VVMA zero, rs2
+ * HFENCE.VVMA rs1
+ * HFENCE.VVMA
+ *
+ * rs1!=zero and rs2!=zero ==> HFENCE.VVMA rs1, rs2
+ * rs1==zero and rs2!=zero ==> HFENCE.VVMA zero, rs2
+ * rs1!=zero and rs2==zero ==> HFENCE.VVMA rs1
+ * rs1==zero and rs2==zero ==> HFENCE.VVMA
+ *
+ * Instruction encoding of HFENCE.VVMA is:
+ * 0010001 rs2(5) rs1(5) 000 00000 1110011
+ */
+
+void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid,
+                                         unsigned long asid,
+                                         unsigned long gva,
+                                         unsigned long gvsz,
+                                         unsigned long order)
+{
+       unsigned long pos, hgatp;
+
+       if (PTRS_PER_PTE < (gvsz >> order)) {
+               kvm_riscv_local_hfence_vvma_asid_all(vmid, asid);
+               return;
+       }
+
+       hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT);
+
+       for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) {
+               /*
+                * rs1 = a0 (GVA)
+                * rs2 = a1 (ASID)
+                * HFENCE.VVMA a0, a1
+                * 0010001 01011 01010 000 00000 1110011
+                */
+               asm volatile ("add a0, %0, zero\n"
+                             "add a1, %1, zero\n"
+                             ".word 0x22b50073\n"
+                             :: "r" (pos), "r" (asid)
+                             : "a0", "a1", "memory");
+       }
+
+       csr_write(CSR_HGATP, hgatp);
+}
+
+void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid,
+                                         unsigned long asid)
+{
+       unsigned long hgatp;
+
+       hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT);
+
+       /*
+        * rs1 = zero
+        * rs2 = a0 (ASID)
+        * HFENCE.VVMA zero, a0
+        * 0010001 01010 00000 000 00000 1110011
+        */
+       asm volatile ("add a0, %0, zero\n"
+                     ".word 0x22a00073\n"
+                     :: "r" (asid) : "a0", "memory");
+
+       csr_write(CSR_HGATP, hgatp);
+}
+
+void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid,
+                                    unsigned long gva, unsigned long gvsz,
+                                    unsigned long order)
+{
+       unsigned long pos, hgatp;
+
+       if (PTRS_PER_PTE < (gvsz >> order)) {
+               kvm_riscv_local_hfence_vvma_all(vmid);
+               return;
+       }
+
+       hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT);
+
+       for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) {
+               /*
+                * rs1 = a0 (GVA)
+                * rs2 = zero
+                * HFENCE.VVMA a0
+                * 0010001 00000 01010 000 00000 1110011
+                */
+               asm volatile ("add a0, %0, zero\n"
+                             ".word 0x22050073\n"
+                             :: "r" (pos) : "a0", "memory");
+       }
+
+       csr_write(CSR_HGATP, hgatp);
+}
+
+void kvm_riscv_local_hfence_vvma_all(unsigned long vmid)
+{
+       unsigned long hgatp;
+
+       hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT);
+
+       /*
+        * rs1 = zero
+        * rs2 = zero
+        * HFENCE.VVMA
+        * 0010001 00000 00000 000 00000 1110011
+        */
+       asm volatile (".word 0x22000073" ::: "memory");
+
+       csr_write(CSR_HGATP, hgatp);
+}
+
+void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu)
+{
+       unsigned long vmid;
+
+       if (!kvm_riscv_gstage_vmid_bits() ||
+           vcpu->arch.last_exit_cpu == vcpu->cpu)
+               return;
+
+       /*
+        * On RISC-V platforms with hardware VMID support, we share the same
+        * VMID across all VCPUs of a particular Guest/VM. This means the
+        * current Host CPU might have stale G-stage TLB entries left behind
+        * by some other VCPU of the same Guest which previously ran on it.
+        *
+        * To clean up such stale TLB entries, we simply flush all G-stage
+        * TLB entries by VMID whenever the underlying Host CPU changes for
+        * a VCPU.
+        */
+
+       vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
+       kvm_riscv_local_hfence_gvma_vmid_all(vmid);
+}
+
+void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu)
+{
+       local_flush_icache_all();
+}
+
+void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu)
+{
+       struct kvm_vmid *vmid;
+
+       vmid = &vcpu->kvm->arch.vmid;
+       kvm_riscv_local_hfence_gvma_vmid_all(READ_ONCE(vmid->vmid));
+}
+
+void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu)
+{
+       struct kvm_vmid *vmid;
+
+       vmid = &vcpu->kvm->arch.vmid;
+       kvm_riscv_local_hfence_vvma_all(READ_ONCE(vmid->vmid));
+}
+
+static bool vcpu_hfence_dequeue(struct kvm_vcpu *vcpu,
+                               struct kvm_riscv_hfence *out_data)
+{
+       bool ret = false;
+       struct kvm_vcpu_arch *varch = &vcpu->arch;
+
+       spin_lock(&varch->hfence_lock);
+
+       if (varch->hfence_queue[varch->hfence_head].type) {
+               memcpy(out_data, &varch->hfence_queue[varch->hfence_head],
+                      sizeof(*out_data));
+               varch->hfence_queue[varch->hfence_head].type = 0;
+
+               varch->hfence_head++;
+               if (varch->hfence_head == KVM_RISCV_VCPU_MAX_HFENCE)
+                       varch->hfence_head = 0;
+
+               ret = true;
+       }
+
+       spin_unlock(&varch->hfence_lock);
+
+       return ret;
+}
+
+static bool vcpu_hfence_enqueue(struct kvm_vcpu *vcpu,
+                               const struct kvm_riscv_hfence *data)
+{
+       bool ret = false;
+       struct kvm_vcpu_arch *varch = &vcpu->arch;
+
+       spin_lock(&varch->hfence_lock);
+
+       if (!varch->hfence_queue[varch->hfence_tail].type) {
+               memcpy(&varch->hfence_queue[varch->hfence_tail],
+                      data, sizeof(*data));
+
+               varch->hfence_tail++;
+               if (varch->hfence_tail == KVM_RISCV_VCPU_MAX_HFENCE)
+                       varch->hfence_tail = 0;
+
+               ret = true;
+       }
+
+       spin_unlock(&varch->hfence_lock);
+
+       return ret;
+}
+
+void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu)
+{
+       struct kvm_riscv_hfence d = { 0 };
+       struct kvm_vmid *v = &vcpu->kvm->arch.vmid;
+
+       while (vcpu_hfence_dequeue(vcpu, &d)) {
+               switch (d.type) {
+               case KVM_RISCV_HFENCE_UNKNOWN:
+                       break;
+               case KVM_RISCV_HFENCE_GVMA_VMID_GPA:
+                       kvm_riscv_local_hfence_gvma_vmid_gpa(
+                                               READ_ONCE(v->vmid),
+                                               d.addr, d.size, d.order);
+                       break;
+               case KVM_RISCV_HFENCE_VVMA_ASID_GVA:
+                       kvm_riscv_local_hfence_vvma_asid_gva(
+                                               READ_ONCE(v->vmid), d.asid,
+                                               d.addr, d.size, d.order);
+                       break;
+               case KVM_RISCV_HFENCE_VVMA_ASID_ALL:
+                       kvm_riscv_local_hfence_vvma_asid_all(
+                                               READ_ONCE(v->vmid), d.asid);
+                       break;
+               case KVM_RISCV_HFENCE_VVMA_GVA:
+                       kvm_riscv_local_hfence_vvma_gva(
+                                               READ_ONCE(v->vmid),
+                                               d.addr, d.size, d.order);
+                       break;
+               default:
+                       break;
+               }
+       }
+}
+
+static void make_xfence_request(struct kvm *kvm,
+                               unsigned long hbase, unsigned long hmask,
+                               unsigned int req, unsigned int fallback_req,
+                               const struct kvm_riscv_hfence *data)
+{
+       unsigned long i;
+       struct kvm_vcpu *vcpu;
+       unsigned int actual_req = req;
+       DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
+
+       bitmap_clear(vcpu_mask, 0, KVM_MAX_VCPUS);
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (hbase != -1UL) {
+                       if (vcpu->vcpu_id < hbase)
+                               continue;
+                       if (!(hmask & (1UL << (vcpu->vcpu_id - hbase))))
+                               continue;
+               }
+
+               bitmap_set(vcpu_mask, i, 1);
+
+               if (!data || !data->type)
+                       continue;
+
+               /*
+                * Enqueue hfence data onto the VCPU hfence queue. If there is
+                * no space left in the VCPU hfence queue, fall back to a more
+                * conservative hfence request.
+                */
+               if (!vcpu_hfence_enqueue(vcpu, data))
+                       actual_req = fallback_req;
+       }
+
+       kvm_make_vcpus_request_mask(kvm, actual_req, vcpu_mask);
+}
+
+void kvm_riscv_fence_i(struct kvm *kvm,
+                      unsigned long hbase, unsigned long hmask)
+{
+       make_xfence_request(kvm, hbase, hmask, KVM_REQ_FENCE_I,
+                           KVM_REQ_FENCE_I, NULL);
+}
+
+void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm,
+                                   unsigned long hbase, unsigned long hmask,
+                                   gpa_t gpa, gpa_t gpsz,
+                                   unsigned long order)
+{
+       struct kvm_riscv_hfence data;
+
+       data.type = KVM_RISCV_HFENCE_GVMA_VMID_GPA;
+       data.asid = 0;
+       data.addr = gpa;
+       data.size = gpsz;
+       data.order = order;
+       make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
+                           KVM_REQ_HFENCE_GVMA_VMID_ALL, &data);
+}
+
+void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm,
+                                   unsigned long hbase, unsigned long hmask)
+{
+       make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_GVMA_VMID_ALL,
+                           KVM_REQ_HFENCE_GVMA_VMID_ALL, NULL);
+}
+
+void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm,
+                                   unsigned long hbase, unsigned long hmask,
+                                   unsigned long gva, unsigned long gvsz,
+                                   unsigned long order, unsigned long asid)
+{
+       struct kvm_riscv_hfence data;
+
+       data.type = KVM_RISCV_HFENCE_VVMA_ASID_GVA;
+       data.asid = asid;
+       data.addr = gva;
+       data.size = gvsz;
+       data.order = order;
+       make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
+                           KVM_REQ_HFENCE_VVMA_ALL, &data);
+}
+
+void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm,
+                                   unsigned long hbase, unsigned long hmask,
+                                   unsigned long asid)
+{
+       struct kvm_riscv_hfence data;
+
+       data.type = KVM_RISCV_HFENCE_VVMA_ASID_ALL;
+       data.asid = asid;
+       data.addr = data.size = data.order = 0;
+       make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
+                           KVM_REQ_HFENCE_VVMA_ALL, &data);
+}
+
+void kvm_riscv_hfence_vvma_gva(struct kvm *kvm,
+                              unsigned long hbase, unsigned long hmask,
+                              unsigned long gva, unsigned long gvsz,
+                              unsigned long order)
+{
+       struct kvm_riscv_hfence data;
+
+       data.type = KVM_RISCV_HFENCE_VVMA_GVA;
+       data.asid = 0;
+       data.addr = gva;
+       data.size = gvsz;
+       data.order = order;
+       make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
+                           KVM_REQ_HFENCE_VVMA_ALL, &data);
+}
+
+void kvm_riscv_hfence_vvma_all(struct kvm *kvm,
+                              unsigned long hbase, unsigned long hmask)
+{
+       make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_VVMA_ALL,
+                           KVM_REQ_HFENCE_VVMA_ALL, NULL);
+}
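The per-VCPU hfence queue that the helpers above fill and drain is a fixed ring in which a zero type field (KVM_RISCV_HFENCE_UNKNOWN) marks a free slot, so head and tail indices are enough: a full queue shows up as the tail slot still being occupied, which is what makes make_xfence_request() fall back to the more conservative request. A standalone model of that convention, with hypothetical names and the spinlock omitted:

#include <stdbool.h>

#define QSZ 4			/* KVM uses KVM_RISCV_VCPU_MAX_HFENCE (64) */

struct req { int type; unsigned long addr; };	/* type == 0 means the slot is free */

static struct req queue[QSZ];
static unsigned int head, tail;

static bool enqueue(const struct req *r)
{
	if (queue[tail].type)		/* tail slot still in use: queue full */
		return false;		/* caller falls back to a full flush */
	queue[tail] = *r;
	tail = (tail + 1) % QSZ;
	return true;
}

static bool dequeue(struct req *out)
{
	if (!queue[head].type)		/* head slot empty: queue empty */
		return false;
	*out = queue[head];
	queue[head].type = 0;		/* release the slot */
	head = (head + 1) % QSZ;
	return true;
}

Because KVM_RISCV_HFENCE_UNKNOWN is zero, simply zeroing the queue (as kvm_riscv_reset_vcpu() does in the vcpu.c changes below) leaves it in the empty state.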
index 7461f96..7f4ad5e 100644 (file)
@@ -67,6 +67,8 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu)
        if (loaded)
                kvm_arch_vcpu_put(vcpu);
 
+       vcpu->arch.last_exit_cpu = -1;
+
        memcpy(csr, reset_csr, sizeof(*csr));
 
        memcpy(cntx, reset_cntx, sizeof(*cntx));
@@ -78,6 +80,10 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu)
        WRITE_ONCE(vcpu->arch.irqs_pending, 0);
        WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0);
 
+       vcpu->arch.hfence_head = 0;
+       vcpu->arch.hfence_tail = 0;
+       memset(vcpu->arch.hfence_queue, 0, sizeof(vcpu->arch.hfence_queue));
+
        /* Reset the guest CSRs for hotplug usecase */
        if (loaded)
                kvm_arch_vcpu_load(vcpu, smp_processor_id());
@@ -101,6 +107,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        /* Setup ISA features available to VCPU */
        vcpu->arch.isa = riscv_isa_extension_base(NULL) & KVM_RISCV_ISA_ALLOWED;
 
+       /* Setup VCPU hfence queue */
+       spin_lock_init(&vcpu->arch.hfence_lock);
+
        /* Setup reset state of shadow SSTATUS and HSTATUS CSRs */
        cntx = &vcpu->arch.guest_reset_context;
        cntx->sstatus = SR_SPP | SR_SPIE;
@@ -137,7 +146,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        /* Cleanup VCPU timer */
        kvm_riscv_vcpu_timer_deinit(vcpu);
 
-       /* Free unused pages pre-allocated for Stage2 page table mappings */
+       /* Free unused pages pre-allocated for G-stage page table mappings */
        kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
 }
 
@@ -365,6 +374,101 @@ static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+/* Mapping between KVM ISA Extension ID & Host ISA extension ID */
+static unsigned long kvm_isa_ext_arr[] = {
+       RISCV_ISA_EXT_a,
+       RISCV_ISA_EXT_c,
+       RISCV_ISA_EXT_d,
+       RISCV_ISA_EXT_f,
+       RISCV_ISA_EXT_h,
+       RISCV_ISA_EXT_i,
+       RISCV_ISA_EXT_m,
+};
+
+static int kvm_riscv_vcpu_get_reg_isa_ext(struct kvm_vcpu *vcpu,
+                                         const struct kvm_one_reg *reg)
+{
+       unsigned long __user *uaddr =
+                       (unsigned long __user *)(unsigned long)reg->addr;
+       unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+                                           KVM_REG_SIZE_MASK |
+                                           KVM_REG_RISCV_ISA_EXT);
+       unsigned long reg_val = 0;
+       unsigned long host_isa_ext;
+
+       if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+               return -EINVAL;
+
+       if (reg_num >= KVM_RISCV_ISA_EXT_MAX || reg_num >= ARRAY_SIZE(kvm_isa_ext_arr))
+               return -EINVAL;
+
+       host_isa_ext = kvm_isa_ext_arr[reg_num];
+       if (__riscv_isa_extension_available(&vcpu->arch.isa, host_isa_ext))
+               reg_val = 1; /* Mark the given extension as available */
+
+       if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
+               return -EFAULT;
+
+       return 0;
+}
+
+static int kvm_riscv_vcpu_set_reg_isa_ext(struct kvm_vcpu *vcpu,
+                                         const struct kvm_one_reg *reg)
+{
+       unsigned long __user *uaddr =
+                       (unsigned long __user *)(unsigned long)reg->addr;
+       unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+                                           KVM_REG_SIZE_MASK |
+                                           KVM_REG_RISCV_ISA_EXT);
+       unsigned long reg_val;
+       unsigned long host_isa_ext;
+       unsigned long host_isa_ext_mask;
+
+       if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+               return -EINVAL;
+
+       if (reg_num >= KVM_RISCV_ISA_EXT_MAX || reg_num >= ARRAY_SIZE(kvm_isa_ext_arr))
+               return -EINVAL;
+
+       if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
+               return -EFAULT;
+
+       host_isa_ext = kvm_isa_ext_arr[reg_num];
+       if (!__riscv_isa_extension_available(NULL, host_isa_ext))
+               return  -EOPNOTSUPP;
+
+       if (host_isa_ext >= RISCV_ISA_EXT_BASE &&
+           host_isa_ext < RISCV_ISA_EXT_MAX) {
+               /*
+                * Multi-letter ISA extension. Currently there is no provision
+                * to enable/disable the multi-letter ISA extensions for guests.
+                * Return success if the request is to enable any ISA extension
+                * that is available in the hardware.
+                * Return -EOPNOTSUPP otherwise.
+                */
+               if (!reg_val)
+                       return -EOPNOTSUPP;
+               else
+                       return 0;
+       }
+
+       /* Single letter base ISA extension */
+       if (!vcpu->arch.ran_atleast_once) {
+               host_isa_ext_mask = BIT_MASK(host_isa_ext);
+               if (!reg_val && (host_isa_ext_mask & KVM_RISCV_ISA_DISABLE_ALLOWED))
+                       vcpu->arch.isa &= ~host_isa_ext_mask;
+               else
+                       vcpu->arch.isa |= host_isa_ext_mask;
+               vcpu->arch.isa &= riscv_isa_extension_base(NULL);
+               vcpu->arch.isa &= KVM_RISCV_ISA_ALLOWED;
+               kvm_riscv_vcpu_fp_reset(vcpu);
+       } else {
+               return -EOPNOTSUPP;
+       }
+
+       return 0;
+}
+
 static int kvm_riscv_vcpu_set_reg(struct kvm_vcpu *vcpu,
                                  const struct kvm_one_reg *reg)
 {
@@ -382,6 +486,8 @@ static int kvm_riscv_vcpu_set_reg(struct kvm_vcpu *vcpu,
        else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_FP_D)
                return kvm_riscv_vcpu_set_reg_fp(vcpu, reg,
                                                 KVM_REG_RISCV_FP_D);
+       else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_ISA_EXT)
+               return kvm_riscv_vcpu_set_reg_isa_ext(vcpu, reg);
 
        return -EINVAL;
 }
@@ -403,6 +509,8 @@ static int kvm_riscv_vcpu_get_reg(struct kvm_vcpu *vcpu,
        else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_FP_D)
                return kvm_riscv_vcpu_get_reg_fp(vcpu, reg,
                                                 KVM_REG_RISCV_FP_D);
+       else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_ISA_EXT)
+               return kvm_riscv_vcpu_get_reg_isa_ext(vcpu, reg);
 
        return -EINVAL;
 }
@@ -635,7 +743,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        csr_write(CSR_HVIP, csr->hvip);
        csr_write(CSR_VSATP, csr->vsatp);
 
-       kvm_riscv_stage2_update_hgatp(vcpu);
+       kvm_riscv_gstage_update_hgatp(vcpu);
 
        kvm_riscv_vcpu_timer_restore(vcpu);
 
@@ -690,10 +798,23 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu)
                        kvm_riscv_reset_vcpu(vcpu);
 
                if (kvm_check_request(KVM_REQ_UPDATE_HGATP, vcpu))
-                       kvm_riscv_stage2_update_hgatp(vcpu);
+                       kvm_riscv_gstage_update_hgatp(vcpu);
 
-               if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
-                       __kvm_riscv_hfence_gvma_all();
+               if (kvm_check_request(KVM_REQ_FENCE_I, vcpu))
+                       kvm_riscv_fence_i_process(vcpu);
+
+               /*
+                * The generic KVM_REQ_TLB_FLUSH is the same as
+                * KVM_REQ_HFENCE_GVMA_VMID_ALL.
+                */
+               if (kvm_check_request(KVM_REQ_HFENCE_GVMA_VMID_ALL, vcpu))
+                       kvm_riscv_hfence_gvma_vmid_all_process(vcpu);
+
+               if (kvm_check_request(KVM_REQ_HFENCE_VVMA_ALL, vcpu))
+                       kvm_riscv_hfence_vvma_all_process(vcpu);
+
+               if (kvm_check_request(KVM_REQ_HFENCE, vcpu))
+                       kvm_riscv_hfence_process(vcpu);
        }
 }
 
@@ -715,6 +836,7 @@ static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu)
 {
        guest_state_enter_irqoff();
        __kvm_riscv_switch_to(&vcpu->arch);
+       vcpu->arch.last_exit_cpu = vcpu->cpu;
        guest_state_exit_irqoff();
 }
 
@@ -762,7 +884,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                /* Check conditions before entering the guest */
                cond_resched();
 
-               kvm_riscv_stage2_vmid_update(vcpu);
+               kvm_riscv_gstage_vmid_update(vcpu);
 
                kvm_riscv_check_vcpu_requests(vcpu);
 
@@ -800,7 +922,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                kvm_riscv_update_hvip(vcpu);
 
                if (ret <= 0 ||
-                   kvm_riscv_stage2_vmid_ver_changed(&vcpu->kvm->arch.vmid) ||
+                   kvm_riscv_gstage_vmid_ver_changed(&vcpu->kvm->arch.vmid) ||
                    kvm_request_pending(vcpu)) {
                        vcpu->mode = OUTSIDE_GUEST_MODE;
                        local_irq_enable();
@@ -809,6 +931,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                        continue;
                }
 
+               /*
+                * Clean up stale TLB entries
+                *
+                * Note: This should be done after the G-stage VMID has been
+                * updated by kvm_riscv_gstage_vmid_update().
+                */
+               kvm_riscv_local_tlb_sanitize(vcpu);
+
                guest_timing_enter_irqoff();
 
                kvm_riscv_vcpu_enter_exit(vcpu);
index a72c15d..dbb09af 100644 (file)
@@ -412,7 +412,7 @@ static int emulate_store(struct kvm_vcpu *vcpu, struct kvm_run *run,
        return 0;
 }
 
-static int stage2_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
+static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
                             struct kvm_cpu_trap *trap)
 {
        struct kvm_memory_slot *memslot;
@@ -440,7 +440,7 @@ static int stage2_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
                };
        }
 
-       ret = kvm_riscv_stage2_map(vcpu, memslot, fault_addr, hva,
+       ret = kvm_riscv_gstage_map(vcpu, memslot, fault_addr, hva,
                (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false);
        if (ret < 0)
                return ret;
@@ -686,7 +686,7 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
        case EXC_LOAD_GUEST_PAGE_FAULT:
        case EXC_STORE_GUEST_PAGE_FAULT:
                if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV)
-                       ret = stage2_page_fault(vcpu, run, trap);
+                       ret = gstage_page_fault(vcpu, run, trap);
                break;
        case EXC_SUPERVISOR_SYSCALL:
                if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV)
index 0f21736..4c034d8 100644 (file)
@@ -81,43 +81,41 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
                                      struct kvm_cpu_trap *utrap, bool *exit)
 {
        int ret = 0;
-       unsigned long i;
-       struct cpumask cm;
-       struct kvm_vcpu *tmp;
        struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
        unsigned long hmask = cp->a0;
        unsigned long hbase = cp->a1;
        unsigned long funcid = cp->a6;
 
-       cpumask_clear(&cm);
-       kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
-               if (hbase != -1UL) {
-                       if (tmp->vcpu_id < hbase)
-                               continue;
-                       if (!(hmask & (1UL << (tmp->vcpu_id - hbase))))
-                               continue;
-               }
-               if (tmp->cpu < 0)
-                       continue;
-               cpumask_set_cpu(tmp->cpu, &cm);
-       }
-
        switch (funcid) {
        case SBI_EXT_RFENCE_REMOTE_FENCE_I:
-               ret = sbi_remote_fence_i(&cm);
+               kvm_riscv_fence_i(vcpu->kvm, hbase, hmask);
                break;
        case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
-               ret = sbi_remote_hfence_vvma(&cm, cp->a2, cp->a3);
+               if (cp->a2 == 0 && cp->a3 == 0)
+                       kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask);
+               else
+                       kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask,
+                                                 cp->a2, cp->a3, PAGE_SHIFT);
                break;
        case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
-               ret = sbi_remote_hfence_vvma_asid(&cm, cp->a2,
-                                                 cp->a3, cp->a4);
+               if (cp->a2 == 0 && cp->a3 == 0)
+                       kvm_riscv_hfence_vvma_asid_all(vcpu->kvm,
+                                                      hbase, hmask, cp->a4);
+               else
+                       kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm,
+                                                      hbase, hmask,
+                                                      cp->a2, cp->a3,
+                                                      PAGE_SHIFT, cp->a4);
                break;
        case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA:
        case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID:
        case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA:
        case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID:
-       /* TODO: implement for nested hypervisor case */
+               /*
+                * Until nested virtualization is implemented, the
+                * SBI HFENCE calls should be treated as NOPs
+                */
+               break;
        default:
                ret = -EOPNOTSUPP;
        }
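
For reference, a hedged sketch of the hart-mask convention that the new kvm_riscv_fence_i()/kvm_riscv_hfence_*() helpers take over from the cpumask loop deleted above: hbase == -1UL targets every VCPU, otherwise a VCPU is targeted when its bit relative to hbase is set. The helper name is illustrative only.

static bool demo_hart_mask_has_vcpu(unsigned long hbase, unsigned long hmask,
				    unsigned long vcpu_id)
{
	if (hbase == -1UL)
		return true;	/* no base/mask filter: all harts targeted */
	/* Range check also keeps the shift below within BITS_PER_LONG. */
	if (vcpu_id < hbase || vcpu_id >= hbase + BITS_PER_LONG)
		return false;
	return !!(hmask & (1UL << (vcpu_id - hbase)));
}
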
index da4d6c9..8a91a14 100644 (file)
@@ -23,7 +23,6 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
        int i, ret = 0;
        u64 next_cycle;
        struct kvm_vcpu *rvcpu;
-       struct cpumask cm;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
 
@@ -80,19 +79,29 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
                if (utrap->scause)
                        break;
 
-               cpumask_clear(&cm);
-               for_each_set_bit(i, &hmask, BITS_PER_LONG) {
-                       rvcpu = kvm_get_vcpu_by_id(vcpu->kvm, i);
-                       if (rvcpu->cpu < 0)
-                               continue;
-                       cpumask_set_cpu(rvcpu->cpu, &cm);
-               }
                if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I)
-                       ret = sbi_remote_fence_i(&cm);
-               else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA)
-                       ret = sbi_remote_hfence_vvma(&cm, cp->a1, cp->a2);
-               else
-                       ret = sbi_remote_hfence_vvma_asid(&cm, cp->a1, cp->a2, cp->a3);
+                       kvm_riscv_fence_i(vcpu->kvm, 0, hmask);
+               else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) {
+                       if (cp->a1 == 0 && cp->a2 == 0)
+                               kvm_riscv_hfence_vvma_all(vcpu->kvm,
+                                                         0, hmask);
+                       else
+                               kvm_riscv_hfence_vvma_gva(vcpu->kvm,
+                                                         0, hmask,
+                                                         cp->a1, cp->a2,
+                                                         PAGE_SHIFT);
+               } else {
+                       if (cp->a1 == 0 && cp->a2 == 0)
+                               kvm_riscv_hfence_vvma_asid_all(vcpu->kvm,
+                                                              0, hmask,
+                                                              cp->a3);
+                       else
+                               kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm,
+                                                              0, hmask,
+                                                              cp->a1, cp->a2,
+                                                              PAGE_SHIFT,
+                                                              cp->a3);
+               }
                break;
        default:
                ret = -EINVAL;
index c768f75..945a2bf 100644 (file)
@@ -31,13 +31,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
        int r;
 
-       r = kvm_riscv_stage2_alloc_pgd(kvm);
+       r = kvm_riscv_gstage_alloc_pgd(kvm);
        if (r)
                return r;
 
-       r = kvm_riscv_stage2_vmid_init(kvm);
+       r = kvm_riscv_gstage_vmid_init(kvm);
        if (r) {
-               kvm_riscv_stage2_free_pgd(kvm);
+               kvm_riscv_gstage_free_pgd(kvm);
                return r;
        }
 
@@ -75,7 +75,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = KVM_USER_MEM_SLOTS;
                break;
        case KVM_CAP_VM_GPA_BITS:
-               r = kvm_riscv_stage2_gpa_bits();
+               r = kvm_riscv_gstage_gpa_bits();
                break;
        default:
                r = 0;
index 2fa4f7b..9f764df 100644 (file)
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/module.h>
+#include <linux/smp.h>
 #include <linux/kvm_host.h>
 #include <asm/csr.h>
-#include <asm/sbi.h>
 
 static unsigned long vmid_version = 1;
 static unsigned long vmid_next;
 static unsigned long vmid_bits;
 static DEFINE_SPINLOCK(vmid_lock);
 
-void kvm_riscv_stage2_vmid_detect(void)
+void kvm_riscv_gstage_vmid_detect(void)
 {
        unsigned long old;
 
@@ -33,19 +33,19 @@ void kvm_riscv_stage2_vmid_detect(void)
        csr_write(CSR_HGATP, old);
 
        /* We polluted local TLB so flush all guest TLB */
-       __kvm_riscv_hfence_gvma_all();
+       kvm_riscv_local_hfence_gvma_all();
 
        /* We don't use VMID bits if they are not sufficient */
        if ((1UL << vmid_bits) < num_possible_cpus())
                vmid_bits = 0;
 }
 
-unsigned long kvm_riscv_stage2_vmid_bits(void)
+unsigned long kvm_riscv_gstage_vmid_bits(void)
 {
        return vmid_bits;
 }
 
-int kvm_riscv_stage2_vmid_init(struct kvm *kvm)
+int kvm_riscv_gstage_vmid_init(struct kvm *kvm)
 {
        /* Mark the initial VMID and VMID version invalid */
        kvm->arch.vmid.vmid_version = 0;
@@ -54,7 +54,7 @@ int kvm_riscv_stage2_vmid_init(struct kvm *kvm)
        return 0;
 }
 
-bool kvm_riscv_stage2_vmid_ver_changed(struct kvm_vmid *vmid)
+bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid)
 {
        if (!vmid_bits)
                return false;
@@ -63,13 +63,18 @@ bool kvm_riscv_stage2_vmid_ver_changed(struct kvm_vmid *vmid)
                        READ_ONCE(vmid_version));
 }
 
-void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu)
+static void __local_hfence_gvma_all(void *info)
+{
+       kvm_riscv_local_hfence_gvma_all();
+}
+
+void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu)
 {
        unsigned long i;
        struct kvm_vcpu *v;
        struct kvm_vmid *vmid = &vcpu->kvm->arch.vmid;
 
-       if (!kvm_riscv_stage2_vmid_ver_changed(vmid))
+       if (!kvm_riscv_gstage_vmid_ver_changed(vmid))
                return;
 
        spin_lock(&vmid_lock);
@@ -78,7 +83,7 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu)
         * We need to re-check the vmid_version here to see whether
         * another vcpu has already allocated a valid vmid for this vm.
         */
-       if (!kvm_riscv_stage2_vmid_ver_changed(vmid)) {
+       if (!kvm_riscv_gstage_vmid_ver_changed(vmid)) {
                spin_unlock(&vmid_lock);
                return;
        }
@@ -96,12 +101,13 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu)
                 * instances is invalid and we have to force VMID re-assignment
                 * for all Guest instances. The Guest instances that were not
                 * running will automatically pick up new VMIDs because they will
-                * call kvm_riscv_stage2_vmid_update() whenever they enter
+                * call kvm_riscv_gstage_vmid_update() whenever they enter
                 * in-kernel run loop. For Guest instances that are already
                 * running, we force VM exits on all host CPUs using IPI and
                 * flush all Guest TLBs.
                 */
-               sbi_remote_hfence_gvma(cpu_online_mask, 0, 0);
+               on_each_cpu_mask(cpu_online_mask, __local_hfence_gvma_all,
+                                NULL, 1);
        }
 
        vmid->vmid = vmid_next;
@@ -112,7 +118,7 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu)
 
        spin_unlock(&vmid_lock);
 
-       /* Request stage2 page table update for all VCPUs */
+       /* Request G-stage page table update for all VCPUs */
        kvm_for_each_vcpu(i, v, vcpu->kvm)
                kvm_make_request(KVM_REQ_UPDATE_HGATP, v);
 }
index a2d376b..cfea7b7 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * Ultravisor Interfaces
  *
- * Copyright IBM Corp. 2019
+ * Copyright IBM Corp. 2019, 2022
  *
  * Author(s):
  *     Vasily Gorbik <gor@linux.ibm.com>
@@ -52,6 +52,7 @@
 #define UVC_CMD_UNPIN_PAGE_SHARED      0x0342
 #define UVC_CMD_SET_SHARED_ACCESS      0x1000
 #define UVC_CMD_REMOVE_SHARED_ACCESS   0x1001
+#define UVC_CMD_RETR_ATTEST            0x1020
 
 /* Bits in installed uv calls */
 enum uv_cmds_inst {
@@ -76,6 +77,7 @@ enum uv_cmds_inst {
        BIT_UVC_CMD_UNSHARE_ALL = 20,
        BIT_UVC_CMD_PIN_PAGE_SHARED = 21,
        BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
+       BIT_UVC_CMD_RETR_ATTEST = 28,
 };
 
 enum uv_feat_ind {
@@ -219,6 +221,25 @@ struct uv_cb_share {
        u64 reserved28;
 } __packed __aligned(8);
 
+/* Retrieve Attestation Measurement */
+struct uv_cb_attest {
+       struct uv_cb_header header;     /* 0x0000 */
+       u64 reserved08[2];              /* 0x0008 */
+       u64 arcb_addr;                  /* 0x0018 */
+       u64 cont_token;                 /* 0x0020 */
+       u8  reserved28[6];              /* 0x0028 */
+       u16 user_data_len;              /* 0x002e */
+       u8  user_data[256];             /* 0x0030 */
+       u32 reserved130[3];             /* 0x0130 */
+       u32 meas_len;                   /* 0x013c */
+       u64 meas_addr;                  /* 0x0140 */
+       u8  config_uid[16];             /* 0x0148 */
+       u32 reserved158;                /* 0x0158 */
+       u32 add_data_len;               /* 0x015c */
+       u64 add_data_addr;              /* 0x0160 */
+       u64 reserved168[4];             /* 0x0168 */
+} __packed __aligned(8);
+
 static inline int __uv_call(unsigned long r1, unsigned long r2)
 {
        int cc;
diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h
new file mode 100644 (file)
index 0000000..10a5ac9
--- /dev/null
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *  Copyright IBM Corp. 2022
+ *  Author(s): Steffen Eiden <seiden@linux.ibm.com>
+ */
+#ifndef __S390_ASM_UVDEVICE_H
+#define __S390_ASM_UVDEVICE_H
+
+#include <linux/types.h>
+
+struct uvio_ioctl_cb {
+       __u32 flags;
+       __u16 uv_rc;                    /* UV header rc value */
+       __u16 uv_rrc;                   /* UV header rrc value */
+       __u64 argument_addr;            /* Userspace address of uvio argument */
+       __u32 argument_len;
+       __u8  reserved14[0x40 - 0x14];  /* must be zero */
+};
+
+#define UVIO_ATT_USER_DATA_LEN         0x100
+#define UVIO_ATT_UID_LEN               0x10
+struct uvio_attest {
+       __u64 arcb_addr;                                /* 0x0000 */
+       __u64 meas_addr;                                /* 0x0008 */
+       __u64 add_data_addr;                            /* 0x0010 */
+       __u8  user_data[UVIO_ATT_USER_DATA_LEN];        /* 0x0018 */
+       __u8  config_uid[UVIO_ATT_UID_LEN];             /* 0x0118 */
+       __u32 arcb_len;                                 /* 0x0128 */
+       __u32 meas_len;                                 /* 0x012c */
+       __u32 add_data_len;                             /* 0x0130 */
+       __u16 user_data_len;                            /* 0x0134 */
+       __u16 reserved136;                              /* 0x0136 */
+};
+
+/*
+ * The following max values define an upper length for the IOCTL in/out buffers.
+ * However, they do not represent the maximum the Ultravisor allows, which is
+ * often way smaller. By allowing larger buffer sizes we hopefully do not need
+ * to update the code with every machine update. It is therefore possible for
+ * userspace to request more memory than actually used by kernel/UV.
+ */
+#define UVIO_ATT_ARCB_MAX_LEN          0x100000
+#define UVIO_ATT_MEASUREMENT_MAX_LEN   0x8000
+#define UVIO_ATT_ADDITIONAL_MAX_LEN    0x8000
+
+#define UVIO_DEVICE_NAME "uv"
+#define UVIO_TYPE_UVC 'u'
+
+#define UVIO_IOCTL_ATT _IOWR(UVIO_TYPE_UVC, 0x01, struct uvio_ioctl_cb)
+
+#endif /* __S390_ASM_UVDEVICE_H */
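
A userspace sketch (not taken from the patch) of driving the new attestation ioctl with the structures defined above; the "/dev/uv" node name, buffer sizes, and the interpretation of the return values are assumptions, and error handling is minimal.

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/uvdevice.h>

/* Assumed device node derived from UVIO_DEVICE_NAME. */
#define DEMO_UV_PATH "/dev/" UVIO_DEVICE_NAME

static int demo_uv_attest(void *arcb, uint32_t arcb_len,
			  void *meas, uint32_t meas_len)
{
	struct uvio_attest attest = {
		.arcb_addr = (uint64_t)(unsigned long)arcb,
		.arcb_len  = arcb_len,
		.meas_addr = (uint64_t)(unsigned long)meas,
		.meas_len  = meas_len,
	};
	struct uvio_ioctl_cb cb = {
		.argument_addr = (uint64_t)(unsigned long)&attest,
		.argument_len  = sizeof(attest),
	};
	int fd, rc;

	fd = open(DEMO_UV_PATH, O_RDWR);
	if (fd < 0)
		return -1;
	rc = ioctl(fd, UVIO_IOCTL_ATT, &cb);
	close(fd);
	/* Assumed: on ioctl success, the Ultravisor rc/rrc are reported in the cb. */
	return rc ? rc : (int)cb.uv_rc;
}
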
index d53a183..227ed00 100644 (file)
@@ -491,8 +491,8 @@ enum prot_type {
        PROT_TYPE_IEP  = 4,
 };
 
-static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
-                    u8 ar, enum gacc_mode mode, enum prot_type prot)
+static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar,
+                           enum gacc_mode mode, enum prot_type prot, bool terminate)
 {
        struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
        struct trans_exc_code_bits *tec;
@@ -520,6 +520,11 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
                        tec->b61 = 1;
                        break;
                }
+               if (terminate) {
+                       tec->b56 = 0;
+                       tec->b60 = 0;
+                       tec->b61 = 0;
+               }
                fallthrough;
        case PGM_ASCE_TYPE:
        case PGM_PAGE_TRANSLATION:
@@ -552,6 +557,12 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
        return code;
 }
 
+static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar,
+                    enum gacc_mode mode, enum prot_type prot)
+{
+       return trans_exc_ending(vcpu, code, gva, ar, mode, prot, false);
+}
+
 static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
                         unsigned long ga, u8 ar, enum gacc_mode mode)
 {
@@ -1109,8 +1120,11 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
                data += fragment_len;
                ga = kvm_s390_logical_to_effective(vcpu, ga + fragment_len);
        }
-       if (rc > 0)
-               rc = trans_exc(vcpu, rc, ga, ar, mode, prot);
+       if (rc > 0) {
+               bool terminate = (mode == GACC_STORE) && (idx > 0);
+
+               rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate);
+       }
 out_unlock:
        if (need_ipte_lock)
                ipte_unlock(vcpu);
index 21bb78d..393f2bb 100644 (file)
 #define X86_FEATURE_SEV                        (19*32+ 1) /* AMD Secure Encrypted Virtualization */
 #define X86_FEATURE_VM_PAGE_FLUSH      (19*32+ 2) /* "" VM Page Flush MSR is supported */
 #define X86_FEATURE_SEV_ES             (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_V_TSC_AUX          (19*32+ 9) /* "" Virtual TSC_AUX */
 #define X86_FEATURE_SME_COHERENT       (19*32+10) /* "" AMD hardware-enforced cache coherency */
 
 /*
index 1a6d7e3..da47f60 100644 (file)
@@ -127,6 +127,7 @@ KVM_X86_OP_OPTIONAL(migrate_timers)
 KVM_X86_OP(msr_filter_changed)
 KVM_X86_OP(complete_emulated_msr)
 KVM_X86_OP(vcpu_deliver_sipi_vector)
+KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
 
 #undef KVM_X86_OP
 #undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
new file mode 100644 (file)
index 0000000..fdfd8e0
--- /dev/null
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_X86_PMU_OP) || !defined(KVM_X86_PMU_OP_OPTIONAL)
+BUILD_BUG_ON(1)
+#endif
+
+/*
+ * KVM_X86_PMU_OP() and KVM_X86_PMU_OP_OPTIONAL() are used to help generate
+ * both DECLARE/DEFINE_STATIC_CALL() invocations and
+ * "static_call_update()" calls.
+ *
+ * KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have
+ * a NULL definition, for example if "static_call_cond()" will be used
+ * at the call sites.
+ */
+KVM_X86_PMU_OP(pmc_perf_hw_id)
+KVM_X86_PMU_OP(pmc_is_enabled)
+KVM_X86_PMU_OP(pmc_idx_to_pmc)
+KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
+KVM_X86_PMU_OP(msr_idx_to_pmc)
+KVM_X86_PMU_OP(is_valid_rdpmc_ecx)
+KVM_X86_PMU_OP(is_valid_msr)
+KVM_X86_PMU_OP(get_msr)
+KVM_X86_PMU_OP(set_msr)
+KVM_X86_PMU_OP(refresh)
+KVM_X86_PMU_OP(init)
+KVM_X86_PMU_OP(reset)
+KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
+KVM_X86_PMU_OP_OPTIONAL(cleanup)
+
+#undef KVM_X86_PMU_OP
+#undef KVM_X86_PMU_OP_OPTIONAL
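
This list is presumably consumed the same way as asm/kvm-x86-ops.h: included repeatedly with KVM_X86_PMU_OP defined differently at each site to declare, define, and finally update static calls. A hedged sketch of the update step, modeled on the kvm_ops_static_call_update() helper removed further down in this diff; it assumes matching kvm_x86_pmu_<name> static calls have been declared elsewhere.

static void demo_pmu_static_call_update(const struct kvm_pmu_ops *pmu_ops)
{
#define __KVM_X86_PMU_OP(func) \
	static_call_update(kvm_x86_pmu_##func, pmu_ops->func);
#define KVM_X86_PMU_OP(func) \
	WARN_ON(!pmu_ops->func); __KVM_X86_PMU_OP(func)
#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
#undef __KVM_X86_PMU_OP
}
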
index 4ff3661..959d66b 100644 (file)
@@ -281,11 +281,11 @@ struct kvm_kernel_irq_routing_entry;
 /*
  * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
  * also includes TDP pages) to determine whether or not a page can be used in
- * the given MMU context.  This is a subset of the overall kvm_mmu_role to
+ * the given MMU context.  This is a subset of the overall kvm_cpu_role to
  * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating
  * 2 bytes per gfn instead of 4 bytes per gfn.
  *
- * Indirect upper-level shadow pages are tracked for write-protection via
+ * Upper-level shadow pages having gptes are tracked for write-protection via
  * gfn_track.  As above, gfn_track is a 16 bit counter, so KVM must not create
  * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise
 * gfn_track will overflow and explosions will ensue.
@@ -331,7 +331,8 @@ union kvm_mmu_page_role {
                unsigned smap_andnot_wp:1;
                unsigned ad_disabled:1;
                unsigned guest_mode:1;
-               unsigned :6;
+               unsigned passthrough:1;
+               unsigned :5;
 
                /*
                 * This is left at the top of the word so that
@@ -367,8 +368,6 @@ union kvm_mmu_extended_role {
        struct {
                unsigned int valid:1;
                unsigned int execonly:1;
-               unsigned int cr0_pg:1;
-               unsigned int cr4_pae:1;
                unsigned int cr4_pse:1;
                unsigned int cr4_pke:1;
                unsigned int cr4_smap:1;
@@ -378,7 +377,7 @@ union kvm_mmu_extended_role {
        };
 };
 
-union kvm_mmu_role {
+union kvm_cpu_role {
        u64 as_u64;
        struct {
                union kvm_mmu_page_role base;
@@ -438,19 +437,8 @@ struct kvm_mmu {
                         struct kvm_mmu_page *sp);
        void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
        struct kvm_mmu_root_info root;
-       union kvm_mmu_role mmu_role;
-       u8 root_level;
-       u8 shadow_root_level;
-       u8 ept_ad;
-       bool direct_map;
-       struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
-
-       /*
-        * Bitmap; bit set = permission fault
-        * Byte index: page fault error code [4:1]
-        * Bit index: pte permissions in ACC_* format
-        */
-       u8 permissions[16];
+       union kvm_cpu_role cpu_role;
+       union kvm_mmu_page_role root_role;
 
        /*
        * The pkru_mask indicates if protection key checks are needed.  It
@@ -460,6 +448,15 @@ struct kvm_mmu {
        */
        u32 pkru_mask;
 
+       struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
+
+       /*
+        * Bitmap; bit set = permission fault
+        * Byte index: page fault error code [4:1]
+        * Bit index: pte permissions in ACC_* format
+        */
+       u8 permissions[16];
+
        u64 *pae_root;
        u64 *pml4_root;
        u64 *pml5_root;
@@ -607,16 +604,21 @@ struct kvm_vcpu_hv {
 struct kvm_vcpu_xen {
        u64 hypercall_rip;
        u32 current_runstate;
-       bool vcpu_info_set;
-       bool vcpu_time_info_set;
-       bool runstate_set;
-       struct gfn_to_hva_cache vcpu_info_cache;
-       struct gfn_to_hva_cache vcpu_time_info_cache;
-       struct gfn_to_hva_cache runstate_cache;
+       u8 upcall_vector;
+       struct gfn_to_pfn_cache vcpu_info_cache;
+       struct gfn_to_pfn_cache vcpu_time_info_cache;
+       struct gfn_to_pfn_cache runstate_cache;
        u64 last_steal;
        u64 runstate_entry_time;
        u64 runstate_times[4];
        unsigned long evtchn_pending_sel;
+       u32 vcpu_id; /* The Xen / ACPI vCPU ID */
+       u32 timer_virq;
+       u64 timer_expires; /* In guest epoch */
+       atomic_t timer_pending;
+       struct hrtimer timer;
+       int poll_evtchn;
+       struct timer_list poll_timer;
 };
 
 struct kvm_vcpu_arch {
@@ -753,8 +755,7 @@ struct kvm_vcpu_arch {
        gpa_t time;
        struct pvclock_vcpu_time_info hv_clock;
        unsigned int hw_tsc_khz;
-       struct gfn_to_hva_cache pv_time;
-       bool pv_time_enabled;
+       struct gfn_to_pfn_cache pv_time;
        /* set guest stopped flag in pvclock flags field */
        bool pvclock_set_guest_stopped_request;
 
@@ -1024,9 +1025,12 @@ struct msr_bitmap_range {
 
 /* Xen emulation context */
 struct kvm_xen {
+       u32 xen_version;
        bool long_mode;
        u8 upcall_vector;
        struct gfn_to_pfn_cache shinfo_cache;
+       struct idr evtchn_ports;
+       unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
 };
 
 enum kvm_irqchip_mode {
@@ -1119,6 +1123,8 @@ struct kvm_arch {
        u64 cur_tsc_generation;
        int nr_vcpus_matched_tsc;
 
+       u32 default_tsc_khz;
+
        seqcount_raw_spinlock_t pvclock_sc;
        bool use_master_clock;
        u64 master_kernel_ns;
@@ -1263,7 +1269,12 @@ struct kvm_vm_stat {
 
 struct kvm_vcpu_stat {
        struct kvm_vcpu_stat_generic generic;
+       u64 pf_taken;
        u64 pf_fixed;
+       u64 pf_emulate;
+       u64 pf_spurious;
+       u64 pf_fast;
+       u64 pf_mmio_spte_created;
        u64 pf_guest;
        u64 tlb_flush;
        u64 invlpg;
@@ -1455,8 +1466,6 @@ struct kvm_x86_ops {
        int cpu_dirty_log_size;
        void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
 
-       /* pmu operations of sub-arch */
-       const struct kvm_pmu_ops *pmu_ops;
        const struct kvm_x86_nested_ops *nested_ops;
 
        void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
@@ -1499,11 +1508,18 @@ struct kvm_x86_ops {
        int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
 
        void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
+
+       /*
+        * Returns vCPU specific APICv inhibit reasons
+        */
+       unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_x86_nested_ops {
        void (*leave_nested)(struct kvm_vcpu *vcpu);
        int (*check_events)(struct kvm_vcpu *vcpu);
+       bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu,
+                                            struct x86_exception *fault);
        bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
        void (*triple_fault)(struct kvm_vcpu *vcpu);
        int (*get_state)(struct kvm_vcpu *vcpu,
@@ -1528,6 +1544,7 @@ struct kvm_x86_init_ops {
        unsigned int (*handle_intel_pt_intr)(void);
 
        struct kvm_x86_ops *runtime_ops;
+       struct kvm_pmu_ops *pmu_ops;
 };
 
 struct kvm_arch_async_pf {
@@ -1549,20 +1566,6 @@ extern struct kvm_x86_ops kvm_x86_ops;
 #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
 #include <asm/kvm-x86-ops.h>
 
-static inline void kvm_ops_static_call_update(void)
-{
-#define __KVM_X86_OP(func) \
-       static_call_update(kvm_x86_##func, kvm_x86_ops.func);
-#define KVM_X86_OP(func) \
-       WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
-#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
-#define KVM_X86_OP_OPTIONAL_RET0(func) \
-       static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
-                                          (void *)__static_call_return0);
-#include <asm/kvm-x86-ops.h>
-#undef __KVM_X86_OP
-}
-
 #define __KVM_HAVE_ARCH_VM_ALLOC
 static inline struct kvm *kvm_arch_alloc_vm(void)
 {
@@ -1800,6 +1803,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
                                struct x86_exception *exception);
 
 bool kvm_apicv_activated(struct kvm *kvm);
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
 void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
 void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
                                      enum kvm_apicv_inhibit reason, bool set);
@@ -1989,6 +1993,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
         KVM_X86_QUIRK_CD_NW_CLEARED |          \
         KVM_X86_QUIRK_LAPIC_MMIO_HOLE |        \
         KVM_X86_QUIRK_OUT_7E_INC_RIP |         \
-        KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)
+        KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT |   \
+        KVM_X86_QUIRK_FIX_HYPERCALL_INSN)
 
 #endif /* _ASM_X86_KVM_HOST_H */
index f78e2b3..35f222a 100644 (file)
@@ -382,6 +382,103 @@ do {                                                                      \
 
 #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
 
+#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \
+       bool success;                                                   \
+       __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);              \
+       __typeof__(*(_ptr)) __old = *_old;                              \
+       __typeof__(*(_ptr)) __new = (_new);                             \
+       asm_volatile_goto("\n"                                          \
+                    "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
+                    _ASM_EXTABLE_UA(1b, %l[label])                     \
+                    : CC_OUT(z) (success),                             \
+                      [ptr] "+m" (*_ptr),                              \
+                      [old] "+a" (__old)                               \
+                    : [new] ltype (__new)                              \
+                    : "memory"                                         \
+                    : label);                                          \
+       if (unlikely(!success))                                         \
+               *_old = __old;                                          \
+       likely(success);                                        })
+
+#ifdef CONFIG_X86_32
+#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)     ({      \
+       bool success;                                                   \
+       __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);              \
+       __typeof__(*(_ptr)) __old = *_old;                              \
+       __typeof__(*(_ptr)) __new = (_new);                             \
+       asm_volatile_goto("\n"                                          \
+                    "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"             \
+                    _ASM_EXTABLE_UA(1b, %l[label])                     \
+                    : CC_OUT(z) (success),                             \
+                      "+A" (__old),                                    \
+                      [ptr] "+m" (*_ptr)                               \
+                    : "b" ((u32)__new),                                \
+                      "c" ((u32)((u64)__new >> 32))                    \
+                    : "memory"                                         \
+                    : label);                                          \
+       if (unlikely(!success))                                         \
+               *_old = __old;                                          \
+       likely(success);                                        })
+#endif // CONFIG_X86_32
+#else  // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \
+       int __err = 0;                                                  \
+       bool success;                                                   \
+       __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);              \
+       __typeof__(*(_ptr)) __old = *_old;                              \
+       __typeof__(*(_ptr)) __new = (_new);                             \
+       asm volatile("\n"                                               \
+                    "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
+                    CC_SET(z)                                          \
+                    "2:\n"                                             \
+                    _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG,  \
+                                          %[errout])                   \
+                    : CC_OUT(z) (success),                             \
+                      [errout] "+r" (__err),                           \
+                      [ptr] "+m" (*_ptr),                              \
+                      [old] "+a" (__old)                               \
+                    : [new] ltype (__new)                              \
+                    : "memory", "cc");                                 \
+       if (unlikely(__err))                                            \
+               goto label;                                             \
+       if (unlikely(!success))                                         \
+               *_old = __old;                                          \
+       likely(success);                                        })
+
+#ifdef CONFIG_X86_32
+/*
+ * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error.
+ * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
+ * hardcoded by CMPXCHG8B, leaving only ESI and EDI.  If the compiler uses
+ * both ESI and EDI for the memory operand, compilation will fail if the error
+ * is an input+output as there will be no register available for input.
+ */
+#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)     ({      \
+       int __result;                                                   \
+       __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);              \
+       __typeof__(*(_ptr)) __old = *_old;                              \
+       __typeof__(*(_ptr)) __new = (_new);                             \
+       asm volatile("\n"                                               \
+                    "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"             \
+                    "mov $0, %%ecx\n\t"                                \
+                    "setz %%cl\n"                                      \
+                    "2:\n"                                             \
+                    _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \
+                    : [result]"=c" (__result),                         \
+                      "+A" (__old),                                    \
+                      [ptr] "+m" (*_ptr)                               \
+                    : "b" ((u32)__new),                                \
+                      "c" ((u32)((u64)__new >> 32))                    \
+                    : "memory", "cc");                                 \
+       if (unlikely(__result < 0))                                     \
+               goto label;                                             \
+       if (unlikely(!__result))                                        \
+               *_old = __old;                                          \
+       likely(__result);                                       })
+#endif // CONFIG_X86_32
+#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+
 /* FIXME: this hack is definitely wrong -AK */
 struct __large_struct { unsigned long buf[100]; };
 #define __m(x) (*(struct __large_struct __user *)(x))
@@ -474,6 +571,51 @@ do {                                                                               \
 } while (0)
 #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
 
+extern void __try_cmpxchg_user_wrong_size(void);
+
+#ifndef CONFIG_X86_32
+#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label)           \
+       __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label)
+#endif
+
+/*
+ * Force the pointer to u<size> to match the size expected by the asm helper.
+ * clang/LLVM compiles all cases and only discards the unused paths after
+ * processing errors, which breaks i386 if the pointer is an 8-byte value.
+ */
+#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({                 \
+       bool __ret;                                                             \
+       __chk_user_ptr(_ptr);                                                   \
+       switch (sizeof(*(_ptr))) {                                              \
+       case 1: __ret = __try_cmpxchg_user_asm("b", "q",                        \
+                                              (__force u8 *)(_ptr), (_oldp),   \
+                                              (_nval), _label);                \
+               break;                                                          \
+       case 2: __ret = __try_cmpxchg_user_asm("w", "r",                        \
+                                              (__force u16 *)(_ptr), (_oldp),  \
+                                              (_nval), _label);                \
+               break;                                                          \
+       case 4: __ret = __try_cmpxchg_user_asm("l", "r",                        \
+                                              (__force u32 *)(_ptr), (_oldp),  \
+                                              (_nval), _label);                \
+               break;                                                          \
+       case 8: __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\
+                                                (_nval), _label);              \
+               break;                                                          \
+       default: __try_cmpxchg_user_wrong_size();                               \
+       }                                                                       \
+       __ret;                                          })
+
+/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */
+#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({              \
+       int __ret = -EFAULT;                                            \
+       __uaccess_begin_nospec();                                       \
+       __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label);   \
+_label:                                                                        \
+       __uaccess_end();                                                \
+       __ret;                                                          \
+                                                       })
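
A usage sketch (not from the patch): per the comment above, the wrapper "returns" 0 on success, 1 when the compared value has changed (with *old updated), and -EFAULT on a fault. The final argument is just a unique label identifier consumed inside the macro; demo_user_cas() is a hypothetical caller.

static int demo_user_cas(u64 __user *uptr, u64 *old, u64 new)
{
	/* 'cas_fault' becomes the exception-table jump target inside the macro. */
	return __try_cmpxchg_user(uptr, old, new, cas_fault);
}
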
+
 /*
  * We want the unsafe accessors to always be inlined and use
  * the error labels - thus the macro games.
index 0ffaa31..6c343c6 100644 (file)
@@ -543,16 +543,14 @@ enum vm_entry_failure_code {
 #define EPT_VIOLATION_ACC_READ_BIT     0
 #define EPT_VIOLATION_ACC_WRITE_BIT    1
 #define EPT_VIOLATION_ACC_INSTR_BIT    2
-#define EPT_VIOLATION_READABLE_BIT     3
-#define EPT_VIOLATION_WRITABLE_BIT     4
-#define EPT_VIOLATION_EXECUTABLE_BIT   5
+#define EPT_VIOLATION_RWX_SHIFT                3
+#define EPT_VIOLATION_GVA_IS_VALID_BIT 7
 #define EPT_VIOLATION_GVA_TRANSLATED_BIT 8
 #define EPT_VIOLATION_ACC_READ         (1 << EPT_VIOLATION_ACC_READ_BIT)
 #define EPT_VIOLATION_ACC_WRITE                (1 << EPT_VIOLATION_ACC_WRITE_BIT)
 #define EPT_VIOLATION_ACC_INSTR                (1 << EPT_VIOLATION_ACC_INSTR_BIT)
-#define EPT_VIOLATION_READABLE         (1 << EPT_VIOLATION_READABLE_BIT)
-#define EPT_VIOLATION_WRITABLE         (1 << EPT_VIOLATION_WRITABLE_BIT)
-#define EPT_VIOLATION_EXECUTABLE       (1 << EPT_VIOLATION_EXECUTABLE_BIT)
+#define EPT_VIOLATION_RWX_MASK         (VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT)
+#define EPT_VIOLATION_GVA_IS_VALID     (1 << EPT_VIOLATION_GVA_IS_VALID_BIT)
 #define EPT_VIOLATION_GVA_TRANSLATED   (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
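
A small illustrative helper (not part of the patch): with the three named permission bits folded into EPT_VIOLATION_RWX_MASK, the readable/writable/executable state of the faulting translation is recovered as a 3-bit RWX value from the exit qualification.

static inline unsigned int demo_ept_violation_rwx(unsigned long exit_qual)
{
	/* Bits [5:3] of the exit qualification, in VMX_EPT_RWX_MASK layout. */
	return (exit_qual & EPT_VIOLATION_RWX_MASK) >> EPT_VIOLATION_RWX_SHIFT;
}
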
 
 /*
index bf6e960..2161480 100644 (file)
@@ -428,11 +428,12 @@ struct kvm_sync_regs {
        struct kvm_vcpu_events events;
 };
 
-#define KVM_X86_QUIRK_LINT0_REENABLED     (1 << 0)
-#define KVM_X86_QUIRK_CD_NW_CLEARED       (1 << 1)
-#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE     (1 << 2)
-#define KVM_X86_QUIRK_OUT_7E_INC_RIP      (1 << 3)
-#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
+#define KVM_X86_QUIRK_LINT0_REENABLED          (1 << 0)
+#define KVM_X86_QUIRK_CD_NW_CLEARED            (1 << 1)
+#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE          (1 << 2)
+#define KVM_X86_QUIRK_OUT_7E_INC_RIP           (1 << 3)
+#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT     (1 << 4)
+#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN       (1 << 5)
 
 #define KVM_STATE_NESTED_FORMAT_VMX    0
 #define KVM_STATE_NESTED_FORMAT_SVM    1
index b14533a..9b69821 100644 (file)
@@ -5,7 +5,7 @@
 
 #include <asm/ia32.h>
 
-#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#if defined(CONFIG_KVM_GUEST)
 #include <asm/kvm_para.h>
 #endif
 
@@ -20,7 +20,7 @@ int main(void)
        BLANK();
 #endif
 
-#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#if defined(CONFIG_KVM_GUEST)
        OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
        BLANK();
 #endif
index e28ab0e..0fdc807 100644 (file)
@@ -14,6 +14,8 @@
 #include <asm/traps.h>
 #include <asm/irq_regs.h>
 
+#include <uapi/asm/kvm.h>
+
 #include <linux/hardirq.h>
 #include <linux/pkeys.h>
 #include <linux/vmalloc.h>
@@ -232,7 +234,20 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
        gfpu->fpstate           = fpstate;
        gfpu->xfeatures         = fpu_user_cfg.default_features;
        gfpu->perm              = fpu_user_cfg.default_features;
-       gfpu->uabi_size         = fpu_user_cfg.default_size;
+
+       /*
+        * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
+        * to userspace, even when XSAVE is unsupported, so that restoring FPU
+        * state on a different CPU that does support XSAVE can cleanly load
+        * the incoming state using its natural XSAVE.  In other words, KVM's
+        * uABI size may be larger than this host's default size.  Conversely,
+        * the default size should never be larger than KVM's base uABI size;
+        * all features that can expand the uABI size must be opt-in.
+        */
+       gfpu->uabi_size         = sizeof(struct kvm_xsave);
+       if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size))
+               gfpu->uabi_size = fpu_user_cfg.default_size;
+
        fpu_init_guest_permissions(gfpu);
 
        return true;
index 8b1c45c..1a3658f 100644 (file)
@@ -191,7 +191,7 @@ void kvm_async_pf_task_wake(u32 token)
 {
        u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
        struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
-       struct kvm_task_sleep_node *n;
+       struct kvm_task_sleep_node *n, *dummy = NULL;
 
        if (token == ~0) {
                apf_task_wake_all();
@@ -203,28 +203,41 @@ again:
        n = _find_apf_task(b, token);
        if (!n) {
                /*
-                * async PF was not yet handled.
-                * Add dummy entry for the token.
+                * Async #PF not yet handled, add a dummy entry for the token.
+                * Allocating the token must be done outside of the raw lock
+                * as the allocator is preemptible on PREEMPT_RT kernels.
                 */
-               n = kzalloc(sizeof(*n), GFP_ATOMIC);
-               if (!n) {
+               if (!dummy) {
+                       raw_spin_unlock(&b->lock);
+                       dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);
+
                        /*
-                        * Allocation failed! Busy wait while other cpu
-                        * handles async PF.
+                        * Continue looping on allocation failure, eventually
+                        * the async #PF will be handled and allocating a new
+                        * node will be unnecessary.
+                        */
+                       if (!dummy)
+                               cpu_relax();
+
+                       /*
+                        * Recheck for async #PF completion before enqueueing
+                        * the dummy token to avoid duplicate list entries.
                         */
-                       raw_spin_unlock(&b->lock);
-                       cpu_relax();
                        goto again;
                }
-               n->token = token;
-               n->cpu = smp_processor_id();
-               init_swait_queue_head(&n->wq);
-               hlist_add_head(&n->link, &b->list);
+               dummy->token = token;
+               dummy->cpu = smp_processor_id();
+               init_swait_queue_head(&dummy->wq);
+               hlist_add_head(&dummy->link, &b->list);
+               dummy = NULL;
        } else {
                apf_task_wake_one(n);
        }
        raw_spin_unlock(&b->lock);
-       return;
+
+       /* A dummy token might be allocated and ultimately not used.  */
+       if (dummy)
+               kfree(dummy);
 }
 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
 
@@ -765,6 +778,42 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
 }
 #endif
 
+#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
+bool __kvm_vcpu_is_preempted(long cpu);
+
+__visible bool __kvm_vcpu_is_preempted(long cpu)
+{
+       struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
+
+       return !!(src->preempted & KVM_VCPU_PREEMPTED);
+}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
+#else
+
+#include <asm/asm-offsets.h>
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+
+/*
+ * Hand-optimized version for x86-64 to avoid saving/restoring 8 64-bit
+ * registers to/from the stack.
+ */
+asm(
+".pushsection .text;"
+".global __raw_callee_save___kvm_vcpu_is_preempted;"
+".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+"__raw_callee_save___kvm_vcpu_is_preempted:"
+ASM_ENDBR
+"movq  __per_cpu_offset(,%rdi,8), %rax;"
+"cmpb  $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+"setne %al;"
+ASM_RET
+".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
+".popsection");
+
+#endif
+
 static void __init kvm_guest_init(void)
 {
        int i;
@@ -777,6 +826,9 @@ static void __init kvm_guest_init(void)
        if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
                has_steal_clock = 1;
                static_call_update(pv_steal_clock, kvm_steal_clock);
+
+               pv_ops.lock.vcpu_is_preempted =
+                       PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
        }
 
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -1018,40 +1070,6 @@ static void kvm_wait(u8 *ptr, u8 val)
        }
 }
 
-#ifdef CONFIG_X86_32
-__visible bool __kvm_vcpu_is_preempted(long cpu)
-{
-       struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
-
-       return !!(src->preempted & KVM_VCPU_PREEMPTED);
-}
-PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
-
-#else
-
-#include <asm/asm-offsets.h>
-
-extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
-
-/*
- * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
- * restoring to/from the stack.
- */
-asm(
-".pushsection .text;"
-".global __raw_callee_save___kvm_vcpu_is_preempted;"
-".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
-"__raw_callee_save___kvm_vcpu_is_preempted:"
-ASM_ENDBR
-"movq  __per_cpu_offset(,%rdi,8), %rax;"
-"cmpb  $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
-"setne %al;"
-ASM_RET
-".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
-".popsection");
-
-#endif
-
 /*
  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
  */
@@ -1095,10 +1113,6 @@ void __init kvm_spinlock_init(void)
        pv_ops.lock.wait = kvm_wait;
        pv_ops.lock.kick = kvm_kick_cpu;
 
-       if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
-               pv_ops.lock.vcpu_is_preempted =
-                       PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
-       }
        /*
         * When PV spinlock is enabled which is preferred over
         * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
index c5caa73..16333ba 100644 (file)
@@ -239,7 +239,7 @@ static void __init kvmclock_init_mem(void)
 
 static int __init kvm_setup_vsyscall_timeinfo(void)
 {
-       if (!kvm_para_available() || !kvmclock)
+       if (!kvm_para_available() || !kvmclock || nopv)
                return 0;
 
        kvmclock_init_mem();
index be99dc8..e1bb621 100644 (file)
@@ -252,7 +252,6 @@ int kvm_pic_read_irq(struct kvm *kvm)
                                 */
                                irq2 = 7;
                        intno = s->pics[1].irq_base + irq2;
-                       irq = irq2 + 8;
                } else
                        intno = s->pics[0].irq_base + irq;
        } else {
index 172b053..f371f12 100644 (file)
  */
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
+       int r = 0;
+
        if (lapic_in_kernel(vcpu))
-               return apic_has_pending_timer(vcpu);
+               r = apic_has_pending_timer(vcpu);
+       if (kvm_xen_timer_enabled(vcpu))
+               r += kvm_xen_has_pending_timer(vcpu);
 
-       return 0;
+       return r;
 }
 EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
 
@@ -143,6 +147,8 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
        if (lapic_in_kernel(vcpu))
                kvm_inject_apic_timer_irqs(vcpu);
+       if (kvm_xen_timer_enabled(vcpu))
+               kvm_xen_inject_timer_irqs(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
 
index 6e0dab0..0687162 100644 (file)
@@ -181,7 +181,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
                if (!level)
                        return -1;
 
-               return kvm_xen_set_evtchn_fast(e, kvm);
+               return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
 #endif
        default:
                break;
index 66b0eb0..f1bdac3 100644 (file)
@@ -1548,6 +1548,7 @@ static void cancel_apic_timer(struct kvm_lapic *apic)
        if (apic->lapic_timer.hv_timer_in_use)
                cancel_hv_timer(apic);
        preempt_enable();
+       atomic_set(&apic->lapic_timer.pending, 0);
 }
 
 static void apic_update_lvtt(struct kvm_lapic *apic)
@@ -1648,10 +1649,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
        tsc_deadline = apic->lapic_timer.expired_tscdeadline;
        apic->lapic_timer.expired_tscdeadline = 0;
        guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-       apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
+       trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
 
        if (lapic_timer_advance_dynamic) {
-               adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
+               adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline);
                /*
                 * If the timer fired early, reread the TSC to account for the
                 * overhead of the above adjustment to avoid waiting longer
index 4e4f8a2..65bb2a8 100644 (file)
@@ -38,7 +38,6 @@ struct kvm_timer {
        u64 tscdeadline;
        u64 expired_tscdeadline;
        u32 timer_advance_ns;
-       s64 advance_expire_delta;
        atomic_t pending;                       /* accumulated triggered timers */
        bool hv_timer_in_use;
 };
index a335e7f..f819286 100644 (file)
@@ -89,7 +89,27 @@ static inline gfn_t kvm_mmu_max_gfn(void)
        return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
 }
 
+static inline u8 kvm_get_shadow_phys_bits(void)
+{
+       /*
+        * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+        * in CPU detection code, but the processor treats those reduced bits as
+        * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+        * the physical address bits reported by CPUID.
+        */
+       if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+               return cpuid_eax(0x80000008) & 0xff;
+
+       /*
+        * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+        * custom CPUID.  Proceed with whatever the kernel found since these features
+        * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+        */
+       return boot_cpu_data.x86_phys_bits;
+}
+
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
 void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
 
 void kvm_init_mmu(struct kvm_vcpu *vcpu);
@@ -138,94 +158,7 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
                return;
 
        static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
-                                         vcpu->arch.mmu->shadow_root_level);
-}
-
-struct kvm_page_fault {
-       /* arguments to kvm_mmu_do_page_fault.  */
-       const gpa_t addr;
-       const u32 error_code;
-       const bool prefetch;
-
-       /* Derived from error_code.  */
-       const bool exec;
-       const bool write;
-       const bool present;
-       const bool rsvd;
-       const bool user;
-
-       /* Derived from mmu and global state.  */
-       const bool is_tdp;
-       const bool nx_huge_page_workaround_enabled;
-
-       /*
-        * Whether a >4KB mapping can be created or is forbidden due to NX
-        * hugepages.
-        */
-       bool huge_page_disallowed;
-
-       /*
-        * Maximum page size that can be created for this fault; input to
-        * FNAME(fetch), __direct_map and kvm_tdp_mmu_map.
-        */
-       u8 max_level;
-
-       /*
-        * Page size that can be created based on the max_level and the
-        * page size used by the host mapping.
-        */
-       u8 req_level;
-
-       /*
-        * Page size that will be created based on the req_level and
-        * huge_page_disallowed.
-        */
-       u8 goal_level;
-
-       /* Shifted addr, or result of guest page table walk if addr is a gva.  */
-       gfn_t gfn;
-
-       /* The memslot containing gfn. May be NULL. */
-       struct kvm_memory_slot *slot;
-
-       /* Outputs of kvm_faultin_pfn.  */
-       kvm_pfn_t pfn;
-       hva_t hva;
-       bool map_writable;
-};
-
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
-
-extern int nx_huge_pages;
-static inline bool is_nx_huge_page_enabled(void)
-{
-       return READ_ONCE(nx_huge_pages);
-}
-
-static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-                                       u32 err, bool prefetch)
-{
-       struct kvm_page_fault fault = {
-               .addr = cr2_or_gpa,
-               .error_code = err,
-               .exec = err & PFERR_FETCH_MASK,
-               .write = err & PFERR_WRITE_MASK,
-               .present = err & PFERR_PRESENT_MASK,
-               .rsvd = err & PFERR_RSVD_MASK,
-               .user = err & PFERR_USER_MASK,
-               .prefetch = prefetch,
-               .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
-               .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
-
-               .max_level = KVM_MAX_HUGEPAGE_LEVEL,
-               .req_level = PG_LEVEL_4K,
-               .goal_level = PG_LEVEL_4K,
-       };
-#ifdef CONFIG_RETPOLINE
-       if (fault.is_tdp)
-               return kvm_tdp_page_fault(vcpu, &fault);
-#endif
-       return vcpu->arch.mmu->page_fault(vcpu, &fault);
+                                         vcpu->arch.mmu->root_role.level);
 }
 
 /*
index 45e1573..f465368 100644 (file)
@@ -193,11 +193,12 @@ struct kvm_mmu_role_regs {
 
 /*
 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
- * reading from the role_regs.  Once the mmu_role is constructed, it becomes
+ * reading from the role_regs.  Once the root_role is constructed, it becomes
  * the single source of truth for the MMU's state.
  */
 #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)                  \
-static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
+static inline bool __maybe_unused                                      \
+____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)            \
 {                                                                      \
        return !!(regs->reg & flag);                                    \
 }
@@ -221,17 +222,26 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
 #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)                \
 static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)       \
 {                                                              \
-       return !!(mmu->mmu_role. base_or_ext . reg##_##name);   \
+       return !!(mmu->cpu_role. base_or_ext . reg##_##name);   \
 }
-BUILD_MMU_ROLE_ACCESSOR(ext,  cr0, pg);
 BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
-BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pae);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
 BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
+BUILD_MMU_ROLE_ACCESSOR(ext,  efer, lma);
+
+static inline bool is_cr0_pg(struct kvm_mmu *mmu)
+{
+        return mmu->cpu_role.base.level > 0;
+}
+
+static inline bool is_cr4_pae(struct kvm_mmu *mmu)
+{
+        return !mmu->cpu_role.base.has_4_byte_gpte;
+}
 
 static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
 {
@@ -244,19 +254,6 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
        return regs;
 }
 
-static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs)
-{
-       if (!____is_cr0_pg(regs))
-               return 0;
-       else if (____is_efer_lma(regs))
-               return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL :
-                                              PT64_ROOT_4LEVEL;
-       else if (____is_cr4_pae(regs))
-               return PT32E_ROOT_LEVEL;
-       else
-               return PT32_ROOT_LEVEL;
-}
-
 static inline bool kvm_available_flush_tlb_with_range(void)
 {
        return kvm_x86_ops.tlb_remote_flush_with_range;
@@ -714,6 +711,9 @@ static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
 
 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 {
+       if (sp->role.passthrough)
+               return sp->gfn;
+
        if (!sp->role.direct)
                return sp->gfns[index];
 
@@ -722,6 +722,11 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 
 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
 {
+       if (sp->role.passthrough) {
+               WARN_ON_ONCE(gfn != sp->gfn);
+               return;
+       }
+
        if (!sp->role.direct) {
                sp->gfns[index] = gfn;
                return;
@@ -1478,9 +1483,11 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
 
 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
 {
-       if (++iterator->rmap <= iterator->end_rmap) {
+       while (++iterator->rmap <= iterator->end_rmap) {
                iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
-               return;
+
+               if (iterator->rmap->val)
+                       return;
        }
 
        if (++iterator->level > iterator->end_level) {
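
The hunk above turns the single-step advance in slot_rmap_walk_next() into a loop so that empty rmap buckets are skipped instead of being handed back to the caller. A self-contained userspace sketch of that skip-empty iteration pattern (toy types and names, not the kernel code):

#include <stdio.h>

/* Toy stand-in for struct kvm_rmap_head: val == 0 means "no mappings". */
struct rmap_head { unsigned long val; };

/* Advance to the next non-empty bucket, or return NULL past the end. */
static struct rmap_head *next_nonempty(struct rmap_head *cur,
				       struct rmap_head *end)
{
	while (++cur <= end) {
		if (cur->val)
			return cur;
	}
	return NULL;
}

int main(void)
{
	struct rmap_head buckets[] = { {1}, {0}, {0}, {7}, {0}, {3} };
	struct rmap_head *end = &buckets[5];
	struct rmap_head *it = &buckets[0];

	do {
		printf("bucket %ld -> %lu\n", (long)(it - buckets), it->val);
	} while ((it = next_nonempty(it, end)) != NULL);

	return 0;
}
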
@@ -1833,27 +1840,35 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                    struct list_head *invalid_list);
 
+static bool sp_has_gptes(struct kvm_mmu_page *sp)
+{
+       if (sp->role.direct)
+               return false;
+
+       if (sp->role.passthrough)
+               return false;
+
+       return true;
+}
+
 #define for_each_valid_sp(_kvm, _sp, _list)                            \
        hlist_for_each_entry(_sp, _list, hash_link)                     \
                if (is_obsolete_sp((_kvm), (_sp))) {                    \
                } else
 
-#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                        \
+#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn)              \
        for_each_valid_sp(_kvm, _sp,                                    \
          &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])     \
-               if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
+               if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
 
-static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                         struct list_head *invalid_list)
 {
        int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
 
-       if (ret < 0) {
+       if (ret < 0)
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
-               return false;
-       }
-
-       return !!ret;
+       return ret;
 }
 
 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
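
kvm_sync_page() now returns an int instead of a bool so callers can tell three outcomes apart; the convention sketched below is inferred from the callers in this diff (negative means the page was zapped, zero means synced with no flush needed, positive means a remote TLB flush is required). A stand-alone model, not the kernel function:

#include <stdio.h>

/*
 * Toy model of the new return convention (names are stand-ins):
 *   < 0  page could not be synced and has been zapped
 *     0  page synced, no TLB flush needed
 *   > 0  page synced, remote TLB flush required
 */
static int sync_page(int valid, int needs_flush)
{
	if (!valid)
		return -1;		/* caller treats this as "zapped" */
	return needs_flush;		/* 1 = flush, 0 = nothing to do */
}

int main(void)
{
	int ret = sync_page(1, 1);

	if (ret < 0)
		printf("rebuild the shadow page\n");
	else if (ret > 0)
		printf("flush remote TLBs\n");
	else
		printf("nothing to do\n");
	return 0;
}
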
@@ -1975,7 +1990,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu,
 
                for_each_sp(pages, sp, parents, i) {
                        kvm_unlink_unsync_page(vcpu->kvm, sp);
-                       flush |= kvm_sync_page(vcpu, sp, &invalid_list);
+                       flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
                        mmu_pages_clear_parents(&parents);
                }
                if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
@@ -2011,15 +2026,16 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                             int direct,
                                             unsigned int access)
 {
-       bool direct_mmu = vcpu->arch.mmu->direct_map;
+       bool direct_mmu = vcpu->arch.mmu->root_role.direct;
        union kvm_mmu_page_role role;
        struct hlist_head *sp_list;
        unsigned quadrant;
        struct kvm_mmu_page *sp;
+       int ret;
        int collisions = 0;
        LIST_HEAD(invalid_list);
 
-       role = vcpu->arch.mmu->mmu_role.base;
+       role = vcpu->arch.mmu->root_role;
        role.level = level;
        role.direct = direct;
        role.access = access;
@@ -2028,6 +2044,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
                role.quadrant = quadrant;
        }
+       if (level <= vcpu->arch.mmu->cpu_role.base.level)
+               role.passthrough = 0;
 
        sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
        for_each_valid_sp(vcpu->kvm, sp, sp_list) {
@@ -2068,11 +2086,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                         * If the sync fails, the page is zapped.  If so, break
                         * in order to rebuild it.
                         */
-                       if (!kvm_sync_page(vcpu, sp, &invalid_list))
+                       ret = kvm_sync_page(vcpu, sp, &invalid_list);
+                       if (ret < 0)
                                break;
 
                        WARN_ON(!list_empty(&invalid_list));
-                       kvm_flush_remote_tlbs(vcpu->kvm);
+                       if (ret > 0)
+                               kvm_flush_remote_tlbs(vcpu->kvm);
                }
 
                __clear_sp_write_flooding_count(sp);
@@ -2089,7 +2109,7 @@ trace_get_page:
        sp->gfn = gfn;
        sp->role = role;
        hlist_add_head(&sp->hash_link, sp_list);
-       if (!direct) {
+       if (sp_has_gptes(sp)) {
                account_shadowed(vcpu->kvm, sp);
                if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
                        kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
@@ -2109,11 +2129,11 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
 {
        iterator->addr = addr;
        iterator->shadow_addr = root;
-       iterator->level = vcpu->arch.mmu->shadow_root_level;
+       iterator->level = vcpu->arch.mmu->root_role.level;
 
        if (iterator->level >= PT64_ROOT_4LEVEL &&
-           vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
-           !vcpu->arch.mmu->direct_map)
+           vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
+           !vcpu->arch.mmu->root_role.direct)
                iterator->level = PT32E_ROOT_LEVEL;
 
        if (iterator->level == PT32E_ROOT_LEVEL) {
@@ -2298,7 +2318,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
        /* Zapping children means active_mmu_pages has become unstable. */
        list_unstable = *nr_zapped;
 
-       if (!sp->role.invalid && !sp->role.direct)
+       if (!sp->role.invalid && sp_has_gptes(sp))
                unaccount_shadowed(kvm, sp);
 
        if (sp->unsync)
@@ -2478,7 +2498,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
        pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
        r = 0;
        write_lock(&kvm->mmu_lock);
-       for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
+       for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
                pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
                         sp->role.word);
                r = 1;
@@ -2495,7 +2515,7 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
        gpa_t gpa;
        int r;
 
-       if (vcpu->arch.mmu->direct_map)
+       if (vcpu->arch.mmu->root_role.direct)
                return 0;
 
        gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -2540,7 +2560,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
         * that case, KVM must complete emulation of the guest TLB flush before
         * allowing shadow pages to become unsync (writable by the guest).
         */
-       for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
+       for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
                if (!can_unsync)
                        return -EPERM;
 
@@ -2642,6 +2662,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
                 *sptep, write_fault, gfn);
 
        if (unlikely(is_noslot_pfn(pfn))) {
+               vcpu->stat.pf_mmio_spte_created++;
                mark_mmio_spte(vcpu, sptep, gfn, pte_access);
                return RET_PF_EMULATE;
        }
@@ -2962,7 +2983,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
                return ret;
 
        direct_pte_prefetch(vcpu, it.sptep);
-       ++vcpu->stat.pf_fixed;
        return ret;
 }
 
@@ -2989,14 +3009,12 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
        return -EFAULT;
 }
 
-static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
-                               unsigned int access, int *ret_val)
+static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+                              unsigned int access)
 {
        /* The pfn is invalid, report the error! */
-       if (unlikely(is_error_pfn(fault->pfn))) {
-               *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
-               return true;
-       }
+       if (unlikely(is_error_pfn(fault->pfn)))
+               return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
 
        if (unlikely(!fault->slot)) {
                gva_t gva = fault->is_tdp ? 0 : fault->addr;
@@ -3013,44 +3031,48 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa
                 * and only if L1's MAXPHYADDR is inaccurate with respect to
                 * the hardware's).
                 */
-               if (unlikely(!shadow_mmio_value) ||
-                   unlikely(fault->gfn > kvm_mmu_max_gfn())) {
-                       *ret_val = RET_PF_EMULATE;
-                       return true;
-               }
+               if (unlikely(!enable_mmio_caching) ||
+                   unlikely(fault->gfn > kvm_mmu_max_gfn()))
+                       return RET_PF_EMULATE;
        }
 
-       return false;
+       return RET_PF_CONTINUE;
 }
 
 static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
 {
        /*
-        * Do not fix the mmio spte with invalid generation number which
-        * need to be updated by slow page fault path.
+        * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
+        * reach the common page fault handler if the SPTE has an invalid MMIO
+        * generation number.  Refreshing the MMIO generation needs to go down
+        * the slow path.  Note, EPT Misconfigs do NOT set the PRESENT flag!
         */
        if (fault->rsvd)
                return false;
 
-       /* See if the page fault is due to an NX violation */
-       if (unlikely(fault->exec && fault->present))
-               return false;
-
        /*
         * #PF can be fast if:
-        * 1. The shadow page table entry is not present, which could mean that
-        *    the fault is potentially caused by access tracking (if enabled).
-        * 2. The shadow page table entry is present and the fault
-        *    is caused by write-protect, that means we just need change the W
-        *    bit of the spte which can be done out of mmu-lock.
         *
-        * However, if access tracking is disabled we know that a non-present
-        * page must be a genuine page fault where we have to create a new SPTE.
-        * So, if access tracking is disabled, we return true only for write
-        * accesses to a present page.
+        * 1. The shadow page table entry is not present and A/D bits are
+        *    disabled _by KVM_, which could mean that the fault is potentially
+        *    caused by access tracking (if enabled).  If A/D bits are enabled
+        *    by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
+        *    bits for L2 and employ access tracking, but the fast page fault
+        *    mechanism only supports direct MMUs.
+        * 2. The shadow page table entry is present, the access is a write,
+        *    and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
+        *    the fault was caused by a write-protection violation.  If the
+        *    SPTE is MMU-writable (determined later), the fault can be fixed
+        *    by setting the Writable bit, which can be done out of mmu_lock.
         */
+       if (!fault->present)
+               return !kvm_ad_enabled();
 
-       return shadow_acc_track_mask != 0 || (fault->write && fault->present);
+       /*
+        * Note, instruction fetches and writes are mutually exclusive, ignore
+        * the "exec" flag.
+        */
+       return fault->write;
 }
 
 /*
@@ -3165,13 +3187,25 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 
                new_spte = spte;
 
-               if (is_access_track_spte(spte))
+               /*
+                * KVM only supports fixing page faults outside of MMU lock for
+                * direct MMUs, nested MMUs are always indirect, and KVM always
+                * uses A/D bits for non-nested MMUs.  Thus, if A/D bits are
+                * enabled, the SPTE can't be an access-tracked SPTE.
+                */
+               if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
                        new_spte = restore_acc_track_spte(new_spte);
 
                /*
-                * Currently, to simplify the code, write-protection can
-                * be removed in the fast path only if the SPTE was
-                * write-protected for dirty-logging or access tracking.
+                * To keep things simple, only SPTEs that are MMU-writable can
+                * be made fully writable outside of mmu_lock, e.g. only SPTEs
+                * that were write-protected for dirty-logging or access
+                * tracking are handled here.  Don't bother checking if the
+                * SPTE is writable to prioritize running with A/D bits enabled.
+                * The is_access_allowed() check above handles the common case
+                * of the fault being spurious, and the SPTE is known to be
+                * shadow-present, i.e. except for access tracking restoration
+                * making the new SPTE writable, the check is wasteful.
                 */
                if (fault->write && is_mmu_writable_spte(spte)) {
                        new_spte |= PT_WRITABLE_MASK;
@@ -3217,6 +3251,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
        trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
        walk_shadow_page_lockless_end(vcpu);
 
+       if (ret != RET_PF_INVALID)
+               vcpu->stat.pf_fast++;
+
        return ret;
 }
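
Condensing the rewritten page_fault_can_be_fast() logic above into one small predicate makes the three cases easier to see. This is a userspace model with stand-in parameters (ad_enabled plays the role of kvm_ad_enabled()), not the kernel code:

#include <stdbool.h>
#include <stdio.h>

static bool fault_can_be_fast(bool rsvd, bool present, bool write,
			      bool ad_enabled)
{
	/* MMIO SPTEs with a stale generation must take the slow path. */
	if (rsvd)
		return false;

	/*
	 * A missing SPTE can only be fixed fast when it might be an
	 * access-tracked SPTE, i.e. when A/D bits are disabled by KVM.
	 */
	if (!present)
		return !ad_enabled;

	/* Present SPTE: only write-protection faults can be fixed fast. */
	return write;
}

int main(void)
{
	printf("%d\n", fault_can_be_fast(false, true, true, true));	/* 1 */
	printf("%d\n", fault_can_be_fast(false, false, false, true));	/* 0 */
	return 0;
}
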
 
@@ -3303,7 +3340,7 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
         * This should not be called while L2 is active, L2 can't invalidate
         * _only_ its own roots, e.g. INVVPID unconditionally exits.
         */
-       WARN_ON_ONCE(mmu->mmu_role.base.guest_mode);
+       WARN_ON_ONCE(mmu->root_role.guest_mode);
 
        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
                root_hpa = mmu->prev_roots[i].hpa;
@@ -3346,7 +3383,7 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *mmu = vcpu->arch.mmu;
-       u8 shadow_root_level = mmu->shadow_root_level;
+       u8 shadow_root_level = mmu->root_role.level;
        hpa_t root;
        unsigned i;
        int r;
@@ -3470,7 +3507,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
         * On SVM, reading PDPTRs might access guest memory, which might fault
         * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
         */
-       if (mmu->root_level == PT32E_ROOT_LEVEL) {
+       if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
                for (i = 0; i < 4; ++i) {
                        pdptrs[i] = mmu->get_pdptr(vcpu, i);
                        if (!(pdptrs[i] & PT_PRESENT_MASK))
@@ -3494,9 +3531,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
         * Do we shadow a long mode page table? If so we need to
         * write-protect the guests page table root.
         */
-       if (mmu->root_level >= PT64_ROOT_4LEVEL) {
+       if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
                root = mmu_alloc_root(vcpu, root_gfn, 0,
-                                     mmu->shadow_root_level, false);
+                                     mmu->root_role.level, false);
                mmu->root.hpa = root;
                goto set_root_pgd;
        }
@@ -3511,8 +3548,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
         * or a PAE 3-level page table. In either case we need to be aware that
         * the shadow page table may be a PAE or a long mode page table.
         */
-       pm_mask = PT_PRESENT_MASK | shadow_me_mask;
-       if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+       pm_mask = PT_PRESENT_MASK | shadow_me_value;
+       if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
                pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
                if (WARN_ON_ONCE(!mmu->pml4_root)) {
@@ -3521,7 +3558,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                }
                mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
 
-               if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
+               if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
                        if (WARN_ON_ONCE(!mmu->pml5_root)) {
                                r = -EIO;
                                goto out_unlock;
@@ -3533,7 +3570,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
        for (i = 0; i < 4; ++i) {
                WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
 
-               if (mmu->root_level == PT32E_ROOT_LEVEL) {
+               if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
                        if (!(pdptrs[i] & PT_PRESENT_MASK)) {
                                mmu->pae_root[i] = INVALID_PAE_ROOT;
                                continue;
@@ -3546,9 +3583,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                mmu->pae_root[i] = root | pm_mask;
        }
 
-       if (mmu->shadow_root_level == PT64_ROOT_5LEVEL)
+       if (mmu->root_role.level == PT64_ROOT_5LEVEL)
                mmu->root.hpa = __pa(mmu->pml5_root);
-       else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+       else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
                mmu->root.hpa = __pa(mmu->pml4_root);
        else
                mmu->root.hpa = __pa(mmu->pae_root);
@@ -3564,7 +3601,7 @@ out_unlock:
 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *mmu = vcpu->arch.mmu;
-       bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL;
+       bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
        u64 *pml5_root = NULL;
        u64 *pml4_root = NULL;
        u64 *pae_root;
@@ -3575,8 +3612,9 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
         * equivalent level in the guest's NPT to shadow.  Allocate the tables
         * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
         */
-       if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
-           mmu->shadow_root_level < PT64_ROOT_4LEVEL)
+       if (mmu->root_role.direct ||
+           mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
+           mmu->root_role.level < PT64_ROOT_4LEVEL)
                return 0;
 
        /*
@@ -3672,7 +3710,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
        int i;
        struct kvm_mmu_page *sp;
 
-       if (vcpu->arch.mmu->direct_map)
+       if (vcpu->arch.mmu->root_role.direct)
                return;
 
        if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
@@ -3680,7 +3718,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 
        vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
 
-       if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+       if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
                hpa_t root = vcpu->arch.mmu->root.hpa;
                sp = to_shadow_page(root);
 
@@ -3902,14 +3940,33 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 
        arch.token = alloc_apf_token(vcpu);
        arch.gfn = gfn;
-       arch.direct_map = vcpu->arch.mmu->direct_map;
+       arch.direct_map = vcpu->arch.mmu->root_role.direct;
        arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
 
        return kvm_setup_async_pf(vcpu, cr2_or_gpa,
                                  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
-static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r)
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+       int r;
+
+       if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
+             work->wakeup_all)
+               return;
+
+       r = kvm_mmu_reload(vcpu);
+       if (unlikely(r))
+               return;
+
+       if (!vcpu->arch.mmu->root_role.direct &&
+             work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
+               return;
+
+       kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
+}
+
+static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
        struct kvm_memory_slot *slot = fault->slot;
        bool async;
@@ -3920,7 +3977,7 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
         * be zapped before KVM inserts a new MMIO SPTE for the gfn.
         */
        if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
-               goto out_retry;
+               return RET_PF_RETRY;
 
        if (!kvm_is_visible_memslot(slot)) {
                /* Don't expose private memslots to L2. */
@@ -3928,7 +3985,7 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
                        fault->slot = NULL;
                        fault->pfn = KVM_PFN_NOSLOT;
                        fault->map_writable = false;
-                       return false;
+                       return RET_PF_CONTINUE;
                }
                /*
                 * If the APIC access page exists but is disabled, go directly
@@ -3937,10 +3994,8 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
                 * when the AVIC is re-enabled.
                 */
                if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
-                   !kvm_apicv_activated(vcpu->kvm)) {
-                       *r = RET_PF_EMULATE;
-                       return true;
-               }
+                   !kvm_apicv_activated(vcpu->kvm))
+                       return RET_PF_EMULATE;
        }
 
        async = false;
@@ -3948,26 +4003,23 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
                                          fault->write, &fault->map_writable,
                                          &fault->hva);
        if (!async)
-               return false; /* *pfn has correct page already */
+               return RET_PF_CONTINUE; /* *pfn has correct page already */
 
        if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
                trace_kvm_try_async_get_page(fault->addr, fault->gfn);
                if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
                        trace_kvm_async_pf_doublefault(fault->addr, fault->gfn);
                        kvm_make_request(KVM_REQ_APF_HALT, vcpu);
-                       goto out_retry;
-               } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn))
-                       goto out_retry;
+                       return RET_PF_RETRY;
+               } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
+                       return RET_PF_RETRY;
+               }
        }
 
        fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
                                          fault->write, &fault->map_writable,
                                          &fault->hva);
-       return false;
-
-out_retry:
-       *r = RET_PF_RETRY;
-       return true;
+       return RET_PF_CONTINUE;
 }
 
 /*
@@ -4022,10 +4074,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
 
-       if (kvm_faultin_pfn(vcpu, fault, &r))
+       r = kvm_faultin_pfn(vcpu, fault);
+       if (r != RET_PF_CONTINUE)
                return r;
 
-       if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r))
+       r = handle_abnormal_pfn(vcpu, fault, ACC_ALL);
+       if (r != RET_PF_CONTINUE)
                return r;
 
        r = RET_PF_RETRY;
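
With kvm_faultin_pfn() and handle_abnormal_pfn() returning status codes, direct_page_fault() becomes a simple pipeline: each step either says "continue" or yields the final result. A minimal sketch of that calling pattern, with stand-in enum values and helpers rather than the KVM ones:

#include <stdio.h>

enum { PF_CONTINUE = 0, PF_RETRY, PF_EMULATE, PF_FIXED };

static int faultin_pfn(int slot_valid)
{
	return slot_valid ? PF_CONTINUE : PF_RETRY;
}

static int handle_abnormal_pfn(int pfn_ok)
{
	return pfn_ok ? PF_CONTINUE : PF_EMULATE;
}

static int page_fault(int slot_valid, int pfn_ok)
{
	int r;

	r = faultin_pfn(slot_valid);
	if (r != PF_CONTINUE)
		return r;

	r = handle_abnormal_pfn(pfn_ok);
	if (r != PF_CONTINUE)
		return r;

	return PF_FIXED;	/* map the page and report success */
}

int main(void)
{
	printf("%d\n", page_fault(1, 1));	/* PF_FIXED */
	printf("%d\n", page_fault(0, 1));	/* PF_RETRY */
	return 0;
}

This replaces the earlier "return bool, pass the real result through an out-parameter" style, so the error/early-exit handling reads the same at every call site.
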
@@ -4120,7 +4174,6 @@ static void nonpaging_init_context(struct kvm_mmu *context)
        context->gva_to_gpa = nonpaging_gva_to_gpa;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = NULL;
-       context->direct_map = true;
 }
 
 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
@@ -4214,7 +4267,7 @@ static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
 {
        struct kvm_mmu *mmu = vcpu->arch.mmu;
-       union kvm_mmu_page_role new_role = mmu->mmu_role.base;
+       union kvm_mmu_page_role new_role = mmu->root_role;
 
        if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
                /* kvm_mmu_ensure_valid_pgd will set up a new root.  */
@@ -4391,12 +4444,12 @@ static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
                             guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
 }
 
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
-                                 struct kvm_mmu *context)
+static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+                                       struct kvm_mmu *context)
 {
        __reset_rsvds_bits_mask(&context->guest_rsvd_check,
                                vcpu->arch.reserved_gpa_bits,
-                               context->root_level, is_efer_nx(context),
+                               context->cpu_role.base.level, is_efer_nx(context),
                                guest_can_use_gbpages(vcpu),
                                is_cr4_pse(context),
                                guest_cpuid_is_amd_or_hygon(vcpu));
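
reset_guest_rsvds_bits_mask() above feeds the guest's reserved-GPA bits (derived from MAXPHYADDR) into the reserved-bit checker. A tiny userspace illustration of the underlying mask-building idea, assuming a rsvd_bits(lo, hi)-style helper that covers an inclusive bit range (the numbers are made up):

#include <stdint.h>
#include <stdio.h>

/* Build a mask of bits [lo, hi], inclusive. */
static uint64_t rsvd_bits(int lo, int hi)
{
	if (lo > hi)
		return 0;
	return ((2ULL << (hi - lo)) - 1) << lo;
}

int main(void)
{
	int maxphyaddr = 46;	/* e.g. CPUID-reported physical address width */

	/* Bits between MAXPHYADDR and bit 51 are reserved in a 64-bit PTE. */
	printf("reserved mask: %#llx\n",
	       (unsigned long long)rsvd_bits(maxphyaddr, 51));
	return 0;
}
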
@@ -4461,16 +4514,6 @@ static inline u64 reserved_hpa_bits(void)
 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
                                        struct kvm_mmu *context)
 {
-       /*
-        * KVM uses NX when TDP is disabled to handle a variety of scenarios,
-        * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
-        * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
-        * The iTLB multi-hit workaround can be toggled at any time, so assume
-        * NX can be used by any non-nested shadow MMU to avoid having to reset
-        * MMU contexts.  Note, KVM forces EFER.NX=1 when TDP is disabled.
-        */
-       bool uses_nx = is_efer_nx(context) || !tdp_enabled;
-
        /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
        bool is_amd = true;
        /* KVM doesn't use 2-level page tables for the shadow MMU. */
@@ -4478,19 +4521,28 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
        struct rsvd_bits_validate *shadow_zero_check;
        int i;
 
-       WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL);
+       WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
 
        shadow_zero_check = &context->shadow_zero_check;
        __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
-                               context->shadow_root_level, uses_nx,
+                               context->root_role.level,
+                               context->root_role.efer_nx,
                                guest_can_use_gbpages(vcpu), is_pse, is_amd);
 
        if (!shadow_me_mask)
                return;
 
-       for (i = context->shadow_root_level; --i >= 0;) {
-               shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
-               shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+       for (i = context->root_role.level; --i >= 0;) {
+               /*
+                * So far shadow_me_value is a constant during KVM's life
+                * time.  Bits in shadow_me_value are allowed to be set.
+                * Bits in shadow_me_mask but not in shadow_me_value are
+                * not allowed to be set.
+                */
+               shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
+               shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
+               shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
+               shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
        }
 
 }
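
The loop above encodes the shadow_me_mask/shadow_me_value relationship: bits inside the memory-encryption mask that are not part of the chosen value are treated as reserved. A short, illustrative userspace version of that mask arithmetic (the bit positions are invented for the example):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t me_mask  = 0x0000030000000000ULL;	/* candidate ME bits */
	uint64_t me_value = 0x0000010000000000ULL;	/* bits KVM actually sets */
	uint64_t rsvd     = 0;

	rsvd |= me_mask;	/* start by reserving the whole mask ... */
	rsvd &= ~me_value;	/* ... then carve out the allowed value bits */

	printf("reserved ME bits: %#llx\n", (unsigned long long)rsvd);
	return 0;
}
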
@@ -4515,7 +4567,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
 
        if (boot_cpu_is_amd())
                __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
-                                       context->shadow_root_level, false,
+                                       context->root_role.level, false,
                                        boot_cpu_has(X86_FEATURE_GBPAGES),
                                        false, true);
        else
@@ -4526,7 +4578,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
        if (!shadow_me_mask)
                return;
 
-       for (i = context->shadow_root_level; --i >= 0;) {
+       for (i = context->root_role.level; --i >= 0;) {
                shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
                shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
        }
@@ -4700,7 +4752,7 @@ static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
        if (!is_cr0_pg(mmu))
                return;
 
-       reset_rsvds_bits_mask(vcpu, mmu);
+       reset_guest_rsvds_bits_mask(vcpu, mmu);
        update_permission_bitmask(mmu, false);
        update_pkru_bitmask(mmu);
 }
@@ -4711,7 +4763,6 @@ static void paging64_init_context(struct kvm_mmu *context)
        context->gva_to_gpa = paging64_gva_to_gpa;
        context->sync_page = paging64_sync_page;
        context->invlpg = paging64_invlpg;
-       context->direct_map = false;
 }
 
 static void paging32_init_context(struct kvm_mmu *context)
@@ -4720,51 +4771,45 @@ static void paging32_init_context(struct kvm_mmu *context)
        context->gva_to_gpa = paging32_gva_to_gpa;
        context->sync_page = paging32_sync_page;
        context->invlpg = paging32_invlpg;
-       context->direct_map = false;
-}
-
-static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
-                                                        struct kvm_mmu_role_regs *regs)
-{
-       union kvm_mmu_extended_role ext = {0};
-
-       if (____is_cr0_pg(regs)) {
-               ext.cr0_pg = 1;
-               ext.cr4_pae = ____is_cr4_pae(regs);
-               ext.cr4_smep = ____is_cr4_smep(regs);
-               ext.cr4_smap = ____is_cr4_smap(regs);
-               ext.cr4_pse = ____is_cr4_pse(regs);
-
-               /* PKEY and LA57 are active iff long mode is active. */
-               ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
-               ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
-               ext.efer_lma = ____is_efer_lma(regs);
-       }
-
-       ext.valid = 1;
-
-       return ext;
 }
 
-static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
-                                                  struct kvm_mmu_role_regs *regs,
-                                                  bool base_only)
+static union kvm_cpu_role
+kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
 {
-       union kvm_mmu_role role = {0};
+       union kvm_cpu_role role = {0};
 
        role.base.access = ACC_ALL;
-       if (____is_cr0_pg(regs)) {
-               role.base.efer_nx = ____is_efer_nx(regs);
-               role.base.cr0_wp = ____is_cr0_wp(regs);
-       }
        role.base.smm = is_smm(vcpu);
        role.base.guest_mode = is_guest_mode(vcpu);
+       role.ext.valid = 1;
 
-       if (base_only)
+       if (!____is_cr0_pg(regs)) {
+               role.base.direct = 1;
                return role;
+       }
 
-       role.ext = kvm_calc_mmu_role_ext(vcpu, regs);
+       role.base.efer_nx = ____is_efer_nx(regs);
+       role.base.cr0_wp = ____is_cr0_wp(regs);
+       role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
+       role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
+       role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
 
+       if (____is_efer_lma(regs))
+               role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
+                                                       : PT64_ROOT_4LEVEL;
+       else if (____is_cr4_pae(regs))
+               role.base.level = PT32E_ROOT_LEVEL;
+       else
+               role.base.level = PT32_ROOT_LEVEL;
+
+       role.ext.cr4_smep = ____is_cr4_smep(regs);
+       role.ext.cr4_smap = ____is_cr4_smap(regs);
+       role.ext.cr4_pse = ____is_cr4_pse(regs);
+
+       /* PKEY and LA57 are active iff long mode is active. */
+       role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
+       role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+       role.ext.efer_lma = ____is_efer_lma(regs);
        return role;
 }
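
kvm_calc_cpu_role() above now folds in the level selection that the removed role_regs_to_root_level() used to provide. A compact sketch of that CR0.PG/EFER.LMA/CR4.LA57/CR4.PAE mapping, using plain ints instead of the PT*_ROOT_LEVEL constants:

#include <stdbool.h>
#include <stdio.h>

static int guest_root_level(bool cr0_pg, bool efer_lma, bool cr4_la57,
			    bool cr4_pae)
{
	if (!cr0_pg)
		return 0;			/* paging disabled */
	if (efer_lma)
		return cr4_la57 ? 5 : 4;	/* 64-bit: LA57 or 4-level */
	return cr4_pae ? 3 : 2;			/* PAE or legacy 32-bit */
}

int main(void)
{
	printf("64-bit, LA57: %d\n", guest_root_level(true, true, true, true));
	printf("PAE:          %d\n", guest_root_level(true, false, false, true));
	return 0;
}
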
 
@@ -4781,40 +4826,43 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
        return max_tdp_level;
 }
 
-static union kvm_mmu_role
+static union kvm_mmu_page_role
 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
-                               struct kvm_mmu_role_regs *regs, bool base_only)
+                               union kvm_cpu_role cpu_role)
 {
-       union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
+       union kvm_mmu_page_role role = {0};
 
-       role.base.ad_disabled = (shadow_accessed_mask == 0);
-       role.base.level = kvm_mmu_get_tdp_level(vcpu);
-       role.base.direct = true;
-       role.base.has_4_byte_gpte = false;
+       role.access = ACC_ALL;
+       role.cr0_wp = true;
+       role.efer_nx = true;
+       role.smm = cpu_role.base.smm;
+       role.guest_mode = cpu_role.base.guest_mode;
+       role.ad_disabled = !kvm_ad_enabled();
+       role.level = kvm_mmu_get_tdp_level(vcpu);
+       role.direct = true;
+       role.has_4_byte_gpte = false;
 
        return role;
 }
 
-static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
+                            union kvm_cpu_role cpu_role)
 {
        struct kvm_mmu *context = &vcpu->arch.root_mmu;
-       struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
-       union kvm_mmu_role new_role =
-               kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, false);
+       union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
 
-       if (new_role.as_u64 == context->mmu_role.as_u64)
+       if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
+           root_role.word == context->root_role.word)
                return;
 
-       context->mmu_role.as_u64 = new_role.as_u64;
+       context->cpu_role.as_u64 = cpu_role.as_u64;
+       context->root_role.word = root_role.word;
        context->page_fault = kvm_tdp_page_fault;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = NULL;
-       context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
-       context->direct_map = true;
        context->get_guest_pgd = get_cr3;
        context->get_pdptr = kvm_pdptr_read;
        context->inject_page_fault = kvm_inject_page_fault;
-       context->root_level = role_regs_to_root_level(&regs);
 
        if (!is_cr0_pg(context))
                context->gva_to_gpa = nonpaging_gva_to_gpa;
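
init_kvm_tdp_mmu() above only rebuilds the context when either the packed cpu_role word or the root_role word changes, which is the whole point of packing the roles into flat integers. A toy version of that "bitfields for access, one word for comparison" trick (the real kvm_mmu_page_role/kvm_cpu_role have many more fields):

#include <stdint.h>
#include <stdio.h>

union page_role {
	struct {
		uint32_t level      : 4;
		uint32_t direct     : 1;
		uint32_t efer_nx    : 1;
		uint32_t guest_mode : 1;
	} base;
	uint32_t word;
};

int main(void)
{
	union page_role have = { .base = { .level = 4, .direct = 1 } };
	union page_role want = { .base = { .level = 4, .direct = 1 } };

	if (want.word == have.word)
		printf("role unchanged, keep the current MMU context\n");
	else
		printf("role changed, reinitialize the MMU context\n");
	return 0;
}

The BUILD_BUG_ON checks near the end of this diff (sizeof(union kvm_mmu_page_role) == sizeof(u32), sizeof(union kvm_cpu_role) == sizeof(u64)) are what keep this packing honest.
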
@@ -4827,46 +4875,16 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        reset_tdp_shadow_zero_bits_mask(context);
 }
 
-static union kvm_mmu_role
-kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
-                                     struct kvm_mmu_role_regs *regs, bool base_only)
-{
-       union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
-
-       role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
-       role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
-       role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs);
-
-       return role;
-}
-
-static union kvm_mmu_role
-kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
-                                  struct kvm_mmu_role_regs *regs, bool base_only)
-{
-       union kvm_mmu_role role =
-               kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only);
-
-       role.base.direct = !____is_cr0_pg(regs);
-
-       if (!____is_efer_lma(regs))
-               role.base.level = PT32E_ROOT_LEVEL;
-       else if (____is_cr4_la57(regs))
-               role.base.level = PT64_ROOT_5LEVEL;
-       else
-               role.base.level = PT64_ROOT_4LEVEL;
-
-       return role;
-}
-
 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
-                                   struct kvm_mmu_role_regs *regs,
-                                   union kvm_mmu_role new_role)
+                                   union kvm_cpu_role cpu_role,
+                                   union kvm_mmu_page_role root_role)
 {
-       if (new_role.as_u64 == context->mmu_role.as_u64)
+       if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
+           root_role.word == context->root_role.word)
                return;
 
-       context->mmu_role.as_u64 = new_role.as_u64;
+       context->cpu_role.as_u64 = cpu_role.as_u64;
+       context->root_role.word = root_role.word;
 
        if (!is_cr0_pg(context))
                nonpaging_init_context(context);
@@ -4874,35 +4892,34 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
                paging64_init_context(context);
        else
                paging32_init_context(context);
-       context->root_level = role_regs_to_root_level(regs);
 
        reset_guest_paging_metadata(vcpu, context);
-       context->shadow_root_level = new_role.base.level;
-
        reset_shadow_zero_bits_mask(vcpu, context);
 }
 
 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
-                               struct kvm_mmu_role_regs *regs)
+                               union kvm_cpu_role cpu_role)
 {
        struct kvm_mmu *context = &vcpu->arch.root_mmu;
-       union kvm_mmu_role new_role =
-               kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false);
+       union kvm_mmu_page_role root_role;
 
-       shadow_mmu_init_context(vcpu, context, regs, new_role);
-}
+       root_role = cpu_role.base;
 
-static union kvm_mmu_role
-kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
-                                  struct kvm_mmu_role_regs *regs)
-{
-       union kvm_mmu_role role =
-               kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
+       /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
+       root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
 
-       role.base.direct = false;
-       role.base.level = kvm_mmu_get_tdp_level(vcpu);
+       /*
+        * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
+        * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+        * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+        * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+        * The iTLB multi-hit workaround can be toggled at any time, so assume
+        * NX can be used by any non-nested shadow MMU to avoid having to reset
+        * MMU contexts.
+        */
+       root_role.efer_nx = true;
 
-       return role;
+       shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
 }
 
 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
@@ -4914,24 +4931,34 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
                .cr4 = cr4 & ~X86_CR4_PKE,
                .efer = efer,
        };
-       union kvm_mmu_role new_role;
+       union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
+       union kvm_mmu_page_role root_role;
+
+       /* NPT requires CR0.PG=1. */
+       WARN_ON_ONCE(cpu_role.base.direct);
 
-       new_role = kvm_calc_shadow_npt_root_page_role(vcpu, &regs);
+       root_role = cpu_role.base;
+       root_role.level = kvm_mmu_get_tdp_level(vcpu);
+       if (root_role.level == PT64_ROOT_5LEVEL &&
+           cpu_role.base.level == PT64_ROOT_4LEVEL)
+               root_role.passthrough = 1;
 
-       shadow_mmu_init_context(vcpu, context, &regs, new_role);
+       shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
        kvm_mmu_new_pgd(vcpu, nested_cr3);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
 
-static union kvm_mmu_role
+static union kvm_cpu_role
 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
                                   bool execonly, u8 level)
 {
-       union kvm_mmu_role role = {0};
-
-       /* SMM flag is inherited from root_mmu */
-       role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
+       union kvm_cpu_role role = {0};
 
+       /*
+        * KVM does not support SMM transfer monitors, and consequently does not
+        * support the "entry to SMM" control either.  role.base.smm is always 0.
+        */
+       WARN_ON_ONCE(is_smm(vcpu));
        role.base.level = level;
        role.base.has_4_byte_gpte = false;
        role.base.direct = false;
@@ -4939,7 +4966,6 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
        role.base.guest_mode = true;
        role.base.access = ACC_ALL;
 
-       /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
        role.ext.word = 0;
        role.ext.execonly = execonly;
        role.ext.valid = 1;
@@ -4953,22 +4979,20 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 {
        struct kvm_mmu *context = &vcpu->arch.guest_mmu;
        u8 level = vmx_eptp_page_walk_level(new_eptp);
-       union kvm_mmu_role new_role =
+       union kvm_cpu_role new_mode =
                kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
                                                   execonly, level);
 
-       if (new_role.as_u64 != context->mmu_role.as_u64) {
-               context->mmu_role.as_u64 = new_role.as_u64;
+       if (new_mode.as_u64 != context->cpu_role.as_u64) {
+               /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
+               context->cpu_role.as_u64 = new_mode.as_u64;
+               context->root_role.word = new_mode.base.word;
 
-               context->shadow_root_level = level;
-
-               context->ept_ad = accessed_dirty;
                context->page_fault = ept_page_fault;
                context->gva_to_gpa = ept_gva_to_gpa;
                context->sync_page = ept_sync_page;
                context->invlpg = ept_invlpg;
-               context->root_level = level;
-               context->direct_map = false;
+
                update_permission_bitmask(context, true);
                context->pkru_mask = 0;
                reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
@@ -4979,49 +5003,30 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
 
-static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
+                            union kvm_cpu_role cpu_role)
 {
        struct kvm_mmu *context = &vcpu->arch.root_mmu;
-       struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
 
-       kvm_init_shadow_mmu(vcpu, &regs);
+       kvm_init_shadow_mmu(vcpu, cpu_role);
 
        context->get_guest_pgd     = get_cr3;
        context->get_pdptr         = kvm_pdptr_read;
        context->inject_page_fault = kvm_inject_page_fault;
 }
 
-static union kvm_mmu_role
-kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs)
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
+                               union kvm_cpu_role new_mode)
 {
-       union kvm_mmu_role role;
-
-       role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
-
-       /*
-        * Nested MMUs are used only for walking L2's gva->gpa, they never have
-        * shadow pages of their own and so "direct" has no meaning.   Set it
-        * to "true" to try to detect bogus usage of the nested MMU.
-        */
-       role.base.direct = true;
-       role.base.level = role_regs_to_root_level(regs);
-       return role;
-}
-
-static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
-       union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, &regs);
        struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 
-       if (new_role.as_u64 == g_context->mmu_role.as_u64)
+       if (new_mode.as_u64 == g_context->cpu_role.as_u64)
                return;
 
-       g_context->mmu_role.as_u64 = new_role.as_u64;
+       g_context->cpu_role.as_u64   = new_mode.as_u64;
        g_context->get_guest_pgd     = get_cr3;
        g_context->get_pdptr         = kvm_pdptr_read;
        g_context->inject_page_fault = kvm_inject_page_fault;
-       g_context->root_level        = new_role.base.level;
 
        /*
         * L2 page tables are never shadowed, so there is no need to sync
@@ -5051,12 +5056,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 
 void kvm_init_mmu(struct kvm_vcpu *vcpu)
 {
+       struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
+       union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
+
        if (mmu_is_nested(vcpu))
-               init_kvm_nested_mmu(vcpu);
+               init_kvm_nested_mmu(vcpu, cpu_role);
        else if (tdp_enabled)
-               init_kvm_tdp_mmu(vcpu);
+               init_kvm_tdp_mmu(vcpu, cpu_role);
        else
-               init_kvm_softmmu(vcpu);
+               init_kvm_softmmu(vcpu, cpu_role);
 }
 EXPORT_SYMBOL_GPL(kvm_init_mmu);
 
@@ -5074,9 +5082,12 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
         * problem is swept under the rug; KVM's CPUID API is horrific and
         * it's all but impossible to solve it without introducing a new API.
         */
-       vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
-       vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
-       vcpu->arch.nested_mmu.mmu_role.ext.valid = 0;
+       vcpu->arch.root_mmu.root_role.word = 0;
+       vcpu->arch.guest_mmu.root_role.word = 0;
+       vcpu->arch.nested_mmu.root_role.word = 0;
+       vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
+       vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
+       vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
        kvm_mmu_reset_context(vcpu);
 
        /*
@@ -5097,13 +5108,13 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
        int r;
 
-       r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
+       r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
        if (r)
                goto out;
        r = mmu_alloc_special_roots(vcpu);
        if (r)
                goto out;
-       if (vcpu->arch.mmu->direct_map)
+       if (vcpu->arch.mmu->root_role.direct)
                r = mmu_alloc_direct_roots(vcpu);
        else
                r = mmu_alloc_shadow_roots(vcpu);
@@ -5330,7 +5341,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
        ++vcpu->kvm->stat.mmu_pte_write;
 
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
+       for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
                if (detect_write_misaligned(sp, gpa, bytes) ||
                      detect_write_flooding(sp)) {
                        kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
@@ -5356,11 +5367,11 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        write_unlock(&vcpu->kvm->mmu_lock);
 }
 
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
                       void *insn, int insn_len)
 {
        int r, emulation_type = EMULTYPE_PF;
-       bool direct = vcpu->arch.mmu->direct_map;
+       bool direct = vcpu->arch.mmu->root_role.direct;
 
        if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
                return RET_PF_RETRY;
@@ -5391,7 +5402,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
         * paging in both guests. If true, we simply unprotect the page
         * and resume the guest.
         */
-       if (vcpu->arch.mmu->direct_map &&
+       if (vcpu->arch.mmu->root_role.direct &&
            (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
                kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
                return 1;
@@ -5625,7 +5636,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
        if (!tdp_enabled)
                set_memory_decrypted((unsigned long)mmu->pae_root, 1);
        else
-               WARN_ON_ONCE(shadow_me_mask);
+               WARN_ON_ONCE(shadow_me_value);
 
        for (i = 0; i < 4; ++i)
                mmu->pae_root[i] = INVALID_PAE_ROOT;
@@ -6287,7 +6298,7 @@ int kvm_mmu_vendor_module_init(void)
         */
        BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
        BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
-       BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
+       BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
 
        kvm_mmu_reset_all_pte_masks();
 
index 1bff453..bd2a268 100644
@@ -140,9 +140,72 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
                                        u64 start_gfn, u64 pages);
 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
 
+extern int nx_huge_pages;
+static inline bool is_nx_huge_page_enabled(void)
+{
+       return READ_ONCE(nx_huge_pages);
+}
+
+struct kvm_page_fault {
+       /* arguments to kvm_mmu_do_page_fault.  */
+       const gpa_t addr;
+       const u32 error_code;
+       const bool prefetch;
+
+       /* Derived from error_code.  */
+       const bool exec;
+       const bool write;
+       const bool present;
+       const bool rsvd;
+       const bool user;
+
+       /* Derived from mmu and global state.  */
+       const bool is_tdp;
+       const bool nx_huge_page_workaround_enabled;
+
+       /*
+        * Whether a >4KB mapping can be created or is forbidden due to NX
+        * hugepages.
+        */
+       bool huge_page_disallowed;
+
+       /*
+        * Maximum page size that can be created for this fault; input to
+        * FNAME(fetch), __direct_map and kvm_tdp_mmu_map.
+        */
+       u8 max_level;
+
+       /*
+        * Page size that can be created based on the max_level and the
+        * page size used by the host mapping.
+        */
+       u8 req_level;
+
+       /*
+        * Page size that will be created based on the req_level and
+        * huge_page_disallowed.
+        */
+       u8 goal_level;
+
+       /* Shifted addr, or result of guest page table walk if addr is a gva.  */
+       gfn_t gfn;
+
+       /* The memslot containing gfn. May be NULL. */
+       struct kvm_memory_slot *slot;
+
+       /* Outputs of kvm_faultin_pfn.  */
+       kvm_pfn_t pfn;
+       hva_t hva;
+       bool map_writable;
+};
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+
 /*
- * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
+ * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
+ * and of course kvm_mmu_do_page_fault().
  *
+ * RET_PF_CONTINUE: So far, so good, keep handling the page fault.
  * RET_PF_RETRY: let CPU fault again on the address.
  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
@@ -151,15 +214,71 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
  *
  * Any names added to this enum should be exported to userspace for use in
  * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
+ *
+ * Note, all values must be greater than or equal to zero so as not to encroach
+ * on -errno return values.  Somewhat arbitrarily use '0' for CONTINUE, which
+ * will allow for efficient machine code when checking for CONTINUE, e.g.
+ * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero.
  */
 enum {
-       RET_PF_RETRY = 0,
+       RET_PF_CONTINUE = 0,
+       RET_PF_RETRY,
        RET_PF_EMULATE,
        RET_PF_INVALID,
        RET_PF_FIXED,
        RET_PF_SPURIOUS,
 };
 
+static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                                       u32 err, bool prefetch)
+{
+       struct kvm_page_fault fault = {
+               .addr = cr2_or_gpa,
+               .error_code = err,
+               .exec = err & PFERR_FETCH_MASK,
+               .write = err & PFERR_WRITE_MASK,
+               .present = err & PFERR_PRESENT_MASK,
+               .rsvd = err & PFERR_RSVD_MASK,
+               .user = err & PFERR_USER_MASK,
+               .prefetch = prefetch,
+               .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
+               .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
+
+               .max_level = KVM_MAX_HUGEPAGE_LEVEL,
+               .req_level = PG_LEVEL_4K,
+               .goal_level = PG_LEVEL_4K,
+       };
+       int r;
+
+       /*
+        * Async #PF "faults", a.k.a. prefetch faults, are not faults from the
+        * guest perspective and have already been counted at the time of the
+        * original fault.
+        */
+       if (!prefetch)
+               vcpu->stat.pf_taken++;
+
+       if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp)
+               r = kvm_tdp_page_fault(vcpu, &fault);
+       else
+               r = vcpu->arch.mmu->page_fault(vcpu, &fault);
+
+       /*
+        * Similar to above, prefetch faults aren't truly spurious, and the
+        * async #PF path doesn't do emulation.  Do count faults that are fixed
+        * by the async #PF handler though, otherwise they'll never be counted.
+        */
+       if (r == RET_PF_FIXED)
+               vcpu->stat.pf_fixed++;
+       else if (prefetch)
+               ;
+       else if (r == RET_PF_EMULATE)
+               vcpu->stat.pf_emulate++;
+       else if (r == RET_PF_SPURIOUS)
+               vcpu->stat.pf_spurious++;
+       return r;
+}
+
 int kvm_mmu_max_mapping_level(struct kvm *kvm,
                              const struct kvm_memory_slot *slot, gfn_t gfn,
                              kvm_pfn_t pfn, int max_level);
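
Besides centralizing the pf_taken/pf_fixed/pf_emulate/pf_spurious accounting, the inlined kvm_mmu_do_page_fault() above avoids an indirect call for the common TDP case via the IS_ENABLED(CONFIG_RETPOLINE) check. A userspace sketch of that direct-call-when-known dispatch (illustrative names; the retpoline_enabled flag stands in for the config option):

#include <stdio.h>

struct ctx;
typedef int (*handler_t)(struct ctx *);

struct ctx {
	handler_t page_fault;
};

static int tdp_page_fault(struct ctx *c)
{
	(void)c;
	return 0;
}

static int dispatch(struct ctx *c, int retpoline_enabled)
{
	/* If the pointer is known to be the common handler, call it directly. */
	if (retpoline_enabled && c->page_fault == tdp_page_fault)
		return tdp_page_fault(c);
	return c->page_fault(c);	/* generic indirect call */
}

int main(void)
{
	struct ctx c = { .page_fault = tdp_page_fault };

	printf("ret = %d\n", dispatch(&c, 1));
	return 0;
}
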
index 12247b9..ae86820 100644
@@ -54,6 +54,7 @@
        { PFERR_RSVD_MASK, "RSVD" },    \
        { PFERR_FETCH_MASK, "F" }
 
+TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
 TRACE_DEFINE_ENUM(RET_PF_RETRY);
 TRACE_DEFINE_ENUM(RET_PF_EMULATE);
 TRACE_DEFINE_ENUM(RET_PF_INVALID);
index 01fee5f..db80f7c 100644
@@ -63,7 +63,7 @@
        #define PT_LEVEL_BITS PT64_LEVEL_BITS
        #define PT_GUEST_DIRTY_SHIFT 9
        #define PT_GUEST_ACCESSED_SHIFT 8
-       #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
+       #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
        #ifdef CONFIG_X86_64
        #define CMPXCHG "cmpxchgq"
        #endif
@@ -144,42 +144,6 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
               FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
 }
 
-static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-                              pt_element_t __user *ptep_user, unsigned index,
-                              pt_element_t orig_pte, pt_element_t new_pte)
-{
-       signed char r;
-
-       if (!user_access_begin(ptep_user, sizeof(pt_element_t)))
-               return -EFAULT;
-
-#ifdef CMPXCHG
-       asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n"
-                    "setnz %b[r]\n"
-                    "2:"
-                    _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r])
-                    : [ptr] "+m" (*ptep_user),
-                      [old] "+a" (orig_pte),
-                      [r] "=q" (r)
-                    : [new] "r" (new_pte)
-                    : "memory");
-#else
-       asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n"
-                    "setnz %b[r]\n"
-                    "2:"
-                    _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r])
-                    : [ptr] "+m" (*ptep_user),
-                      [old] "+A" (orig_pte),
-                      [r] "=q" (r)
-                    : [new_lo] "b" ((u32)new_pte),
-                      [new_hi] "c" ((u32)(new_pte >> 32))
-                    : "memory");
-#endif
-
-       user_access_end();
-       return r;
-}
-
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu_page *sp, u64 *spte,
                                  u64 gpte)
@@ -187,7 +151,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
        if (!FNAME(is_present_gpte)(gpte))
                goto no_present;
 
-       /* if accessed bit is not supported prefetch non accessed gpte */
+       /* Prefetch only accessed entries (unless A/D bits are disabled). */
        if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
            !(gpte & PT_GUEST_ACCESSED_MASK))
                goto no_present;
@@ -278,7 +242,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
                if (unlikely(!walker->pte_writable[level - 1]))
                        continue;
 
-               ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
+               ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
                if (ret)
                        return ret;
 
@@ -317,7 +281,7 @@ static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
         * is not reserved and does not indicate a large page at this level,
         * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
         */
-       gpte &= level - (PT32_ROOT_LEVEL + mmu->mmu_role.ext.cr4_pse);
+       gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
 #endif
        /*
         * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
@@ -355,7 +319,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 
        trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
-       walker->level = mmu->root_level;
+       walker->level = mmu->cpu_role.base.level;
        pte           = mmu->get_guest_pgd(vcpu);
        have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);
 
@@ -515,14 +479,21 @@ error:
         * The other bits are set to 0.
         */
        if (!(errcode & PFERR_RSVD_MASK)) {
-               vcpu->arch.exit_qualification &= 0x180;
+               vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
+                                                 EPT_VIOLATION_GVA_TRANSLATED);
                if (write_fault)
                        vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
                if (user_fault)
                        vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
                if (fetch_fault)
                        vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
-               vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
+
+               /*
+                * Note, pte_access holds the raw RWX bits from the EPTE, not
+                * ACC_*_MASK flags!
+                */
+               vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
+                                                EPT_VIOLATION_RWX_SHIFT;
        }
 #endif
        walker->fault.address = addr;
@@ -650,7 +621,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
        WARN_ON_ONCE(gw->gfn != base_gfn);
        direct_access = gw->pte_access;
 
-       top_level = vcpu->arch.mmu->root_level;
+       top_level = vcpu->arch.mmu->cpu_role.base.level;
        if (top_level == PT32E_ROOT_LEVEL)
                top_level = PT32_ROOT_LEVEL;
        /*
@@ -752,7 +723,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
                return ret;
 
        FNAME(pte_prefetch)(vcpu, gw, it.sptep);
-       ++vcpu->stat.pf_fixed;
        return ret;
 
 out_gpte_changed:
@@ -867,10 +837,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
 
-       if (kvm_faultin_pfn(vcpu, fault, &r))
+       r = kvm_faultin_pfn(vcpu, fault);
+       if (r != RET_PF_CONTINUE)
                return r;
 
-       if (handle_abnormal_pfn(vcpu, fault, walker.pte_access, &r))
+       r = handle_abnormal_pfn(vcpu, fault, walker.pte_access);
+       if (r != RET_PF_CONTINUE)
                return r;
 
        /*
@@ -1017,7 +989,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
  */
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-       union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base;
+       union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
        int i;
        bool host_writable;
        gpa_t first_pte_gpa;
@@ -1036,6 +1008,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                .level = 0xf,
                .access = 0x7,
                .quadrant = 0x3,
+               .passthrough = 0x1,
        };
 
        /*
@@ -1045,7 +1018,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
         * reserved bits checks will be wrong, etc...
         */
        if (WARN_ON_ONCE(sp->role.direct ||
-                        (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word))
+                        (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
                return -1;
 
        first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
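
The deleted FNAME(cmpxchg_gpte) open-coded a LOCK CMPXCHG with exception fixups; its replacement, __try_cmpxchg_user(), provides the same guarantee through a common helper: guest A/D bits may only be set if the PTE has not changed underneath the walker. A hedged user-space analogue using C11 atomics (the bit positions and the helper name are illustrative only):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PTE_ACCESSED	(1ULL << 5)	/* illustrative bit positions */
#define PTE_DIRTY	(1ULL << 6)

/*
 * Set the accessed/dirty bits with a compare-and-exchange so a concurrent
 * writer (the guest, in KVM's case) is never overwritten: if the entry
 * changed under us, give up and let the caller retry the walk.
 */
static int set_ad_bits(_Atomic uint64_t *pte, uint64_t expected, int dirty)
{
	uint64_t desired = expected | PTE_ACCESSED | (dirty ? PTE_DIRTY : 0);

	return atomic_compare_exchange_strong(pte, &expected, desired) ? 0 : -1;
}

int main(void)
{
	_Atomic uint64_t pte = 0x1000;

	if (set_ad_bits(&pte, 0x1000, 1))
		puts("raced, retry the walk");
	else
		printf("pte now %#llx\n", (unsigned long long)atomic_load(&pte));
	return 0;
}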
index e5c0b6d..b5960bb 100644 (file)
@@ -19,7 +19,7 @@
 #include <asm/memtype.h>
 #include <asm/vmx.h>
 
-static bool __read_mostly enable_mmio_caching = true;
+bool __read_mostly enable_mmio_caching = true;
 module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
 
 u64 __read_mostly shadow_host_writable_mask;
@@ -33,6 +33,7 @@ u64 __read_mostly shadow_mmio_value;
 u64 __read_mostly shadow_mmio_mask;
 u64 __read_mostly shadow_mmio_access_mask;
 u64 __read_mostly shadow_present_mask;
+u64 __read_mostly shadow_me_value;
 u64 __read_mostly shadow_me_mask;
 u64 __read_mostly shadow_acc_track_mask;
 
@@ -167,8 +168,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
        else
                pte_access &= ~ACC_WRITE_MASK;
 
-       if (!kvm_is_mmio_pfn(pfn))
-               spte |= shadow_me_mask;
+       if (shadow_me_value && !kvm_is_mmio_pfn(pfn))
+               spte |= shadow_me_value;
 
        spte |= (u64)pfn << PAGE_SHIFT;
 
@@ -284,7 +285,7 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
        u64 spte = SPTE_MMU_PRESENT_MASK;
 
        spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
-               shadow_user_mask | shadow_x_mask | shadow_me_mask;
+               shadow_user_mask | shadow_x_mask | shadow_me_value;
 
        if (ad_disabled)
                spte |= SPTE_TDP_AD_DISABLED_MASK;
@@ -310,25 +311,6 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
        return new_spte;
 }
 
-static u8 kvm_get_shadow_phys_bits(void)
-{
-       /*
-        * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
-        * in CPU detection code, but the processor treats those reduced bits as
-        * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
-        * the physical address bits reported by CPUID.
-        */
-       if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
-               return cpuid_eax(0x80000008) & 0xff;
-
-       /*
-        * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
-        * custom CPUID.  Proceed with whatever the kernel found since these features
-        * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
-        */
-       return boot_cpu_data.x86_phys_bits;
-}
-
 u64 mark_spte_for_access_track(u64 spte)
 {
        if (spte_ad_enabled(spte))
@@ -379,12 +361,26 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
            WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
                mmio_value = 0;
 
+       if (!mmio_value)
+               enable_mmio_caching = false;
+
        shadow_mmio_value = mmio_value;
        shadow_mmio_mask  = mmio_mask;
        shadow_mmio_access_mask = access_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
+void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
+{
+       /* shadow_me_value must be a subset of shadow_me_mask */
+       if (WARN_ON(me_value & ~me_mask))
+               me_value = me_mask = 0;
+
+       shadow_me_value = me_value;
+       shadow_me_mask = me_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask);
+
 void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
 {
        shadow_user_mask        = VMX_EPT_READABLE_MASK;
@@ -394,8 +390,6 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
        shadow_x_mask           = VMX_EPT_EXECUTABLE_MASK;
        shadow_present_mask     = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
        shadow_acc_track_mask   = VMX_EPT_RWX_MASK;
-       shadow_me_mask          = 0ull;
-
        shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
        shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE;
 
@@ -446,7 +440,8 @@ void kvm_mmu_reset_all_pte_masks(void)
        shadow_x_mask           = 0;
        shadow_present_mask     = PT_PRESENT_MASK;
        shadow_acc_track_mask   = 0;
-       shadow_me_mask          = sme_me_mask;
+       shadow_me_mask          = 0;
+       shadow_me_value         = 0;
 
        shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE;
        shadow_mmu_writable_mask  = DEFAULT_SPTE_MMU_WRITABLE;
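
The new kvm_mmu_set_me_spte_mask() separates the memory-encryption bits KVM stamps into SPTEs (shadow_me_value) from the bits it must treat as non-reserved (shadow_me_mask), and insists the former is a subset of the latter. A small sketch of that invariant; it rejects instead of WARNing and zeroing, and the bit number is made up:

#include <stdint.h>
#include <stdio.h>

/* The encryption "value" OR'ed into SPTEs must lie within the "mask". */
static int set_me_bits(uint64_t *value, uint64_t *mask,
		       uint64_t new_value, uint64_t new_mask)
{
	if (new_value & ~new_mask)
		return -1;	/* value is not a subset of mask */

	*value = new_value;
	*mask = new_mask;
	return 0;
}

int main(void)
{
	uint64_t me_value = 0, me_mask = 0;

	/* e.g. a single C-bit, so value == mask here */
	if (set_me_bits(&me_value, &me_mask, 1ULL << 47, 1ULL << 47))
		puts("rejected");
	else
		printf("value=%#llx mask=%#llx\n",
		       (unsigned long long)me_value,
		       (unsigned long long)me_mask);
	return 0;
}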
index 80ab0f5..0127bb6 100644 (file)
@@ -5,6 +5,8 @@
 
 #include "mmu_internal.h"
 
+extern bool __read_mostly enable_mmio_caching;
+
 /*
  * A MMU present SPTE is backed by actual memory and may or may not be present
  * in hardware.  E.g. MMIO SPTEs are not considered present.  Use bit 11, as it
@@ -149,6 +151,7 @@ extern u64 __read_mostly shadow_mmio_value;
 extern u64 __read_mostly shadow_mmio_mask;
 extern u64 __read_mostly shadow_mmio_access_mask;
 extern u64 __read_mostly shadow_present_mask;
+extern u64 __read_mostly shadow_me_value;
 extern u64 __read_mostly shadow_me_mask;
 
 /*
@@ -204,7 +207,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
 static inline bool is_mmio_spte(u64 spte)
 {
        return (spte & shadow_mmio_mask) == shadow_mmio_value &&
-              likely(shadow_mmio_value);
+              likely(enable_mmio_caching);
 }
 
 static inline bool is_shadow_present_pte(u64 pte)
@@ -212,6 +215,17 @@ static inline bool is_shadow_present_pte(u64 pte)
        return !!(pte & SPTE_MMU_PRESENT_MASK);
 }
 
+/*
+ * Returns true if A/D bits are supported in hardware and are enabled by KVM.
+ * When enabled, KVM uses A/D bits for all non-nested MMUs.  Because L1 can
+ * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
+ * scenario where KVM is using A/D bits for L1, but not L2.
+ */
+static inline bool kvm_ad_enabled(void)
+{
+       return !!shadow_accessed_mask;
+}
+
 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 {
        return sp->role.ad_disabled;
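
is_mmio_spte() now keys off enable_mmio_caching rather than a non-zero shadow_mmio_value, but the test itself is still a mask/value match on the tag bits. A compact illustration with invented constants:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative constants; the real masks depend on the CPU and EPT/NPT mode. */
static const uint64_t mmio_mask  = 0x7ULL << 52;
static const uint64_t mmio_value = 0x3ULL << 52;
static bool mmio_caching_enabled = true;

/* An SPTE is an MMIO SPTE iff the tag bits match and caching is enabled. */
static bool is_mmio(uint64_t spte)
{
	return (spte & mmio_mask) == mmio_value && mmio_caching_enabled;
}

int main(void)
{
	printf("%d %d\n", is_mmio(mmio_value | 0x1000), is_mmio(0x1000));
	return 0;
}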
index 922b06b..841feaa 100644 (file)
@@ -310,7 +310,7 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
 
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 {
-       union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
+       union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;
 
@@ -1100,6 +1100,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 
        /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
        if (unlikely(is_mmio_spte(new_spte))) {
+               vcpu->stat.pf_mmio_spte_created++;
                trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
                                     new_spte);
                ret = RET_PF_EMULATE;
@@ -1108,13 +1109,6 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
                                       rcu_dereference(iter->sptep));
        }
 
-       /*
-        * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
-        * consistent with legacy MMU behavior.
-        */
-       if (ret != RET_PF_SPURIOUS)
-               vcpu->stat.pf_fixed++;
-
        return ret;
 }
 
@@ -1136,7 +1130,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
                           struct kvm_mmu_page *sp, bool account_nx,
                           bool shared)
 {
-       u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
+       u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
        int ret = 0;
 
        if (shared) {
@@ -1859,7 +1853,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
        gfn_t gfn = addr >> PAGE_SHIFT;
        int leaf = -1;
 
-       *root_level = vcpu->arch.mmu->shadow_root_level;
+       *root_level = vcpu->arch.mmu->root_role.level;
 
        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                leaf = iter.level;
index 0604bc2..3f868fe 100644 (file)
  *        * AMD:   [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
  */
 
+static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
+
+#define KVM_X86_PMU_OP(func)                                        \
+       DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func,                          \
+                               *(((struct kvm_pmu_ops *)0)->func));
+#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
+{
+       memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));
+
+#define __KVM_X86_PMU_OP(func) \
+       static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
+#define KVM_X86_PMU_OP(func) \
+       WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
+#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+#undef __KVM_X86_PMU_OP
+}
+
+static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
+{
+       return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc);
+}
+
 static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
 {
        struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
@@ -216,7 +242,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
                          ARCH_PERFMON_EVENTSEL_CMASK |
                          HSW_IN_TX |
                          HSW_IN_TX_CHECKPOINTED))) {
-               config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
+               config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
                if (config != PERF_COUNT_HW_MAX)
                        type = PERF_TYPE_HARDWARE;
        }
@@ -266,7 +292,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
 
        pmc->current_config = (u64)ctrl;
        pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
-                             kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc),
+                             static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc),
                              !(en_field & 0x2), /* exclude user */
                              !(en_field & 0x1), /* exclude kernel */
                              pmi);
@@ -275,7 +301,7 @@ EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
 
 void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
 {
-       struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);
+       struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx);
 
        if (!pmc)
                return;
@@ -297,7 +323,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
        int bit;
 
        for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
-               struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit);
+               struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
 
                if (unlikely(!pmc || !pmc->perf_event)) {
                        clear_bit(bit, pmu->reprogram_pmi);
@@ -319,7 +345,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
 /* check if idx is a valid index to access PMU */
 bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
 {
-       return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
+       return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
 }
 
 bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -369,7 +395,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
        if (is_vmware_backdoor_pmc(idx))
                return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
 
-       pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask);
+       pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
        if (!pmc)
                return 1;
 
@@ -385,22 +411,21 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
 {
        if (lapic_in_kernel(vcpu)) {
-               if (kvm_x86_ops.pmu_ops->deliver_pmi)
-                       kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
+               static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
                kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
        }
 }
 
 bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
 {
-       return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) ||
-               kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr);
+       return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
+               static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
 }
 
 static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
 {
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
-       struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr);
+       struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);
 
        if (pmc)
                __set_bit(pmc->idx, pmu->pmc_in_use);
@@ -408,13 +433,13 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
 
 int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
-       return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info);
+       return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
 }
 
 int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
-       return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info);
+       return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
 }
 
 /* refresh PMU settings. This function generally is called when underlying
@@ -423,7 +448,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
  */
 void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
 {
-       kvm_x86_ops.pmu_ops->refresh(vcpu);
+       static_call(kvm_x86_pmu_refresh)(vcpu);
 }
 
 void kvm_pmu_reset(struct kvm_vcpu *vcpu)
@@ -431,7 +456,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 
        irq_work_sync(&pmu->irq_work);
-       kvm_x86_ops.pmu_ops->reset(vcpu);
+       static_call(kvm_x86_pmu_reset)(vcpu);
 }
 
 void kvm_pmu_init(struct kvm_vcpu *vcpu)
@@ -439,7 +464,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 
        memset(pmu, 0, sizeof(*pmu));
-       kvm_x86_ops.pmu_ops->init(vcpu);
+       static_call(kvm_x86_pmu_init)(vcpu);
        init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
        pmu->event_count = 0;
        pmu->need_cleanup = false;
@@ -471,14 +496,13 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
                      pmu->pmc_in_use, X86_PMC_IDX_MAX);
 
        for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
-               pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);
+               pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
 
                if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
                        pmc_stop_counter(pmc);
        }
 
-       if (kvm_x86_ops.pmu_ops->cleanup)
-               kvm_x86_ops.pmu_ops->cleanup(vcpu);
+       static_call_cond(kvm_x86_pmu_cleanup)(vcpu);
 
        bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
 }
@@ -508,7 +532,7 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
        unsigned int config;
 
        pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
-       config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
+       config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
        pmc->eventsel = old_eventsel;
        return config == perf_hw_id;
 }
@@ -536,7 +560,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
        int i;
 
        for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
-               pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);
+               pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
 
                if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
                        continue;
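
The PMU conversion replaces kvm_x86_ops.pmu_ops-> indirect calls with static calls that are wired up once via kvm_pmu_ops_update(). The real mechanism relies on the kernel's static_call infrastructure to patch the call sites; the sketch below only models the "copy the vendor ops once, then call through a private table" half of it, with invented names, and uses a NULL check where the kernel would use static_call_cond():

#include <stdio.h>
#include <string.h>

/* Simplified stand-in for struct kvm_pmu_ops. */
struct pmu_ops {
	int (*pmc_is_enabled)(int pmc_idx);
	void (*cleanup)(void);		/* optional hook */
};

/* One private copy, filled in once at init, like kvm_pmu_ops_update(). */
static struct pmu_ops pmu_ops;

static void pmu_ops_update(const struct pmu_ops *ops)
{
	memcpy(&pmu_ops, ops, sizeof(pmu_ops));
}

static int pmc_is_enabled(int pmc_idx)
{
	return pmu_ops.pmc_is_enabled(pmc_idx);
}

static void pmu_cleanup(void)
{
	if (pmu_ops.cleanup)	/* models static_call_cond() */
		pmu_ops.cleanup();
}

/* A vendor implementation, akin to amd_pmu_ops or intel_pmu_ops. */
static int demo_pmc_is_enabled(int pmc_idx) { return pmc_idx == 0; }

int main(void)
{
	struct pmu_ops demo = { .pmc_is_enabled = demo_pmc_is_enabled };

	pmu_ops_update(&demo);
	printf("%d\n", pmc_is_enabled(0));
	pmu_cleanup();
	return 0;
}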
index 22992b0..e745f44 100644 (file)
@@ -39,6 +39,8 @@ struct kvm_pmu_ops {
        void (*cleanup)(struct kvm_vcpu *vcpu);
 };
 
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
+
 static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
 {
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -86,11 +88,6 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
        return pmc->type == KVM_PMC_FIXED;
 }
 
-static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
-{
-       return kvm_x86_ops.pmu_ops->pmc_is_enabled(pmc);
-}
-
 static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
                                                 u64 data)
 {
index 4216195..54fe037 100644 (file)
@@ -165,9 +165,8 @@ free_avic:
        return err;
 }
 
-void avic_init_vmcb(struct vcpu_svm *svm)
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
 {
-       struct vmcb *vmcb = svm->vmcb;
        struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
        phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
        phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
@@ -285,11 +284,77 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
        put_cpu();
 }
 
-static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
-                                  u32 icrl, u32 icrh)
+/*
+ * A fast-path version of avic_kick_target_vcpus(), which attempts to match
+ * destination APIC ID to vCPU without looping through all vCPUs.
+ */
+static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
+                                      u32 icrl, u32 icrh, u32 index)
 {
+       u32 dest, apic_id;
        struct kvm_vcpu *vcpu;
+       int dest_mode = icrl & APIC_DEST_MASK;
+       int shorthand = icrl & APIC_SHORT_MASK;
+       struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+       u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
+
+       if (shorthand != APIC_DEST_NOSHORT)
+               return -EINVAL;
+
+       /*
+        * The AVIC incomplete IPI #vmexit info provides the index into
+        * the physical APIC ID table, which can be used to derive the
+        * guest's physical APIC ID.
+        */
+       if (dest_mode == APIC_DEST_PHYSICAL) {
+               apic_id = index;
+       } else {
+               if (!apic_x2apic_mode(source)) {
+                       /* For xAPIC logical mode, the index is for logical APIC table. */
+                       apic_id = avic_logical_id_table[index] & 0x1ff;
+               } else {
+                       return -EINVAL;
+               }
+       }
+
+       /*
+        * Assuming the vCPU ID is the same as the physical APIC ID,
+        * use it to retrieve the target vCPU.
+        */
+       vcpu = kvm_get_vcpu_by_id(kvm, apic_id);
+       if (!vcpu)
+               return -EINVAL;
+
+       if (apic_x2apic_mode(vcpu->arch.apic))
+               dest = icrh;
+       else
+               dest = GET_APIC_DEST_FIELD(icrh);
+
+       /*
+        * Try matching the destination APIC ID with the vCPU.
+        */
+       if (kvm_apic_match_dest(vcpu, source, shorthand, dest, dest_mode)) {
+               vcpu->arch.apic->irr_pending = true;
+               svm_complete_interrupt_delivery(vcpu,
+                                               icrl & APIC_MODE_MASK,
+                                               icrl & APIC_INT_LEVELTRIG,
+                                               icrl & APIC_VECTOR_MASK);
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
+                                  u32 icrl, u32 icrh, u32 index)
+{
        unsigned long i;
+       struct kvm_vcpu *vcpu;
+
+       if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
+               return;
+
+       trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
 
        /*
         * Wake any target vCPUs that are blocking, i.e. waiting for a wake
@@ -316,7 +381,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
        u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
        u32 icrl = svm->vmcb->control.exit_info_1;
        u32 id = svm->vmcb->control.exit_info_2 >> 32;
-       u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
+       u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
        struct kvm_lapic *apic = vcpu->arch.apic;
 
        trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
@@ -343,7 +408,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
                 * set the appropriate IRR bits on the valid target
                 * vcpus. So, we just need to kick the appropriate vcpu.
                 */
-               avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh);
+               avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
                break;
        case AVIC_IPI_FAILURE_INVALID_TARGET:
                break;
@@ -357,6 +422,13 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
+{
+       if (is_guest_mode(vcpu))
+               return APICV_INHIBIT_REASON_NESTED;
+       return 0;
+}
+
 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
 {
        struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
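
avic_kick_target_vcpus_fast() tries to resolve the IPI destination directly from the #vmexit index and returns an error for any case it cannot handle, at which point the caller traces the event and falls back to the existing scan of every vCPU. A trivial sketch of that fast-path/slow-path split (the lookup itself is invented):

#include <stdio.h>

/* Fast path: succeed only when the destination can be resolved directly. */
static int kick_target_fast(unsigned int index)
{
	if (index >= 8)		/* pretend only 8 vCPUs exist */
		return -1;
	printf("kicked vCPU %u via fast path\n", index);
	return 0;
}

static void kick_target_slow(void)
{
	puts("slow path: scan all vCPUs and match the destination");
}

static void kick_target(unsigned int index)
{
	if (!kick_target_fast(index))
		return;
	kick_target_slow();
}

int main(void)
{
	kick_target(3);
	kick_target(42);
	return 0;
}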
index 96bab46..bed5e16 100644 (file)
@@ -36,40 +36,45 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
                                       struct x86_exception *fault)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *vmcb = svm->vmcb;
 
-       if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
+       if (vmcb->control.exit_code != SVM_EXIT_NPF) {
                /*
                 * TODO: track the cause of the nested page fault, and
                 * correctly fill in the high bits of exit_info_1.
                 */
-               svm->vmcb->control.exit_code = SVM_EXIT_NPF;
-               svm->vmcb->control.exit_code_hi = 0;
-               svm->vmcb->control.exit_info_1 = (1ULL << 32);
-               svm->vmcb->control.exit_info_2 = fault->address;
+               vmcb->control.exit_code = SVM_EXIT_NPF;
+               vmcb->control.exit_code_hi = 0;
+               vmcb->control.exit_info_1 = (1ULL << 32);
+               vmcb->control.exit_info_2 = fault->address;
        }
 
-       svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
-       svm->vmcb->control.exit_info_1 |= fault->error_code;
+       vmcb->control.exit_info_1 &= ~0xffffffffULL;
+       vmcb->control.exit_info_1 |= fault->error_code;
 
        nested_svm_vmexit(svm);
 }
 
-static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+static bool nested_svm_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
+                                                   struct x86_exception *fault)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
-       WARN_ON(!is_guest_mode(vcpu));
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *vmcb = svm->vmcb;
+
+       WARN_ON(!is_guest_mode(vcpu));
 
        if (vmcb12_is_intercept(&svm->nested.ctl,
                                INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
-           !svm->nested.nested_run_pending) {
-               svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
-               svm->vmcb->control.exit_code_hi = 0;
-               svm->vmcb->control.exit_info_1 = fault->error_code;
-               svm->vmcb->control.exit_info_2 = fault->address;
-               nested_svm_vmexit(svm);
-       } else {
-               kvm_inject_page_fault(vcpu, fault);
-       }
+           !WARN_ON_ONCE(svm->nested.nested_run_pending)) {
+               vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
+               vmcb->control.exit_code_hi = 0;
+               vmcb->control.exit_info_1 = fault->error_code;
+               vmcb->control.exit_info_2 = fault->address;
+               nested_svm_vmexit(svm);
+               return true;
+       }
+
+       return false;
 }
 
 static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
@@ -121,6 +126,20 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
        vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 }
 
+static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
+{
+       if (!svm->v_vmload_vmsave_enabled)
+               return true;
+
+       if (!nested_npt_enabled(svm))
+               return true;
+
+       if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
+               return true;
+
+       return false;
+}
+
 void recalc_intercepts(struct vcpu_svm *svm)
 {
        struct vmcb_control_area *c, *h;
@@ -162,8 +181,17 @@ void recalc_intercepts(struct vcpu_svm *svm)
        if (!intercept_smi)
                vmcb_clr_intercept(c, INTERCEPT_SMI);
 
-       vmcb_set_intercept(c, INTERCEPT_VMLOAD);
-       vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+       if (nested_vmcb_needs_vls_intercept(svm)) {
+               /*
+                * If virtual VMLOAD/VMSAVE is not enabled for L2,
+                * we must intercept these instructions to correctly
+                * emulate them in case L1 doesn't intercept them.
+                */
+               vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+               vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+       } else {
+               WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
+       }
 }
 
 /*
@@ -413,6 +441,10 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
                 */
                mask &= ~V_IRQ_MASK;
        }
+
+       if (nested_vgif_enabled(svm))
+               mask |= V_GIF_MASK;
+
        svm->nested.ctl.int_ctl        &= ~mask;
        svm->nested.ctl.int_ctl        |= svm->vmcb->control.int_ctl & mask;
 }
@@ -454,11 +486,6 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
        vmcb12->control.exit_int_info = exit_int_info;
 }
 
-static inline bool nested_npt_enabled(struct vcpu_svm *svm)
-{
-       return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
-}
-
 static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
 {
        /*
@@ -515,6 +542,8 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
 static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
 {
        bool new_vmcb12 = false;
+       struct vmcb *vmcb01 = svm->vmcb01.ptr;
+       struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
 
        nested_vmcb02_compute_g_pat(svm);
 
@@ -526,18 +555,18 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
        }
 
        if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
-               svm->vmcb->save.es = vmcb12->save.es;
-               svm->vmcb->save.cs = vmcb12->save.cs;
-               svm->vmcb->save.ss = vmcb12->save.ss;
-               svm->vmcb->save.ds = vmcb12->save.ds;
-               svm->vmcb->save.cpl = vmcb12->save.cpl;
-               vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+               vmcb02->save.es = vmcb12->save.es;
+               vmcb02->save.cs = vmcb12->save.cs;
+               vmcb02->save.ss = vmcb12->save.ss;
+               vmcb02->save.ds = vmcb12->save.ds;
+               vmcb02->save.cpl = vmcb12->save.cpl;
+               vmcb_mark_dirty(vmcb02, VMCB_SEG);
        }
 
        if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
-               svm->vmcb->save.gdtr = vmcb12->save.gdtr;
-               svm->vmcb->save.idtr = vmcb12->save.idtr;
-               vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+               vmcb02->save.gdtr = vmcb12->save.gdtr;
+               vmcb02->save.idtr = vmcb12->save.idtr;
+               vmcb_mark_dirty(vmcb02, VMCB_DT);
        }
 
        kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
@@ -554,47 +583,59 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
        kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
 
        /* In case we don't even reach vcpu_run, the fields are not updated */
-       svm->vmcb->save.rax = vmcb12->save.rax;
-       svm->vmcb->save.rsp = vmcb12->save.rsp;
-       svm->vmcb->save.rip = vmcb12->save.rip;
+       vmcb02->save.rax = vmcb12->save.rax;
+       vmcb02->save.rsp = vmcb12->save.rsp;
+       vmcb02->save.rip = vmcb12->save.rip;
 
        /* These bits will be set properly on the first execution when new_vmcb12 is true */
        if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
-               svm->vmcb->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
+               vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
                svm->vcpu.arch.dr6  = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
-               vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+               vmcb_mark_dirty(vmcb02, VMCB_DR);
+       }
+
+       if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+               /*
+                * Reserved bits of DEBUGCTL are ignored.  Be consistent with
+                * svm_set_msr's definition of reserved bits.
+                */
+               svm_copy_lbrs(vmcb02, vmcb12);
+               vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
+               svm_update_lbrv(&svm->vcpu);
+
+       } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+               svm_copy_lbrs(vmcb02, vmcb01);
        }
 }
 
 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 {
-       const u32 int_ctl_vmcb01_bits =
-               V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK;
-
-       const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
+       u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
+       u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
 
        struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vmcb *vmcb01 = svm->vmcb01.ptr;
+       struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
 
        /*
         * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
         * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
         */
 
-       /*
-        * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
-        * avic_physical_id.
-        */
-       WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
+       if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
+               int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+       else
+               int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
 
        /* Copied from vmcb01.  msrpm_base can be overwritten later.  */
-       svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
-       svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
-       svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa;
+       vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
+       vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
+       vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
 
        /* Done at vmrun: asid.  */
 
        /* Also overwritten later if necessary.  */
-       svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+       vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
 
        /* nested_cr3.  */
        if (nested_npt_enabled(svm))
@@ -605,21 +646,53 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
                        svm->nested.ctl.tsc_offset,
                        svm->tsc_ratio_msr);
 
-       svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
+       vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
 
        if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
                WARN_ON(!svm->tsc_scaling_enabled);
                nested_svm_update_tsc_ratio_msr(vcpu);
        }
 
-       svm->vmcb->control.int_ctl             =
+       vmcb02->control.int_ctl             =
                (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
-               (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits);
-
-       svm->vmcb->control.int_vector          = svm->nested.ctl.int_vector;
-       svm->vmcb->control.int_state           = svm->nested.ctl.int_state;
-       svm->vmcb->control.event_inj           = svm->nested.ctl.event_inj;
-       svm->vmcb->control.event_inj_err       = svm->nested.ctl.event_inj_err;
+               (vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
+
+       vmcb02->control.int_vector          = svm->nested.ctl.int_vector;
+       vmcb02->control.int_state           = svm->nested.ctl.int_state;
+       vmcb02->control.event_inj           = svm->nested.ctl.event_inj;
+       vmcb02->control.event_inj_err       = svm->nested.ctl.event_inj_err;
+
+       vmcb02->control.virt_ext            = vmcb01->control.virt_ext &
+                                             LBR_CTL_ENABLE_MASK;
+       if (svm->lbrv_enabled)
+               vmcb02->control.virt_ext  |=
+                       (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
+
+       if (!nested_vmcb_needs_vls_intercept(svm))
+               vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+       if (kvm_pause_in_guest(svm->vcpu.kvm)) {
+               /* use guest values since host doesn't use them */
+               vmcb02->control.pause_filter_count =
+                               svm->pause_filter_enabled ?
+                               svm->nested.ctl.pause_filter_count : 0;
+
+               vmcb02->control.pause_filter_thresh =
+                               svm->pause_threshold_enabled ?
+                               svm->nested.ctl.pause_filter_thresh : 0;
+
+       } else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+               /* use host values when guest doesn't use them */
+               vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
+               vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
+       } else {
+               /*
+                * Intercept every PAUSE otherwise and
+                * ignore both host and guest values
+                */
+               vmcb02->control.pause_filter_count = 0;
+               vmcb02->control.pause_filter_thresh = 0;
+       }
 
        nested_svm_transition_tlb_flush(vcpu);
 
@@ -680,14 +753,14 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
        if (ret)
                return ret;
 
-       if (!npt_enabled)
-               vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
-
        if (!from_vmrun)
                kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
        svm_set_gif(svm, true);
 
+       if (kvm_vcpu_apicv_active(vcpu))
+               kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+
        return 0;
 }
 
@@ -698,6 +771,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
        struct vmcb *vmcb12;
        struct kvm_host_map map;
        u64 vmcb12_gpa;
+       struct vmcb *vmcb01 = svm->vmcb01.ptr;
 
        if (!svm->nested.hsave_msr) {
                kvm_inject_gp(vcpu, 0);
@@ -741,14 +815,14 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
         * Since vmcb01 is not in use, we can use it to store some of the L1
         * state.
         */
-       svm->vmcb01.ptr->save.efer   = vcpu->arch.efer;
-       svm->vmcb01.ptr->save.cr0    = kvm_read_cr0(vcpu);
-       svm->vmcb01.ptr->save.cr4    = vcpu->arch.cr4;
-       svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu);
-       svm->vmcb01.ptr->save.rip    = kvm_rip_read(vcpu);
+       vmcb01->save.efer   = vcpu->arch.efer;
+       vmcb01->save.cr0    = kvm_read_cr0(vcpu);
+       vmcb01->save.cr4    = vcpu->arch.cr4;
+       vmcb01->save.rflags = kvm_get_rflags(vcpu);
+       vmcb01->save.rip    = kvm_rip_read(vcpu);
 
        if (!npt_enabled)
-               svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu);
+               vmcb01->save.cr3 = kvm_read_cr3(vcpu);
 
        svm->nested.nested_run_pending = 1;
 
@@ -814,14 +888,12 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
 int nested_svm_vmexit(struct vcpu_svm *svm)
 {
        struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vmcb *vmcb01 = svm->vmcb01.ptr;
+       struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
        struct vmcb *vmcb12;
-       struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
        int rc;
 
-       /* Triple faults in L2 should never escape. */
-       WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
-
        rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
        if (rc) {
                if (rc == -EINVAL)
@@ -843,57 +915,68 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 
        /* Give the current vmcb to the guest */
 
-       vmcb12->save.es     = vmcb->save.es;
-       vmcb12->save.cs     = vmcb->save.cs;
-       vmcb12->save.ss     = vmcb->save.ss;
-       vmcb12->save.ds     = vmcb->save.ds;
-       vmcb12->save.gdtr   = vmcb->save.gdtr;
-       vmcb12->save.idtr   = vmcb->save.idtr;
+       vmcb12->save.es     = vmcb02->save.es;
+       vmcb12->save.cs     = vmcb02->save.cs;
+       vmcb12->save.ss     = vmcb02->save.ss;
+       vmcb12->save.ds     = vmcb02->save.ds;
+       vmcb12->save.gdtr   = vmcb02->save.gdtr;
+       vmcb12->save.idtr   = vmcb02->save.idtr;
        vmcb12->save.efer   = svm->vcpu.arch.efer;
        vmcb12->save.cr0    = kvm_read_cr0(vcpu);
        vmcb12->save.cr3    = kvm_read_cr3(vcpu);
-       vmcb12->save.cr2    = vmcb->save.cr2;
+       vmcb12->save.cr2    = vmcb02->save.cr2;
        vmcb12->save.cr4    = svm->vcpu.arch.cr4;
        vmcb12->save.rflags = kvm_get_rflags(vcpu);
        vmcb12->save.rip    = kvm_rip_read(vcpu);
        vmcb12->save.rsp    = kvm_rsp_read(vcpu);
        vmcb12->save.rax    = kvm_rax_read(vcpu);
-       vmcb12->save.dr7    = vmcb->save.dr7;
+       vmcb12->save.dr7    = vmcb02->save.dr7;
        vmcb12->save.dr6    = svm->vcpu.arch.dr6;
-       vmcb12->save.cpl    = vmcb->save.cpl;
+       vmcb12->save.cpl    = vmcb02->save.cpl;
 
-       vmcb12->control.int_state         = vmcb->control.int_state;
-       vmcb12->control.exit_code         = vmcb->control.exit_code;
-       vmcb12->control.exit_code_hi      = vmcb->control.exit_code_hi;
-       vmcb12->control.exit_info_1       = vmcb->control.exit_info_1;
-       vmcb12->control.exit_info_2       = vmcb->control.exit_info_2;
+       vmcb12->control.int_state         = vmcb02->control.int_state;
+       vmcb12->control.exit_code         = vmcb02->control.exit_code;
+       vmcb12->control.exit_code_hi      = vmcb02->control.exit_code_hi;
+       vmcb12->control.exit_info_1       = vmcb02->control.exit_info_1;
+       vmcb12->control.exit_info_2       = vmcb02->control.exit_info_2;
 
        if (vmcb12->control.exit_code != SVM_EXIT_ERR)
                nested_save_pending_event_to_vmcb12(svm, vmcb12);
 
        if (svm->nrips_enabled)
-               vmcb12->control.next_rip  = vmcb->control.next_rip;
+               vmcb12->control.next_rip  = vmcb02->control.next_rip;
 
        vmcb12->control.int_ctl           = svm->nested.ctl.int_ctl;
        vmcb12->control.tlb_ctl           = svm->nested.ctl.tlb_ctl;
        vmcb12->control.event_inj         = svm->nested.ctl.event_inj;
        vmcb12->control.event_inj_err     = svm->nested.ctl.event_inj_err;
 
+       if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count)
+               vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+
        nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
 
        svm_switch_vmcb(svm, &svm->vmcb01);
 
+       if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+               svm_copy_lbrs(vmcb12, vmcb02);
+               svm_update_lbrv(vcpu);
+       } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+               svm_copy_lbrs(vmcb01, vmcb02);
+               svm_update_lbrv(vcpu);
+       }
+
        /*
         * On vmexit the GIF is set to false and
         * no event can be injected in L1.
         */
        svm_set_gif(svm, false);
-       svm->vmcb->control.exit_int_info = 0;
+       vmcb01->control.exit_int_info = 0;
 
        svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
-       if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
-               svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
-               vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+       if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+               vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+               vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
        }
 
        if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
@@ -907,13 +990,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        /*
         * Restore processor state that had been saved in vmcb01
         */
-       kvm_set_rflags(vcpu, svm->vmcb->save.rflags);
-       svm_set_efer(vcpu, svm->vmcb->save.efer);
-       svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
-       svm_set_cr4(vcpu, svm->vmcb->save.cr4);
-       kvm_rax_write(vcpu, svm->vmcb->save.rax);
-       kvm_rsp_write(vcpu, svm->vmcb->save.rsp);
-       kvm_rip_write(vcpu, svm->vmcb->save.rip);
+       kvm_set_rflags(vcpu, vmcb01->save.rflags);
+       svm_set_efer(vcpu, vmcb01->save.efer);
+       svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
+       svm_set_cr4(vcpu, vmcb01->save.cr4);
+       kvm_rax_write(vcpu, vmcb01->save.rax);
+       kvm_rsp_write(vcpu, vmcb01->save.rsp);
+       kvm_rip_write(vcpu, vmcb01->save.rip);
 
        svm->vcpu.arch.dr7 = DR7_FIXED_1;
        kvm_update_dr7(&svm->vcpu);
@@ -931,7 +1014,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 
        nested_svm_uninit_mmu_context(vcpu);
 
-       rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true);
+       rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
        if (rc)
                return 1;
 
@@ -949,9 +1032,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
         * right now so that it can be accounted for before we execute
         * L1's next instruction.
         */
-       if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF))
+       if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
                kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
 
+       /*
+        * Un-inhibit the AVIC right away, so that other vCPUs can start
+        * to benefit from it immediately.
+        */
+       if (kvm_apicv_activated(vcpu->kvm))
+               kvm_vcpu_update_apicv(vcpu);
+
        return 0;
 }
 
@@ -1162,12 +1252,13 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm)
 static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
 {
        unsigned int nr = svm->vcpu.arch.exception.nr;
+       struct vmcb *vmcb = svm->vmcb;
 
-       svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
-       svm->vmcb->control.exit_code_hi = 0;
+       vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+       vmcb->control.exit_code_hi = 0;
 
        if (svm->vcpu.arch.exception.has_error_code)
-               svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
+               vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
 
        /*
         * EXITINFO2 is undefined for all exception intercepts other
@@ -1175,11 +1266,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
         */
        if (nr == PF_VECTOR) {
                if (svm->vcpu.arch.exception.nested_apf)
-                       svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+                       vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
                else if (svm->vcpu.arch.exception.has_payload)
-                       svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
+                       vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
                else
-                       svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+                       vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
        } else if (nr == DB_VECTOR) {
                /* See inject_pending_event.  */
                kvm_deliver_exception_payload(&svm->vcpu);
@@ -1567,6 +1658,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
 struct kvm_x86_nested_ops svm_nested_ops = {
        .leave_nested = svm_leave_nested,
        .check_events = svm_check_nested_events,
+       .handle_page_fault_workaround = nested_svm_handle_page_fault_workaround,
        .triple_fault = nested_svm_triple_fault,
        .get_nested_state_pages = svm_get_nested_state_pages,
        .get_state = svm_get_nested_state,
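
The nested PAUSE-filter handling above picks vmcb02's filter count and threshold from one of three sources: L1's values when KVM itself leaves PAUSE to the guest (and the feature is exposed to L1), the host's values when L1 does not intercept PAUSE, and zero otherwise so that every PAUSE exits. A simplified sketch that collapses the two per-field enable flags into a single "filter_exposed" flag:

#include <stdbool.h>
#include <stdio.h>

struct pause_cfg { unsigned int count, thresh; };

static struct pause_cfg pick_pause_cfg(bool pause_in_guest,
				       bool l1_intercepts_pause,
				       bool filter_exposed,
				       struct pause_cfg l1,
				       struct pause_cfg host)
{
	if (pause_in_guest)
		return filter_exposed ? l1 : (struct pause_cfg){0, 0};
	if (!l1_intercepts_pause)
		return host;		/* L1 ignores PAUSE, keep host tuning */
	return (struct pause_cfg){0, 0};	/* intercept every PAUSE */
}

int main(void)
{
	struct pause_cfg l1 = {3000, 128}, host = {5000, 0};
	struct pause_cfg c = pick_pause_cfg(true, false, true, l1, host);

	printf("count=%u thresh=%u\n", c.count, c.thresh);
	return 0;
}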
index 16a5ebb..136039f 100644 (file)
@@ -342,7 +342,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
        }
 }
 
-struct kvm_pmu_ops amd_pmu_ops = {
+struct kvm_pmu_ops amd_pmu_ops __initdata = {
        .pmc_perf_hw_id = amd_pmc_perf_hw_id,
        .pmc_is_enabled = amd_pmc_is_enabled,
        .pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
index 636c77e..51fd985 100644 (file)
@@ -688,7 +688,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
                if (params.len > SEV_FW_BLOB_MAX_SIZE)
                        return -EINVAL;
 
-               blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
+               blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
                if (!blob)
                        return -ENOMEM;
 
@@ -808,7 +808,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
        if (!IS_ALIGNED(dst_paddr, 16) ||
            !IS_ALIGNED(paddr,     16) ||
            !IS_ALIGNED(size,      16)) {
-               tpage = (void *)alloc_page(GFP_KERNEL);
+               tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO);
                if (!tpage)
                        return -ENOMEM;
 
@@ -1094,7 +1094,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
                if (params.len > SEV_FW_BLOB_MAX_SIZE)
                        return -EINVAL;
 
-               blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
+               blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
                if (!blob)
                        return -ENOMEM;
 
@@ -1176,7 +1176,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
                return -EINVAL;
 
        /* allocate the memory to hold the session data blob */
-       session_data = kmalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+       session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
        if (!session_data)
                return -ENOMEM;
 
@@ -1300,11 +1300,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
        /* allocate memory for header and transport buffer */
        ret = -ENOMEM;
-       hdr = kmalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+       hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
        if (!hdr)
                goto e_unpin;
 
-       trans_data = kmalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+       trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
        if (!trans_data)
                goto e_free_hdr;
 
@@ -2769,8 +2769,12 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
                pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
                        reason_set, reason_code);
 
-               ret = -EINVAL;
-               break;
+               vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+               vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+               vcpu->run->system_event.ndata = 1;
+               vcpu->run->system_event.data[0] = control->ghcb_gpa;
+
+               return 0;
        }
        default:
                /* Error, keep GHCB MSR value as-is */
@@ -2953,6 +2957,14 @@ void sev_es_init_vmcb(struct vcpu_svm *svm)
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+
+       if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) &&
+           (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) ||
+            guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) {
+               set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1);
+               if (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP))
+                       svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+       }
 }
 
 void sev_es_vcpu_reset(struct vcpu_svm *svm)
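
The kmalloc-to-kzalloc changes close a potential leak: the PSP may fill fewer bytes than the userspace-supplied length, yet the full buffer is copied back out. A user-space illustration of why zero-initialising the allocation is sufficient:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Zeroed allocation, analogous to kzalloc(): the unwritten tail stays 0. */
static unsigned char *alloc_blob(size_t len)
{
	return calloc(1, len);
}

int main(void)
{
	size_t len = 64;
	unsigned char *blob = alloc_blob(len);

	if (!blob)
		return 1;
	memcpy(blob, "measurement", 11);	/* the firmware writes a prefix */
	/* copying all 'len' bytes out is now safe: the tail cannot leak */
	printf("last byte = %u\n", blob[len - 1]);
	free(blob);
	return 0;
}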
index 17d334e..200045f 100644 (file)
@@ -62,8 +62,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #define SEG_TYPE_LDT 2
 #define SEG_TYPE_BUSY_TSS16 3
 
-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
-
 static bool erratum_383_found __read_mostly;
 
 u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
@@ -101,6 +99,7 @@ static const struct svm_direct_access_msrs {
        { .index = MSR_EFER,                            .always = false },
        { .index = MSR_IA32_CR_PAT,                     .always = false },
        { .index = MSR_AMD64_SEV_ES_GHCB,               .always = true  },
+       { .index = MSR_TSC_AUX,                         .always = false },
        { .index = MSR_INVALID,                         .always = false },
 };
 
@@ -172,7 +171,7 @@ static int vls = true;
 module_param(vls, int, 0444);
 
 /* enable/disable Virtual GIF */
-static int vgif = true;
+int vgif = true;
 module_param(vgif, int, 0444);
 
 /* enable/disable LBR virtualization */
@@ -189,6 +188,9 @@ module_param(tsc_scaling, int, 0444);
 static bool avic;
 module_param(avic, bool, 0444);
 
+static bool force_avic;
+module_param_unsafe(force_avic, bool, 0444);
+
 bool __read_mostly dump_invalid_vmcb;
 module_param(dump_invalid_vmcb, bool, 0644);
 
@@ -790,6 +792,17 @@ static void init_msrpm_offsets(void)
        }
 }
 
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
+{
+       to_vmcb->save.dbgctl            = from_vmcb->save.dbgctl;
+       to_vmcb->save.br_from           = from_vmcb->save.br_from;
+       to_vmcb->save.br_to             = from_vmcb->save.br_to;
+       to_vmcb->save.last_excp_from    = from_vmcb->save.last_excp_from;
+       to_vmcb->save.last_excp_to      = from_vmcb->save.last_excp_to;
+
+       vmcb_mark_dirty(to_vmcb, VMCB_LBR);
+}
+
 static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -799,6 +812,10 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+
+       /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+       if (is_guest_mode(vcpu))
+               svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
 }
 
 static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
@@ -810,6 +827,67 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+
+       /*
+        * Move the LBR msrs back to the vmcb01 to avoid copying them
+        * on nested guest entries.
+        */
+       if (is_guest_mode(vcpu))
+               svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
+}
+
+static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
+{
+       /*
+        * If LBR virtualization is disabled, the LBR msrs are always kept
+        * in the vmcb01 to avoid copying them on nested guest entries.
+        *
+        * If running nested, the msrs are moved between the vmcb01 and
+        * vmcb02 as LBR virtualization is enabled or disabled.
+        */
+       struct vmcb *vmcb =
+               (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
+                       svm->vmcb : svm->vmcb01.ptr;
+
+       switch (index) {
+       case MSR_IA32_DEBUGCTLMSR:
+               return vmcb->save.dbgctl;
+       case MSR_IA32_LASTBRANCHFROMIP:
+               return vmcb->save.br_from;
+       case MSR_IA32_LASTBRANCHTOIP:
+               return vmcb->save.br_to;
+       case MSR_IA32_LASTINTFROMIP:
+               return vmcb->save.last_excp_from;
+       case MSR_IA32_LASTINTTOIP:
+               return vmcb->save.last_excp_to;
+       default:
+               KVM_BUG(false, svm->vcpu.kvm,
+                       "%s: Unknown MSR 0x%x", __func__, index);
+               return 0;
+       }
+}
+
+void svm_update_lbrv(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
+                                          DEBUGCTLMSR_LBR;
+
+       bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
+                                     LBR_CTL_ENABLE_MASK);
+
+       if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
+               if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
+                       enable_lbrv = true;
+
+       if (enable_lbrv == current_enable_lbrv)
+               return;
+
+       if (enable_lbrv)
+               svm_enable_lbrv(vcpu);
+       else
+               svm_disable_lbrv(vcpu);
 }
 
 void disable_nmi_singlestep(struct vcpu_svm *svm)
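
For reference, the enable decision made by svm_update_lbrv() above can be restated as a stand-alone predicate; a sketch only (names invented, bit masks written out with the architectural bit positions rather than taken from the kernel headers):

#include <stdbool.h>
#include <stdint.h>

#define DEBUGCTLMSR_LBR		(1ULL << 0)	/* DEBUGCTL.LBR */
#define LBR_CTL_ENABLE_MASK	(1ULL << 0)	/* VMCB virt_ext LBR virtualization enable */

/*
 * LBR virtualization is wanted if the currently relevant DEBUGCTL has the
 * LBR bit set, or if L1 enabled LBR virtualization for L2 and the guest is
 * allowed to use the feature.
 */
static bool want_lbrv(uint64_t dbgctl, bool guest_mode, bool lbrv_allowed,
		      uint64_t nested_virt_ext)
{
	if (dbgctl & DEBUGCTLMSR_LBR)
		return true;

	return guest_mode && lbrv_allowed &&
	       (nested_virt_ext & LBR_CTL_ENABLE_MASK);
}
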
@@ -831,6 +909,9 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
        struct vmcb_control_area *control = &svm->vmcb->control;
        int old = control->pause_filter_count;
 
+       if (kvm_pause_in_guest(vcpu->kvm) || !old)
+               return;
+
        control->pause_filter_count = __grow_ple_window(old,
                                                        pause_filter_count,
                                                        pause_filter_count_grow,
@@ -849,6 +930,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
        struct vmcb_control_area *control = &svm->vmcb->control;
        int old = control->pause_filter_count;
 
+       if (kvm_pause_in_guest(vcpu->kvm) || !old)
+               return;
+
        control->pause_filter_count =
                                __shrink_ple_window(old,
                                                    pause_filter_count,
@@ -960,6 +1044,8 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
 
                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+
+               svm->v_vmload_vmsave_enabled = false;
        } else {
                /*
                 * If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -979,8 +1065,9 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
 static void init_vmcb(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb_control_area *control = &svm->vmcb->control;
-       struct vmcb_save_area *save = &svm->vmcb->save;
+       struct vmcb *vmcb = svm->vmcb01.ptr;
+       struct vmcb_control_area *control = &vmcb->control;
+       struct vmcb_save_area *save = &vmcb->save;
 
        svm_set_intercept(svm, INTERCEPT_CR0_READ);
        svm_set_intercept(svm, INTERCEPT_CR3_READ);
@@ -1104,7 +1191,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
 
        if (kvm_vcpu_apicv_active(vcpu))
-               avic_init_vmcb(svm);
+               avic_init_vmcb(svm, vmcb);
 
        if (vgif) {
                svm_clr_intercept(svm, INTERCEPT_STGI);
@@ -1122,10 +1209,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
                }
        }
 
-       svm_hv_init_vmcb(svm->vmcb);
+       svm_hv_init_vmcb(vmcb);
        init_vmcb_after_set_cpuid(vcpu);
 
-       vmcb_mark_all_dirty(svm->vmcb);
+       vmcb_mark_all_dirty(vmcb);
 
        enable_gif(svm);
 }
@@ -1380,7 +1467,7 @@ static void svm_set_vintr(struct vcpu_svm *svm)
        /*
         * The following fields are ignored when AVIC is enabled
         */
-       WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
+       WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
 
        svm_set_intercept(svm, INTERCEPT_VINTR);
 
@@ -2142,7 +2229,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
                 * Likewise, clear the VINTR intercept, we will set it
                 * again while processing KVM_REQ_EVENT if needed.
                 */
-               if (vgif_enabled(svm))
+               if (vgif)
                        svm_clr_intercept(svm, INTERCEPT_STGI);
                if (svm_is_intercept(svm, INTERCEPT_VINTR))
                        svm_clear_vintr(svm);
@@ -2160,7 +2247,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
                 * in use, we still rely on the VINTR intercept (rather than
                 * STGI) to detect an open interrupt window.
                */
-               if (!vgif_enabled(svm))
+               if (!vgif)
                        svm_clear_vintr(svm);
        }
 }
@@ -2575,25 +2662,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_TSC_AUX:
                msr_info->data = svm->tsc_aux;
                break;
-       /*
-        * Nobody will change the following 5 values in the VMCB so we can
-        * safely return them on rdmsr. They will always be 0 until LBRV is
-        * implemented.
-        */
        case MSR_IA32_DEBUGCTLMSR:
-               msr_info->data = svm->vmcb->save.dbgctl;
-               break;
        case MSR_IA32_LASTBRANCHFROMIP:
-               msr_info->data = svm->vmcb->save.br_from;
-               break;
        case MSR_IA32_LASTBRANCHTOIP:
-               msr_info->data = svm->vmcb->save.br_to;
-               break;
        case MSR_IA32_LASTINTFROMIP:
-               msr_info->data = svm->vmcb->save.last_excp_from;
-               break;
        case MSR_IA32_LASTINTTOIP:
-               msr_info->data = svm->vmcb->save.last_excp_to;
+               msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
                break;
        case MSR_VM_HSAVE_PA:
                msr_info->data = svm->nested.hsave_msr;
@@ -2839,12 +2913,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                if (data & DEBUGCTL_RESERVED_BITS)
                        return 1;
 
-               svm->vmcb->save.dbgctl = data;
-               vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
-               if (data & (1ULL<<0))
-                       svm_enable_lbrv(vcpu);
+               if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
+                       svm->vmcb->save.dbgctl = data;
                else
-                       svm_disable_lbrv(vcpu);
+                       svm->vmcb01.ptr->save.dbgctl = data;
+
+               svm_update_lbrv(vcpu);
+
                break;
        case MSR_VM_HSAVE_PA:
                /*
@@ -2901,9 +2976,16 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
        svm_clear_vintr(to_svm(vcpu));
 
        /*
-        * For AVIC, the only reason to end up here is ExtINTs.
+        * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
         * In this case AVIC was temporarily disabled for
         * requesting the IRQ window and we have to re-enable it.
+        *
+        * If running nested, still remove the VM wide AVIC inhibit to
+        * support the case in which the interrupt window was requested when
+        * the vCPU was not running nested.
+        *
+        * All vCPUs that are still running nested will keep their AVIC
+        * inhibited due to the per-vCPU AVIC inhibition.
         */
        kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
 
@@ -2914,7 +2996,6 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
 static int pause_interception(struct kvm_vcpu *vcpu)
 {
        bool in_kernel;
-
        /*
         * CPL is not made available for an SEV-ES guest, therefore
         * vcpu->arch.preempted_in_kernel can never be true.  Just
@@ -2922,8 +3003,7 @@ static int pause_interception(struct kvm_vcpu *vcpu)
         */
        in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
 
-       if (!kvm_pause_in_guest(vcpu->kvm))
-               grow_ple_window(vcpu);
+       grow_ple_window(vcpu);
 
        kvm_vcpu_on_spin(vcpu, in_kernel);
        return kvm_skip_emulated_instruction(vcpu);
@@ -3496,14 +3576,20 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
         * enabled, the STGI interception will not occur. Enable the irq
         * window under the assumption that the hardware will set the GIF.
         */
-       if (vgif_enabled(svm) || gif_set(svm)) {
+       if (vgif || gif_set(svm)) {
                /*
                 * IRQ window is not needed when AVIC is enabled,
                 * unless we have pending ExtINT since it cannot be injected
-                * via AVIC. In such case, we need to temporarily disable AVIC,
+                * via AVIC. In such a case, KVM needs to temporarily disable AVIC,
                 * and fallback to injecting IRQ via V_IRQ.
+                *
+                * If running nested, AVIC is already locally inhibited
+                * on this vCPU, therefore there is no need to request
+                * the VM wide AVIC inhibition.
                 */
-               kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+               if (!is_guest_mode(vcpu))
+                       kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+
                svm_set_vintr(svm);
        }
 }
@@ -3516,7 +3602,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
                return; /* IRET will cause a vm exit */
 
        if (!gif_set(svm)) {
-               if (vgif_enabled(svm))
+               if (vgif)
                        svm_set_intercept(svm, INTERCEPT_STGI);
                return; /* STGI will cause a vm exit */
        }
@@ -3865,7 +3951,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
                hv_track_root_tdp(vcpu, root_hpa);
 
                cr3 = vcpu->arch.cr3;
-       } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+       } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) {
                cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
        } else {
                /* PCID in the guest should be impossible with a 32-bit MMU. */
@@ -3946,6 +4032,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                             guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
 
        svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
+       svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
+
+       svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
+
+       svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
+                       guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
+
+       svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
+                       guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
+
+       svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
 
        svm_recalc_instruction_intercepts(vcpu, svm);
 
@@ -3963,13 +4060,6 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                 */
                if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
                        kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
-
-               /*
-                * Currently, AVIC does not work with nested virtualization.
-                * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
-                */
-               if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
-                       kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_NESTED);
        }
        init_vmcb_after_set_cpuid(vcpu);
 }
@@ -4224,7 +4314,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
 
-       ret = nested_svm_vmexit(svm);
+       ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
        if (ret)
                return ret;
 
@@ -4321,7 +4411,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        if (!gif_set(svm)) {
-               if (vgif_enabled(svm))
+               if (vgif)
                        svm_set_intercept(svm, INTERCEPT_STGI);
                /* STGI will cause a vm exit */
        } else {
@@ -4605,7 +4695,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 
        .sched_in = svm_sched_in,
 
-       .pmu_ops = &amd_pmu_ops,
        .nested_ops = &svm_nested_ops,
 
        .deliver_interrupt = svm_deliver_interrupt,
@@ -4633,6 +4722,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .complete_emulated_msr = svm_complete_emulated_msr,
 
        .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
+       .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
 };
 
 /*
@@ -4696,6 +4786,20 @@ static __init void svm_set_cpu_caps(void)
                if (tsc_scaling)
                        kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
 
+               if (vls)
+                       kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
+               if (lbrv)
+                       kvm_cpu_cap_set(X86_FEATURE_LBRV);
+
+               if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
+                       kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
+
+               if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
+                       kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
+
+               if (vgif)
+                       kvm_cpu_cap_set(X86_FEATURE_VGIF);
+
                /* Nested VM can receive #VMEXIT instead of triggering #GP */
                kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
        }
@@ -4789,6 +4893,9 @@ static __init int svm_hardware_setup(void)
                          get_npt_level(), PG_LEVEL_1G);
        pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
 
+       /* Setup shadow_me_value and shadow_me_mask */
+       kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
+
        /* Note, SEV setup consumes npt_enabled. */
        sev_hardware_setup();
 
@@ -4807,15 +4914,20 @@ static __init int svm_hardware_setup(void)
                        nrips = false;
        }
 
-       enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
+       enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic);
 
        if (enable_apicv) {
-               pr_info("AVIC enabled\n");
+               if (!boot_cpu_has(X86_FEATURE_AVIC)) {
+                       pr_warn("AVIC is not supported in CPUID but force enabled");
+                       pr_warn("Your system might crash and burn");
+               } else
+                       pr_info("AVIC enabled\n");
 
                amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
        } else {
                svm_x86_ops.vcpu_blocking = NULL;
                svm_x86_ops.vcpu_unblocking = NULL;
+               svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
        }
 
        if (vls) {
@@ -4880,6 +4992,7 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
        .check_processor_compatibility = svm_check_processor_compat,
 
        .runtime_ops = &svm_x86_ops,
+       .pmu_ops = &amd_pmu_ops,
 };
 
 static int __init svm_init(void)
index 2d83845..21c5460 100644 (file)
 #define        IOPM_SIZE PAGE_SIZE * 3
 #define        MSRPM_SIZE PAGE_SIZE * 2
 
-#define MAX_DIRECT_ACCESS_MSRS 20
+#define MAX_DIRECT_ACCESS_MSRS 21
 #define MSRPM_OFFSETS  16
 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 extern bool npt_enabled;
+extern int vgif;
 extern bool intercept_smi;
 
 /*
@@ -231,9 +232,14 @@ struct vcpu_svm {
        unsigned int3_injected;
        unsigned long int3_rip;
 
-       /* cached guest cpuid flags for faster access */
+       /* optional nested SVM features that are enabled for this guest  */
        bool nrips_enabled                : 1;
        bool tsc_scaling_enabled          : 1;
+       bool v_vmload_vmsave_enabled      : 1;
+       bool lbrv_enabled                 : 1;
+       bool pause_filter_enabled         : 1;
+       bool pause_threshold_enabled      : 1;
+       bool vgif_enabled                 : 1;
 
        u32 ldr_reg;
        u32 dfr_reg;
@@ -452,44 +458,70 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
        return vmcb_is_intercept(&svm->vmcb->control, bit);
 }
 
-static inline bool vgif_enabled(struct vcpu_svm *svm)
+static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
 {
-       return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
+       return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
+}
+
+static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
+{
+       if (!vgif)
+               return NULL;
+
+       if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm))
+               return svm->nested.vmcb02.ptr;
+       else
+               return svm->vmcb01.ptr;
 }
 
 static inline void enable_gif(struct vcpu_svm *svm)
 {
-       if (vgif_enabled(svm))
-               svm->vmcb->control.int_ctl |= V_GIF_MASK;
+       struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+       if (vmcb)
+               vmcb->control.int_ctl |= V_GIF_MASK;
        else
                svm->vcpu.arch.hflags |= HF_GIF_MASK;
 }
 
 static inline void disable_gif(struct vcpu_svm *svm)
 {
-       if (vgif_enabled(svm))
-               svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
+       struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+       if (vmcb)
+               vmcb->control.int_ctl &= ~V_GIF_MASK;
        else
                svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
 }
 
 static inline bool gif_set(struct vcpu_svm *svm)
 {
-       if (vgif_enabled(svm))
-               return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
+       struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+       if (vmcb)
+               return !!(vmcb->control.int_ctl & V_GIF_MASK);
        else
                return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
 
+static inline bool nested_npt_enabled(struct vcpu_svm *svm)
+{
+       return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
+}
+
 /* svm.c */
 #define MSR_INVALID                            0xffffffffU
 
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+
 extern bool dump_invalid_vmcb;
 
 u32 svm_msrpm_offset(u32 msr);
 u32 *svm_vcpu_alloc_msrpm(void);
 void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
 void svm_vcpu_free_msrpm(u32 *msrpm);
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
+void svm_update_lbrv(struct kvm_vcpu *vcpu);
 
 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
@@ -574,7 +606,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
 int avic_ga_log_notifier(u32 ga_tag);
 void avic_vm_destroy(struct kvm *kvm);
 int avic_vm_init(struct kvm *kvm);
-void avic_init_vmcb(struct vcpu_svm *svm);
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
 int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
 int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
 int avic_init_vcpu(struct vcpu_svm *svm);
@@ -592,6 +624,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
 void avic_ring_doorbell(struct kvm_vcpu *vcpu);
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
 
 /* sev.c */
 
index e3a24b8..de47625 100644 (file)
@@ -1459,6 +1459,26 @@ TRACE_EVENT(kvm_avic_ga_log,
                  __entry->vmid, __entry->vcpuid)
 );
 
+TRACE_EVENT(kvm_avic_kick_vcpu_slowpath,
+           TP_PROTO(u32 icrh, u32 icrl, u32 index),
+           TP_ARGS(icrh, icrl, index),
+
+       TP_STRUCT__entry(
+               __field(u32, icrh)
+               __field(u32, icrl)
+               __field(u32, index)
+       ),
+
+       TP_fast_assign(
+               __entry->icrh = icrh;
+               __entry->icrl = icrl;
+               __entry->index = index;
+       ),
+
+       TP_printk("icrh:icrl=%#08x:%08x, index=%u",
+                 __entry->icrh, __entry->icrl, __entry->index)
+);
+
 TRACE_EVENT(kvm_hv_timer_state,
                TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
                TP_ARGS(vcpu_id, hv_timer_in_use),
index 856c875..f5cb18e 100644 (file)
@@ -476,24 +476,23 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
        return 0;
 }
 
-
-static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
-               struct x86_exception *fault)
+static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
+                                                   struct x86_exception *fault)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
        WARN_ON(!is_guest_mode(vcpu));
 
        if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
-               !to_vmx(vcpu)->nested.nested_run_pending) {
+           !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
                vmcs12->vm_exit_intr_error_code = fault->error_code;
                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
                                  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
                                  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
                                  fault->address);
-       } else {
-               kvm_inject_page_fault(vcpu, fault);
+               return true;
        }
+       return false;
 }
 
 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
@@ -2614,9 +2613,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
        }
 
-       if (!enable_ept)
-               vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
-
        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
            WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
                                     vmcs12->guest_ia32_perf_global_ctrl))) {
@@ -3695,12 +3691,34 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 }
 
 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
-                                     struct vmcs12 *vmcs12)
+                                     struct vmcs12 *vmcs12,
+                                     u32 vm_exit_reason, u32 exit_intr_info)
 {
        u32 idt_vectoring;
        unsigned int nr;
 
-       if (vcpu->arch.exception.injected) {
+       /*
+        * Per the SDM, VM-Exits due to double and triple faults are never
+        * considered to occur during event delivery, even if the double/triple
+        * fault is the result of an escalating vectoring issue.
+        *
+        * Note, the SDM qualifies the double fault behavior with "The original
+        * event results in a double-fault exception".  It's unclear why the
+        * qualification exists since exits due to double fault can occur only
+        * while vectoring a different exception (injected events are never
+        * subject to interception), i.e. there's _always_ an original event.
+        *
+        * The SDM also uses NMI as a confusing example for the "original event
+        * causes the VM exit directly" clause.  NMI isn't special in any way,
+        * the same rule applies to all events that cause an exit directly.
+        * NMI is an odd choice for the example because NMIs can only occur on
+        * instruction boundaries, i.e. they _can't_ occur during vectoring.
+        */
+       if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
+           ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
+            is_double_fault(exit_intr_info))) {
+               vmcs12->idt_vectoring_info_field = 0;
+       } else if (vcpu->arch.exception.injected) {
                nr = vcpu->arch.exception.nr;
                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
 
@@ -3733,6 +3751,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
                        idt_vectoring |= INTR_TYPE_EXT_INTR;
 
                vmcs12->idt_vectoring_info_field = idt_vectoring;
+       } else {
+               vmcs12->idt_vectoring_info_field = 0;
        }
 }
 
@@ -4202,12 +4222,12 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        if (to_vmx(vcpu)->exit_reason.enclave_mode)
                vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
        vmcs12->exit_qualification = exit_qualification;
-       vmcs12->vm_exit_intr_info = exit_intr_info;
-
-       vmcs12->idt_vectoring_info_field = 0;
-       vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-       vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 
+       /*
+        * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
+        * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
+        * exit info fields are unmodified.
+        */
        if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
                vmcs12->launch_state = 1;
 
@@ -4219,7 +4239,12 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                 * Transfer the event that L0 or L1 may have wanted to inject into
                 * L2 to IDT_VECTORING_INFO_FIELD.
                 */
-               vmcs12_save_pending_event(vcpu, vmcs12);
+               vmcs12_save_pending_event(vcpu, vmcs12,
+                                         vm_exit_reason, exit_intr_info);
+
+               vmcs12->vm_exit_intr_info = exit_intr_info;
+               vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+               vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 
                /*
                 * According to spec, there's no need to store the guest's
@@ -4518,9 +4543,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
-       /* Similarly, triple faults in L2 should never escape. */
-       WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
-
        if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
                /*
                 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
@@ -6809,6 +6831,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 struct kvm_x86_nested_ops vmx_nested_ops = {
        .leave_nested = vmx_leave_nested,
        .check_events = vmx_check_nested_events,
+       .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
        .hv_timer_pending = nested_vmx_preemption_timer_pending,
        .triple_fault = nested_vmx_triple_fault,
        .get_state = vmx_get_nested_state,
index b82b670..37e9eb3 100644 (file)
@@ -719,7 +719,7 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
                intel_pmu_release_guest_lbr_event(vcpu);
 }
 
-struct kvm_pmu_ops intel_pmu_ops = {
+struct kvm_pmu_ops intel_pmu_ops __initdata = {
        .pmc_perf_hw_id = intel_pmc_perf_hw_id,
        .pmc_is_enabled = intel_pmc_is_enabled,
        .pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
index 3834bb3..07e5fcf 100644 (file)
@@ -202,16 +202,17 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
 void pi_wakeup_handler(void)
 {
        int cpu = smp_processor_id();
+       struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu);
+       raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu);
        struct vcpu_vmx *vmx;
 
-       raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
-       list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu),
-                           pi_wakeup_list) {
+       raw_spin_lock(spinlock);
+       list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) {
 
                if (pi_test_on(&vmx->pi_desc))
                        kvm_vcpu_wake_up(&vmx->vcpu);
        }
-       raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
+       raw_spin_unlock(spinlock);
 }
 
 void __init pi_init_cpu(int cpu)
@@ -311,7 +312,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
                        continue;
                }
 
-               vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc);
+               vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                vcpu_info.vector = irq.vector;
 
                trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
index e325c29..2b9d7a7 100644 (file)
@@ -104,6 +104,11 @@ static inline bool is_breakpoint(u32 intr_info)
        return is_exception_n(intr_info, BP_VECTOR);
 }
 
+static inline bool is_double_fault(u32 intr_info)
+{
+       return is_exception_n(intr_info, DF_VECTOR);
+}
+
 static inline bool is_page_fault(u32 intr_info)
 {
        return is_exception_n(intr_info, PF_VECTOR);
index 610355b..f5aeade 100644 (file)
@@ -2444,7 +2444,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                                &_cpu_based_exec_control) < 0)
                return -EIO;
 #ifdef CONFIG_X86_64
-       if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+       if (_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)
                _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
                                           ~CPU_BASED_CR8_STORE_EXITING;
 #endif
@@ -2948,7 +2948,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
 
        if (enable_ept)
                ept_sync_context(construct_eptp(vcpu, root_hpa,
-                                               mmu->shadow_root_level));
+                                               mmu->root_role.level));
        else
                vpid_sync_context(vmx_get_current_vpid(vcpu));
 }
@@ -4385,7 +4385,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
        if (cpu_has_secondary_exec_ctrls())
                secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
 
-       if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
+       if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
                vmcs_write64(EOI_EXIT_BITMAP0, 0);
                vmcs_write64(EOI_EXIT_BITMAP1, 0);
                vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -5410,9 +5410,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
        error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
                      ? PFERR_FETCH_MASK : 0;
        /* ept page table entry is present? */
-       error_code |= (exit_qualification &
-                      (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
-                       EPT_VIOLATION_EXECUTABLE))
+       error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
                      ? PFERR_PRESENT_MASK : 0;
 
        error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
@@ -7823,7 +7821,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .cpu_dirty_log_size = PML_ENTITY_NUM,
        .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
 
-       .pmu_ops = &intel_pmu_ops,
        .nested_ops = &vmx_nested_ops,
 
        .pi_update_irte = vmx_pi_update_irte,
@@ -7856,7 +7853,7 @@ static unsigned int vmx_handle_intel_pt_intr(void)
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
 
        /* '0' on failure so that the !PT case can use a RET0 static call. */
-       if (!kvm_arch_pmi_in_guest(vcpu))
+       if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
                return 0;
 
        kvm_make_request(KVM_REQ_PMI, vcpu);
@@ -7891,6 +7888,31 @@ static __init void vmx_setup_user_return_msrs(void)
                kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
 }
 
+static void __init vmx_setup_me_spte_mask(void)
+{
+       u64 me_mask = 0;
+
+       /*
+        * kvm_get_shadow_phys_bits() returns shadow_phys_bits.  Use
+        * the former to avoid exposing shadow_phys_bits.
+        *
+        * On pre-MKTME systems, boot_cpu_data.x86_phys_bits equals
+        * shadow_phys_bits.  On MKTME and/or TDX capable systems,
+        * boot_cpu_data.x86_phys_bits holds the actual physical address
+        * width w/o the KeyID bits, and shadow_phys_bits equals MAXPHYADDR
+        * reported by CPUID.  The bits in between are KeyID bits.
+        */
+       if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
+               me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
+                       kvm_get_shadow_phys_bits() - 1);
+       /*
+        * Unlike SME, the host kernel doesn't support setting up any
+        * MKTME KeyID on Intel platforms.  No memory encryption bits
+        * should be included in the SPTE.
+        */
+       kvm_mmu_set_me_spte_mask(0, me_mask);
+}
+
 static struct kvm_x86_init_ops vmx_init_ops __initdata;
 
 static __init int hardware_setup(void)
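
As a worked, hypothetical example of the mask vmx_setup_me_spte_mask() computes: on a machine whose CPUID MAXPHYADDR is 52 but whose boot_cpu_data.x86_phys_bits was reduced to 46 by the KeyID configuration, bits 46..51 are KeyID bits, giving a mask of 0xfc00000000000. A self-contained sketch (rsvd_bits() reimplemented here just for the example):

#include <stdint.h>
#include <stdio.h>

/* GENMASK_ULL-style helper, standing in for the kernel's rsvd_bits(s, e). */
static uint64_t rsvd_bits(int s, int e)
{
	return ((~0ULL) >> (63 - e)) & ~((1ULL << s) - 1);
}

int main(void)
{
	int x86_phys_bits = 46;		/* hypothetical, KeyID bits already stripped */
	int shadow_phys_bits = 52;	/* hypothetical CPUID MAXPHYADDR */
	uint64_t me_mask = 0;

	if (x86_phys_bits != shadow_phys_bits)
		me_mask = rsvd_bits(x86_phys_bits, shadow_phys_bits - 1);

	printf("me_mask = %#llx\n", (unsigned long long)me_mask);	/* 0xfc00000000000 */
	return 0;
}
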
@@ -7993,6 +8015,12 @@ static __init int hardware_setup(void)
                kvm_mmu_set_ept_masks(enable_ept_ad_bits,
                                      cpu_has_vmx_ept_execute_only());
 
+       /*
+        * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
+        * bits to shadow_zero_check.
+        */
+       vmx_setup_me_spte_mask();
+
        kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
                          ept_caps_to_lpage_level(vmx_capability.ept));
 
@@ -8077,6 +8105,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = {
        .handle_intel_pt_intr = NULL,
 
        .runtime_ops = &vmx_x86_ops,
+       .pmu_ops = &intel_pmu_ops,
 };
 
 static void vmx_cleanup_l1d_flush(void)
index 4790f0d..b81ef4f 100644 (file)
@@ -266,7 +266,12 @@ const struct kvm_stats_header kvm_vm_stats_header = {
 
 const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        KVM_GENERIC_VCPU_STATS(),
+       STATS_DESC_COUNTER(VCPU, pf_taken),
        STATS_DESC_COUNTER(VCPU, pf_fixed),
+       STATS_DESC_COUNTER(VCPU, pf_emulate),
+       STATS_DESC_COUNTER(VCPU, pf_spurious),
+       STATS_DESC_COUNTER(VCPU, pf_fast),
+       STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created),
        STATS_DESC_COUNTER(VCPU, pf_guest),
        STATS_DESC_COUNTER(VCPU, tlb_flush),
        STATS_DESC_COUNTER(VCPU, invlpg),
@@ -748,6 +753,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
+/* Returns true if the page fault was immediately morphed into a VM-Exit. */
 bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                                    struct x86_exception *fault)
 {
@@ -766,8 +772,26 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
                                       fault_mmu->root.hpa);
 
+       /*
+        * A workaround for KVM's bad exception handling.  If KVM injected an
+        * exception into L2, and L2 encountered a #PF while vectoring the
+        * injected exception, manually check to see if L1 wants to intercept
+        * #PF, otherwise queuing the #PF will lead to #DF or a lost exception.
+        * In all other cases, defer the check to nested_ops->check_events(),
+        * which will correctly handle priority (this does not).  Note, other
+        * exceptions, e.g. #GP, are theoretically affected, #PF is simply the
+        * most problematic, e.g. when L0 and L1 are both intercepting #PF for
+        * shadow paging.
+        *
+        * TODO: Rewrite exception handling to track injected and pending
+        *       (VM-Exit) exceptions separately.
+        */
+       if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) &&
+           kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
+               return true;
+
        fault_mmu->inject_page_fault(vcpu, fault);
-       return fault->nested_page_fault;
+       return false;
 }
 EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
 
@@ -961,11 +985,13 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
                        wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
        }
 
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (static_cpu_has(X86_FEATURE_PKU) &&
-           (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
-            (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
-           vcpu->arch.pkru != vcpu->arch.host_pkru)
+           vcpu->arch.pkru != vcpu->arch.host_pkru &&
+           ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+            kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
                write_pkru(vcpu->arch.pkru);
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
 }
 EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
 
@@ -974,13 +1000,15 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
        if (vcpu->arch.guest_state_protected)
                return;
 
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (static_cpu_has(X86_FEATURE_PKU) &&
-           (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
-            (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
+           ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+            kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
                vcpu->arch.pkru = rdpkru();
                if (vcpu->arch.pkru != vcpu->arch.host_pkru)
                        write_pkru(vcpu->arch.host_pkru);
        }
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
 
        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
 
@@ -2249,14 +2277,13 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
        kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
 
        /* we verify if the enable bit is set... */
-       vcpu->arch.pv_time_enabled = false;
-       if (!(system_time & 1))
-               return;
-
-       if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
-                                      &vcpu->arch.pv_time, system_time & ~1ULL,
-                                      sizeof(struct pvclock_vcpu_time_info)))
-               vcpu->arch.pv_time_enabled = true;
+       if (system_time & 1) {
+               kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
+                                         KVM_HOST_USES_PFN, system_time & ~1ULL,
+                                         sizeof(struct pvclock_vcpu_time_info));
+       } else {
+               kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
+       }
 
        return;
 }
@@ -2961,63 +2988,55 @@ u64 get_kvmclock_ns(struct kvm *kvm)
        return data.clock;
 }
 
-static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
-                                  struct gfn_to_hva_cache *cache,
-                                  unsigned int offset)
+static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
+                                   struct gfn_to_pfn_cache *gpc,
+                                   unsigned int offset)
 {
        struct kvm_vcpu_arch *vcpu = &v->arch;
-       struct pvclock_vcpu_time_info guest_hv_clock;
+       struct pvclock_vcpu_time_info *guest_hv_clock;
+       unsigned long flags;
 
-       if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache,
-               &guest_hv_clock, offset, sizeof(guest_hv_clock))))
-               return;
+       read_lock_irqsave(&gpc->lock, flags);
+       while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+                                          offset + sizeof(*guest_hv_clock))) {
+               read_unlock_irqrestore(&gpc->lock, flags);
 
-       /* This VCPU is paused, but it's legal for a guest to read another
+               if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+                                                offset + sizeof(*guest_hv_clock)))
+                       return;
+
+               read_lock_irqsave(&gpc->lock, flags);
+       }
+
+       guest_hv_clock = (void *)(gpc->khva + offset);
+
+       /*
+        * This VCPU is paused, but it's legal for a guest to read another
         * VCPU's kvmclock, so we really have to follow the specification where
         * it says that version is odd if data is being modified, and even after
         * it is consistent.
-        *
-        * Version field updates must be kept separate.  This is because
-        * kvm_write_guest_cached might use a "rep movs" instruction, and
-        * writes within a string instruction are weakly ordered.  So there
-        * are three writes overall.
-        *
-        * As a small optimization, only write the version field in the first
-        * and third write.  The vcpu->pv_time cache is still valid, because the
-        * version field is the first in the struct.
         */
-       BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
-       if (guest_hv_clock.version & 1)
-               ++guest_hv_clock.version;  /* first time write, random junk */
-
-       vcpu->hv_clock.version = guest_hv_clock.version + 1;
-       kvm_write_guest_offset_cached(v->kvm, cache,
-                                     &vcpu->hv_clock, offset,
-                                     sizeof(vcpu->hv_clock.version));
 
+       guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
        smp_wmb();
 
        /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-       vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+       vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
 
        if (vcpu->pvclock_set_guest_stopped_request) {
                vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
                vcpu->pvclock_set_guest_stopped_request = false;
        }
 
-       trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+       memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
+       smp_wmb();
 
-       kvm_write_guest_offset_cached(v->kvm, cache,
-                                     &vcpu->hv_clock, offset,
-                                     sizeof(vcpu->hv_clock));
+       guest_hv_clock->version = ++vcpu->hv_clock.version;
 
-       smp_wmb();
+       mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+       read_unlock_irqrestore(&gpc->lock, flags);
 
-       vcpu->hv_clock.version++;
-       kvm_write_guest_offset_cached(v->kvm, cache,
-                                    &vcpu->hv_clock, offset,
-                                    sizeof(vcpu->hv_clock.version));
+       trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
 }
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
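
The version handshake referred to in the comment above is the usual pvclock seqcount: the producer makes the version odd while updating and even once the data is consistent, and a reader retries until it sees the same even version before and after its reads. A guest-side reader sketch (struct trimmed to the pvclock ABI fields used here; function name invented):

#include <stdint.h>

struct pvclock_vcpu_time_info {
	uint32_t version;
	uint32_t pad0;
	uint64_t tsc_timestamp;
	uint64_t system_time;
	uint32_t tsc_to_system_mul;
	int8_t   tsc_shift;
	uint8_t  flags;
	uint8_t  pad[2];
};

static uint64_t pvclock_read_system_time(const volatile struct pvclock_vcpu_time_info *ti)
{
	uint32_t version;
	uint64_t time;

	do {
		version = ti->version;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);	/* order against the data read */
		time = ti->system_time;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);	/* order against the version re-read */
	} while ((version & 1) || version != ti->version);

	return time;
}
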
@@ -3106,13 +3125,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
        vcpu->hv_clock.flags = pvclock_flags;
 
-       if (vcpu->pv_time_enabled)
-               kvm_setup_pvclock_page(v, &vcpu->pv_time, 0);
-       if (vcpu->xen.vcpu_info_set)
-               kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
-                                      offsetof(struct compat_vcpu_info, time));
-       if (vcpu->xen.vcpu_time_info_set)
-               kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
+       if (vcpu->pv_time.active)
+               kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+       if (vcpu->xen.vcpu_info_cache.active)
+               kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
+                                       offsetof(struct compat_vcpu_info, time));
+       if (vcpu->xen.vcpu_time_info_cache.active)
+               kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
        kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
        return 0;
 }
@@ -3300,7 +3319,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
 
 static void kvmclock_reset(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.pv_time_enabled = false;
+       kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
        vcpu->arch.time = 0;
 }
 
@@ -4284,7 +4303,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
                    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
                    KVM_XEN_HVM_CONFIG_SHARED_INFO |
-                   KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL;
+                   KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
+                   KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
                if (sched_info_on())
                        r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
                break;
@@ -4331,6 +4351,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = boot_cpu_has(X86_FEATURE_XSAVE);
                break;
        case KVM_CAP_TSC_CONTROL:
+       case KVM_CAP_VM_TSC_CONTROL:
                r = kvm_has_tsc_control;
                break;
        case KVM_CAP_X2APIC_API:
@@ -5102,7 +5123,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
  */
 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
 {
-       if (!vcpu->arch.pv_time_enabled)
+       if (!vcpu->arch.pv_time.active)
                return -EINVAL;
        vcpu->arch.pvclock_set_guest_stopped_request = true;
        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -6186,7 +6207,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
 
        mutex_lock(&kvm->lock);
        kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (!vcpu->arch.pv_time_enabled)
+               if (!vcpu->arch.pv_time.active)
                        continue;
 
                ret = kvm_set_guest_paused(vcpu);
@@ -6513,6 +6534,15 @@ set_pit2_out:
                r = kvm_xen_hvm_set_attr(kvm, &xha);
                break;
        }
+       case KVM_XEN_HVM_EVTCHN_SEND: {
+               struct kvm_irq_routing_xen_evtchn uxe;
+
+               r = -EFAULT;
+               if (copy_from_user(&uxe, argp, sizeof(uxe)))
+                       goto out;
+               r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
+               break;
+       }
 #endif
        case KVM_SET_CLOCK:
                r = kvm_vm_ioctl_set_clock(kvm, argp);
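
A userspace sketch of driving the new KVM_XEN_HVM_EVTCHN_SEND ioctl (illustrative only: the port and vCPU values are made up, the struct and priority constant come from the KVM uapi, and availability should first be probed via the KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag advertised by KVM_CAP_XEN_HVM above):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Deliver event-channel port 3 to vCPU 0 via the 2-level ABI; vm_fd is an open VM fd. */
static int xen_evtchn_send(int vm_fd)
{
	struct kvm_irq_routing_xen_evtchn evt = {
		.port     = 3,
		.vcpu     = 0,
		.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
	};

	return ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &evt);
}
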
@@ -6520,6 +6550,28 @@ set_pit2_out:
        case KVM_GET_CLOCK:
                r = kvm_vm_ioctl_get_clock(kvm, argp);
                break;
+       case KVM_SET_TSC_KHZ: {
+               u32 user_tsc_khz;
+
+               r = -EINVAL;
+               user_tsc_khz = (u32)arg;
+
+               if (kvm_has_tsc_control &&
+                   user_tsc_khz >= kvm_max_guest_tsc_khz)
+                       goto out;
+
+               if (user_tsc_khz == 0)
+                       user_tsc_khz = tsc_khz;
+
+               WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+               r = 0;
+
+               goto out;
+       }
+       case KVM_GET_TSC_KHZ: {
+               r = READ_ONCE(kvm->arch.default_tsc_khz);
+               goto out;
+       }
        case KVM_MEMORY_ENCRYPT_OP: {
                r = -ENOTTY;
                if (!kvm_x86_ops.mem_enc_ioctl)
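
A userspace sketch of the VM-scoped TSC frequency ioctls added above (illustrative only: the 2 GHz value is arbitrary, the helper name is invented, and support should be probed with KVM_CAP_VM_TSC_CONTROL first):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Set a default guest TSC frequency of 2 GHz for the VM, then read it back. */
static long set_vm_tsc_khz(int vm_fd)
{
	if (ioctl(vm_fd, KVM_SET_TSC_KHZ, 2000000UL) < 0)
		return -1;

	return ioctl(vm_fd, KVM_GET_TSC_KHZ, 0);	/* frequency in kHz, or < 0 on error */
}
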
@@ -7229,15 +7281,8 @@ static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
                                   exception, &write_emultor);
 }
 
-#define CMPXCHG_TYPE(t, ptr, old, new) \
-       (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
-
-#ifdef CONFIG_X86_64
-#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
-#else
-#  define CMPXCHG64(ptr, old, new) \
-       (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
-#endif
+#define emulator_try_cmpxchg_user(t, ptr, old, new) \
+       (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t))
 
 static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
                                     unsigned long addr,
@@ -7246,12 +7291,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
                                     unsigned int bytes,
                                     struct x86_exception *exception)
 {
-       struct kvm_host_map map;
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        u64 page_line_mask;
+       unsigned long hva;
        gpa_t gpa;
-       char *kaddr;
-       bool exchanged;
+       int r;
 
        /* guests cmpxchg8b have to be emulated atomically */
        if (bytes > 8 || (bytes & (bytes - 1)))
@@ -7275,31 +7319,32 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
        if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
                goto emul_write;
 
-       if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
+       hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
+       if (kvm_is_error_hva(hva))
                goto emul_write;
 
-       kaddr = map.hva + offset_in_page(gpa);
+       hva += offset_in_page(gpa);
 
        switch (bytes) {
        case 1:
-               exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+               r = emulator_try_cmpxchg_user(u8, hva, old, new);
                break;
        case 2:
-               exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+               r = emulator_try_cmpxchg_user(u16, hva, old, new);
                break;
        case 4:
-               exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+               r = emulator_try_cmpxchg_user(u32, hva, old, new);
                break;
        case 8:
-               exchanged = CMPXCHG64(kaddr, old, new);
+               r = emulator_try_cmpxchg_user(u64, hva, old, new);
                break;
        default:
                BUG();
        }
 
-       kvm_vcpu_unmap(vcpu, &map, true);
-
-       if (!exchanged)
+       if (r < 0)
+               return X86EMUL_UNHANDLEABLE;
+       if (r)
                return X86EMUL_CMPXCHG_FAILED;
 
        kvm_page_track_write(vcpu, gpa, new, bytes);
@@ -8061,7 +8106,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
            WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
                return false;
 
-       if (!vcpu->arch.mmu->direct_map) {
+       if (!vcpu->arch.mmu->root_role.direct) {
                /*
                 * Write permission should be allowed since only
                 * write access need to be emulated.
@@ -8094,7 +8139,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        kvm_release_pfn_clean(pfn);
 
        /* The instructions are well-emulated on direct mmu. */
-       if (vcpu->arch.mmu->direct_map) {
+       if (vcpu->arch.mmu->root_role.direct) {
                unsigned int indirect_shadow_pages;
 
                write_lock(&vcpu->kvm->mmu_lock);
@@ -8162,7 +8207,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
        vcpu->arch.last_retry_eip = ctxt->eip;
        vcpu->arch.last_retry_addr = cr2_or_gpa;
 
-       if (!vcpu->arch.mmu->direct_map)
+       if (!vcpu->arch.mmu->root_role.direct)
                gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
 
        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -8251,7 +8296,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
 
-static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
+static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
 {
        if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
            (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
@@ -8320,25 +8365,23 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
 }
 
 /*
- * Decode to be emulated instruction. Return EMULATION_OK if success.
+ * Decode an instruction for emulation.  The caller is responsible for handling
+ * code breakpoints.  Note, manually detecting code breakpoints is unnecessary
+ * (and wrong) when emulating on an intercepted fault-like exception[*], as
+ * code breakpoints have higher priority and thus have already been checked by
+ * hardware.
+ *
+ * [*] Except #MC, which is higher priority, but KVM should never emulate in
+ *     response to a machine check.
  */
 int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
                                    void *insn, int insn_len)
 {
-       int r = EMULATION_OK;
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+       int r;
 
        init_emulate_ctxt(vcpu);
 
-       /*
-        * We will reenter on the same instruction since we do not set
-        * complete_userspace_io. This does not handle watchpoints yet,
-        * those would be handled in the emulate_ops.
-        */
-       if (!(emulation_type & EMULTYPE_SKIP) &&
-           kvm_vcpu_check_breakpoint(vcpu, &r))
-               return r;
-
        r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
 
        trace_kvm_emulate_insn_start(vcpu);
@@ -8371,6 +8414,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
                kvm_clear_exception_queue(vcpu);
 
+               /*
+                * Return immediately if RIP hits a code breakpoint, such #DBs
+                * are fault-like and are higher priority than any faults on
+                * the code fetch itself.
+                */
+               if (!(emulation_type & EMULTYPE_SKIP) &&
+                   kvm_vcpu_check_code_breakpoint(vcpu, &r))
+                       return r;
+
                r = x86_decode_emulated_instruction(vcpu, emulation_type,
                                                    insn, insn_len);
                if (r != EMULATION_OK)  {
@@ -8442,7 +8494,7 @@ restart:
                ctxt->exception.address = cr2_or_gpa;
 
                /* With shadow page tables, cr2 contains a GVA or nGPA. */
-               if (vcpu->arch.mmu->direct_map) {
+               if (vcpu->arch.mmu->root_role.direct) {
                        ctxt->gpa_available = true;
                        ctxt->gpa_val = cr2_or_gpa;
                }
@@ -8789,22 +8841,22 @@ static int kvmclock_cpu_online(unsigned int cpu)
 
 static void kvm_timer_init(void)
 {
-       max_tsc_khz = tsc_khz;
-
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-#ifdef CONFIG_CPU_FREQ
-               struct cpufreq_policy *policy;
-               int cpu;
-
-               cpu = get_cpu();
-               policy = cpufreq_cpu_get(cpu);
-               if (policy) {
-                       if (policy->cpuinfo.max_freq)
-                               max_tsc_khz = policy->cpuinfo.max_freq;
-                       cpufreq_cpu_put(policy);
+               max_tsc_khz = tsc_khz;
+
+               if (IS_ENABLED(CONFIG_CPU_FREQ)) {
+                       struct cpufreq_policy *policy;
+                       int cpu;
+
+                       cpu = get_cpu();
+                       policy = cpufreq_cpu_get(cpu);
+                       if (policy) {
+                               if (policy->cpuinfo.max_freq)
+                                       max_tsc_khz = policy->cpuinfo.max_freq;
+                               cpufreq_cpu_put(policy);
+                       }
+                       put_cpu();
                }
-               put_cpu();
-#endif
                cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
                                          CPUFREQ_TRANSITION_NOTIFIER);
        }
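
The hunk above trades the #ifdef CONFIG_CPU_FREQ block for IS_ENABLED(), so the cpufreq path is now compiled and type-checked in every configuration and is simply discarded as dead code when CONFIG_CPU_FREQ=n; that only works because the cpufreq helpers used here are still declared (or stubbed) in that configuration. A minimal sketch of the idiom, using hypothetical CONFIG_FOO / foo_query() names rather than anything from this patch:

	/* foo_query() must at least be declared (and stubbed or dead-code
	 * eliminated) when CONFIG_FOO=n, since this call is always compiled. */
	if (IS_ENABLED(CONFIG_FOO)) {
		int val = foo_query();	/* type-checked in all configs */

		if (val)
			pr_info("foo reports %d\n", val);
	}
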
@@ -9089,6 +9141,14 @@ bool kvm_apicv_activated(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_apicv_activated);
 
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
+{
+       ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
+       ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
+
+       return (vm_reasons | vcpu_reasons) == 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
 
 static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
                                       enum kvm_apicv_inhibit reason, bool set)
@@ -9266,6 +9326,17 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
        char instruction[3];
        unsigned long rip = kvm_rip_read(vcpu);
 
+       /*
+        * If the quirk is disabled, synthesize a #UD and let the guest pick up
+        * the pieces.
+        */
+       if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
+               ctxt->exception.error_code_valid = false;
+               ctxt->exception.vector = UD_VECTOR;
+               ctxt->have_exception = true;
+               return X86EMUL_PROPAGATE_FAULT;
+       }
+
        static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
 
        return emulator_write_emulated(ctxt, rip, instruction, 3,
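
For context, the quirk tested above is cleared from userspace through the quirk-disable capability; a rough sketch, assuming the quirk is exposed via KVM_CAP_DISABLE_QUIRKS2 like other opt-out quirks (vm_fd and the error handling are illustrative, not from this patch):

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_DISABLE_QUIRKS2,
		.args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN,
	};

	/* After this, a mismatched guest VMCALL/VMMCALL yields #UD instead of
	 * KVM rewriting the instruction. */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		perror("KVM_ENABLE_CAP(KVM_CAP_DISABLE_QUIRKS2)");
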
@@ -9763,7 +9834,8 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 
        down_read(&vcpu->kvm->arch.apicv_update_lock);
 
-       activate = kvm_apicv_activated(vcpu->kvm);
+       activate = kvm_vcpu_apicv_activated(vcpu);
+
        if (vcpu->arch.apicv_active == activate)
                goto out;
 
@@ -10171,7 +10243,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                 * per-VM state, and responding vCPUs must wait for the update
                 * to complete before servicing KVM_REQ_APICV_UPDATE.
                 */
-               WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+               WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu));
 
                exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
                if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
@@ -10250,14 +10322,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         */
        guest_timing_exit_irqoff();
 
-       if (lapic_in_kernel(vcpu)) {
-               s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
-               if (delta != S64_MIN) {
-                       trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
-                       vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
-               }
-       }
-
        local_irq_enable();
        preempt_enable();
 
@@ -10368,6 +10432,9 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
                        break;
 
                kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
+               if (kvm_xen_has_pending_events(vcpu))
+                       kvm_xen_inject_pending_events(vcpu);
+
                if (kvm_cpu_has_pending_timer(vcpu))
                        kvm_inject_pending_timer_irqs(vcpu);
 
@@ -11263,9 +11330,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
        vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
        vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
+       kvm_xen_init_vcpu(vcpu);
        kvm_vcpu_mtrr_init(vcpu);
        vcpu_load(vcpu);
-       kvm_set_tsc_khz(vcpu, max_tsc_khz);
+       kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
        kvm_vcpu_reset(vcpu, false);
        kvm_init_mmu(vcpu);
        vcpu_put(vcpu);
@@ -11320,6 +11388,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
        fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
 
+       kvm_xen_destroy_vcpu(vcpu);
        kvm_hv_vcpu_uninit(vcpu);
        kvm_pmu_destroy(vcpu);
        kfree(vcpu->arch.mce_banks);
@@ -11581,6 +11650,24 @@ void kvm_arch_hardware_disable(void)
        drop_user_return_notifiers();
 }
 
+static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+{
+       memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+#define __KVM_X86_OP(func) \
+       static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP(func) \
+       WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+       static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+                                          (void *)__static_call_return0);
+#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
+
+       kvm_pmu_ops_update(ops->pmu_ops);
+}
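
To make the macro block above concrete: asm/kvm-x86-ops.h is an X-macro list of vendor hooks, so each mandatory entry expands to a WARN_ON() plus a static_call_update(). Hand-expanding one entry, e.g. KVM_X86_OP(vcpu_run), gives roughly:

	WARN_ON(!kvm_x86_ops.vcpu_run);
	static_call_update(kvm_x86_vcpu_run, kvm_x86_ops.vcpu_run);

while a KVM_X86_OP_OPTIONAL_RET0() entry falls back to __static_call_return0 when the vendor module leaves the hook NULL.
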
+
 int kvm_arch_hardware_setup(void *opaque)
 {
        struct kvm_x86_init_ops *ops = opaque;
@@ -11595,8 +11682,7 @@ int kvm_arch_hardware_setup(void *opaque)
        if (r != 0)
                return r;
 
-       memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
-       kvm_ops_static_call_update();
+       kvm_ops_update(ops);
 
        kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
 
@@ -11712,6 +11798,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        pvclock_update_vm_gtod_copy(kvm);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
+       kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
        kvm->arch.guest_can_read_msr_platform_info = true;
        kvm->arch.enable_pmu = enable_pmu;
 
@@ -11747,20 +11834,15 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
        vcpu_put(vcpu);
 }
 
-static void kvm_free_vcpus(struct kvm *kvm)
+static void kvm_unload_vcpu_mmus(struct kvm *kvm)
 {
        unsigned long i;
        struct kvm_vcpu *vcpu;
 
-       /*
-        * Unpin any mmu pages first.
-        */
        kvm_for_each_vcpu(i, vcpu, kvm) {
                kvm_clear_async_pf_completion_queue(vcpu);
                kvm_unload_vcpu_mmu(vcpu);
        }
-
-       kvm_destroy_vcpus(kvm);
 }
 
 void kvm_arch_sync_events(struct kvm *kvm)
@@ -11866,11 +11948,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
                __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
                mutex_unlock(&kvm->slots_lock);
        }
+       kvm_unload_vcpu_mmus(kvm);
        static_call_cond(kvm_x86_vm_destroy)(kvm);
        kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
        kvm_pic_destroy(kvm);
        kvm_ioapic_destroy(kvm);
-       kvm_free_vcpus(kvm);
+       kvm_destroy_vcpus(kvm);
        kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
        kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
        kvm_mmu_uninit_vm(kvm);
@@ -12193,6 +12276,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
            kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
                return true;
 
+       if (kvm_xen_has_pending_events(vcpu))
+               return true;
+
+       if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu))
+               return true;
+
        return false;
 }
 
@@ -12290,25 +12379,6 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 }
 EXPORT_SYMBOL_GPL(kvm_set_rflags);
 
-void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
-{
-       int r;
-
-       if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
-             work->wakeup_all)
-               return;
-
-       r = kvm_mmu_reload(vcpu);
-       if (unlikely(r))
-               return;
-
-       if (!vcpu->arch.mmu->direct_map &&
-             work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
-               return;
-
-       kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
-}
-
 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
 {
        BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
@@ -13000,6 +13070,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
index bf6cc25..610beba 100644 (file)
@@ -9,17 +9,25 @@
 #include "x86.h"
 #include "xen.h"
 #include "hyperv.h"
+#include "lapic.h"
 
+#include <linux/eventfd.h>
 #include <linux/kvm_host.h>
 #include <linux/sched/stat.h>
 
 #include <trace/events/kvm.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/vcpu.h>
+#include <xen/interface/version.h>
 #include <xen/interface/event_channel.h>
+#include <xen/interface/sched.h>
 
 #include "trace.h"
 
+static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
+static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
+
 DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
 
 static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
@@ -102,6 +110,66 @@ out:
        return ret;
 }
 
+void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
+{
+       if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
+               struct kvm_xen_evtchn e;
+
+               e.vcpu_id = vcpu->vcpu_id;
+               e.vcpu_idx = vcpu->vcpu_idx;
+               e.port = vcpu->arch.xen.timer_virq;
+               e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+               kvm_xen_set_evtchn(&e, vcpu->kvm);
+
+               vcpu->arch.xen.timer_expires = 0;
+               atomic_set(&vcpu->arch.xen.timer_pending, 0);
+       }
+}
+
+static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
+{
+       struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
+                                            arch.xen.timer);
+       if (atomic_read(&vcpu->arch.xen.timer_pending))
+               return HRTIMER_NORESTART;
+
+       atomic_inc(&vcpu->arch.xen.timer_pending);
+       kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+       kvm_vcpu_kick(vcpu);
+
+       return HRTIMER_NORESTART;
+}
+
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
+{
+       atomic_set(&vcpu->arch.xen.timer_pending, 0);
+       vcpu->arch.xen.timer_expires = guest_abs;
+
+       if (delta_ns <= 0) {
+               xen_timer_callback(&vcpu->arch.xen.timer);
+       } else {
+               ktime_t ktime_now = ktime_get();
+               hrtimer_start(&vcpu->arch.xen.timer,
+                             ktime_add_ns(ktime_now, delta_ns),
+                             HRTIMER_MODE_ABS_HARD);
+       }
+}
+
+static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
+{
+       hrtimer_cancel(&vcpu->arch.xen.timer);
+       vcpu->arch.xen.timer_expires = 0;
+       atomic_set(&vcpu->arch.xen.timer_pending, 0);
+}
+
+static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
+{
+       hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC,
+                    HRTIMER_MODE_ABS_HARD);
+       vcpu->arch.xen.timer.function = xen_timer_callback;
+}
+
 static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
 {
        struct kvm_vcpu_xen *vx = &v->arch.xen;
@@ -133,27 +201,36 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
 void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 {
        struct kvm_vcpu_xen *vx = &v->arch.xen;
-       struct gfn_to_hva_cache *ghc = &vx->runstate_cache;
-       struct kvm_memslots *slots = kvm_memslots(v->kvm);
-       bool atomic = (state == RUNSTATE_runnable);
-       uint64_t state_entry_time;
-       int __user *user_state;
-       uint64_t __user *user_times;
+       struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
+       uint64_t *user_times;
+       unsigned long flags;
+       size_t user_len;
+       int *user_state;
 
        kvm_xen_update_runstate(v, state);
 
-       if (!vx->runstate_set)
+       if (!vx->runstate_cache.active)
                return;
 
-       if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) &&
-           kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len))
-               return;
+       if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
+               user_len = sizeof(struct vcpu_runstate_info);
+       else
+               user_len = sizeof(struct compat_vcpu_runstate_info);
 
-       /* We made sure it fits in a single page */
-       BUG_ON(!ghc->memslot);
+       read_lock_irqsave(&gpc->lock, flags);
+       while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+                                          user_len)) {
+               read_unlock_irqrestore(&gpc->lock, flags);
 
-       if (atomic)
-               pagefault_disable();
+               /* When invoked from kvm_sched_out() we cannot sleep */
+               if (state == RUNSTATE_runnable)
+                       return;
+
+               if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
+                       return;
+
+               read_lock_irqsave(&gpc->lock, flags);
+       }
 
        /*
         * The only difference between 32-bit and 64-bit versions of the
@@ -167,38 +244,33 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
         */
        BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
        BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
-       user_state = (int __user *)ghc->hva;
-
        BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
-
-       user_times = (uint64_t __user *)(ghc->hva +
-                                        offsetof(struct compat_vcpu_runstate_info,
-                                                 state_entry_time));
 #ifdef CONFIG_X86_64
        BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
                     offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
        BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
                     offsetof(struct compat_vcpu_runstate_info, time) + 4);
-
-       if (v->kvm->arch.xen.long_mode)
-               user_times = (uint64_t __user *)(ghc->hva +
-                                                offsetof(struct vcpu_runstate_info,
-                                                         state_entry_time));
 #endif
+
+       user_state = gpc->khva;
+
+       if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
+               user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
+                                                 state_entry_time);
+       else
+               user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
+                                                 state_entry_time);
+
        /*
         * First write the updated state_entry_time at the appropriate
         * location determined by 'offset'.
         */
-       state_entry_time = vx->runstate_entry_time;
-       state_entry_time |= XEN_RUNSTATE_UPDATE;
-
        BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
-                    sizeof(state_entry_time));
+                    sizeof(user_times[0]));
        BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
-                    sizeof(state_entry_time));
+                    sizeof(user_times[0]));
 
-       if (__put_user(state_entry_time, user_times))
-               goto out;
+       user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
        smp_wmb();
 
        /*
@@ -212,8 +284,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
        BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
                     sizeof(vx->current_runstate));
 
-       if (__put_user(vx->current_runstate, user_state))
-               goto out;
+       *user_state = vx->current_runstate;
 
        /*
         * Write the actual runstate times immediately after the
@@ -228,42 +299,114 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
        BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
                     sizeof(vx->runstate_times));
 
-       if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)))
-               goto out;
+       memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
        smp_wmb();
 
        /*
         * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
         * runstate_entry_time field.
         */
-       state_entry_time &= ~XEN_RUNSTATE_UPDATE;
-       __put_user(state_entry_time, user_times);
+       user_times[0] &= ~XEN_RUNSTATE_UPDATE;
        smp_wmb();
 
- out:
-       mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+       read_unlock_irqrestore(&gpc->lock, flags);
 
-       if (atomic)
-               pagefault_enable();
+       mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
 }
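
The write sequence above behaves like a single-writer seqlock: XEN_RUNSTATE_UPDATE is set in state_entry_time, the state and times are stored with smp_wmb() in between, and the flag is cleared last. A guest that wants a consistent snapshot is assumed to retry around that flag; a non-authoritative reader sketch:

	uint64_t t1, t2;
	struct vcpu_runstate_info snap;

	do {
		t1 = READ_ONCE(runstate->state_entry_time);
		smp_rmb();
		snap = *runstate;	/* copy state + times */
		smp_rmb();
		t2 = READ_ONCE(runstate->state_entry_time);
	} while ((t1 & XEN_RUNSTATE_UPDATE) || t1 != t2);
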
 
-int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+{
+       struct kvm_lapic_irq irq = { };
+       int r;
+
+       irq.dest_id = v->vcpu_id;
+       irq.vector = v->arch.xen.upcall_vector;
+       irq.dest_mode = APIC_DEST_PHYSICAL;
+       irq.shorthand = APIC_DEST_NOSHORT;
+       irq.delivery_mode = APIC_DM_FIXED;
+       irq.level = 1;
+
+       /* The fast version will always work for physical unicast */
+       WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL));
+}
+
+/*
+ * On event channel delivery, the vcpu_info may not have been accessible.
+ * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
+ * need to be marked into the vcpu_info (and evtchn_upcall_pending set).
+ * Do so now that we can sleep in the context of the vCPU to bring the
+ * page in, and refresh the pfn cache for it.
+ */
+void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 {
        unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
-       bool atomic = in_atomic() || !task_is_running(current);
-       int err;
+       struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+       unsigned long flags;
+
+       if (!evtchn_pending_sel)
+               return;
+
+       /*
+        * Yes, this is an open-coded loop. But that's just what put_user()
+        * does anyway. Page it in and retry the instruction. We're just a
+        * little more honest about it.
+        */
+       read_lock_irqsave(&gpc->lock, flags);
+       while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+                                          sizeof(struct vcpu_info))) {
+               read_unlock_irqrestore(&gpc->lock, flags);
+
+               if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+                                                sizeof(struct vcpu_info)))
+                       return;
+
+               read_lock_irqsave(&gpc->lock, flags);
+       }
+
+       /* Now gpc->khva is a valid kernel address for the vcpu_info */
+       if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+               struct vcpu_info *vi = gpc->khva;
+
+               asm volatile(LOCK_PREFIX "orq %0, %1\n"
+                            "notq %0\n"
+                            LOCK_PREFIX "andq %0, %2\n"
+                            : "=r" (evtchn_pending_sel),
+                              "+m" (vi->evtchn_pending_sel),
+                              "+m" (v->arch.xen.evtchn_pending_sel)
+                            : "0" (evtchn_pending_sel));
+               WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+       } else {
+               u32 evtchn_pending_sel32 = evtchn_pending_sel;
+               struct compat_vcpu_info *vi = gpc->khva;
+
+               asm volatile(LOCK_PREFIX "orl %0, %1\n"
+                            "notl %0\n"
+                            LOCK_PREFIX "andl %0, %2\n"
+                            : "=r" (evtchn_pending_sel32),
+                              "+m" (vi->evtchn_pending_sel),
+                              "+m" (v->arch.xen.evtchn_pending_sel)
+                            : "0" (evtchn_pending_sel32));
+               WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+       }
+       read_unlock_irqrestore(&gpc->lock, flags);
+
+       /* For the per-vCPU lapic vector, deliver it as MSI. */
+       if (v->arch.xen.upcall_vector)
+               kvm_xen_inject_vcpu_vector(v);
+
+       mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+}
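
Ignoring atomicity, the LOCK or/not/and sequences above transfer the pending-selector bits into the guest's vcpu_info and clear exactly those bits from the in-kernel copy. A C-level illustration of the 64-bit case (the real code needs the locked read-modify-write because the guest may update the same words concurrently):

	unsigned long sel = v->arch.xen.evtchn_pending_sel;

	vi->evtchn_pending_sel |= sel;			/* orq  */
	v->arch.xen.evtchn_pending_sel &= ~sel;		/* notq + andq */
	WRITE_ONCE(vi->evtchn_upcall_pending, 1);
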
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+{
+       struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+       unsigned long flags;
        u8 rc = 0;
 
        /*
         * If the global upcall vector (HVMIRQ_callback_vector) is set and
         * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
         */
-       struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
-       struct kvm_memslots *slots = kvm_memslots(v->kvm);
-       bool ghc_valid = slots->generation == ghc->generation &&
-               !kvm_is_error_hva(ghc->hva) && ghc->memslot;
-
-       unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
 
        /* No need for compat handling here */
        BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
@@ -273,101 +416,35 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
        BUILD_BUG_ON(sizeof(rc) !=
                     sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
 
-       /*
-        * For efficiency, this mirrors the checks for using the valid
-        * cache in kvm_read_guest_offset_cached(), but just uses
-        * __get_user() instead. And falls back to the slow path.
-        */
-       if (!evtchn_pending_sel && ghc_valid) {
-               /* Fast path */
-               pagefault_disable();
-               err = __get_user(rc, (u8 __user *)ghc->hva + offset);
-               pagefault_enable();
-               if (!err)
-                       return rc;
-       }
-
-       /* Slow path */
+       read_lock_irqsave(&gpc->lock, flags);
+       while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+                                          sizeof(struct vcpu_info))) {
+               read_unlock_irqrestore(&gpc->lock, flags);
 
-       /*
-        * This function gets called from kvm_vcpu_block() after setting the
-        * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
-        * from a HLT. So we really mustn't sleep. If the page ended up absent
-        * at that point, just return 1 in order to trigger an immediate wake,
-        * and we'll end up getting called again from a context where we *can*
-        * fault in the page and wait for it.
-        */
-       if (atomic)
-               return 1;
+               /*
+                * This function gets called from kvm_vcpu_block() after setting the
+                * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+                * from a HLT. So we really mustn't sleep. If the page ended up absent
+                * at that point, just return 1 in order to trigger an immediate wake,
+                * and we'll end up getting called again from a context where we *can*
+                * fault in the page and wait for it.
+                */
+               if (in_atomic() || !task_is_running(current))
+                       return 1;
 
-       if (!ghc_valid) {
-               err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
-               if (err || !ghc->memslot) {
+               if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+                                                sizeof(struct vcpu_info))) {
                        /*
                         * If this failed, userspace has screwed up the
                         * vcpu_info mapping. No interrupts for you.
                         */
                        return 0;
                }
+               read_lock_irqsave(&gpc->lock, flags);
        }
 
-       /*
-        * Now we have a valid (protected by srcu) userspace HVA in
-        * ghc->hva which points to the struct vcpu_info. If there
-        * are any bits in the in-kernel evtchn_pending_sel then
-        * we need to write those to the guest vcpu_info and set
-        * its evtchn_upcall_pending flag. If there aren't any bits
-        * to add, we only want to *check* evtchn_upcall_pending.
-        */
-       if (evtchn_pending_sel) {
-               bool long_mode = v->kvm->arch.xen.long_mode;
-
-               if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info)))
-                       return 0;
-
-               if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
-                       struct vcpu_info __user *vi = (void __user *)ghc->hva;
-
-                       /* Attempt to set the evtchn_pending_sel bits in the
-                        * guest, and if that succeeds then clear the same
-                        * bits in the in-kernel version. */
-                       asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
-                                    "\tnotq %0\n"
-                                    "\t" LOCK_PREFIX "andq %0, %2\n"
-                                    "2:\n"
-                                    _ASM_EXTABLE_UA(1b, 2b)
-                                    : "=r" (evtchn_pending_sel),
-                                      "+m" (vi->evtchn_pending_sel),
-                                      "+m" (v->arch.xen.evtchn_pending_sel)
-                                    : "0" (evtchn_pending_sel));
-               } else {
-                       struct compat_vcpu_info __user *vi = (void __user *)ghc->hva;
-                       u32 evtchn_pending_sel32 = evtchn_pending_sel;
-
-                       /* Attempt to set the evtchn_pending_sel bits in the
-                        * guest, and if that succeeds then clear the same
-                        * bits in the in-kernel version. */
-                       asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
-                                    "\tnotl %0\n"
-                                    "\t" LOCK_PREFIX "andl %0, %2\n"
-                                    "2:\n"
-                                    _ASM_EXTABLE_UA(1b, 2b)
-                                    : "=r" (evtchn_pending_sel32),
-                                      "+m" (vi->evtchn_pending_sel),
-                                      "+m" (v->arch.xen.evtchn_pending_sel)
-                                    : "0" (evtchn_pending_sel32));
-               }
-               rc = 1;
-               unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);
-
-       err:
-               user_access_end();
-
-               mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
-       } else {
-               __get_user(rc, (u8 __user *)ghc->hva + offset);
-       }
-
+       rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
+       read_unlock_irqrestore(&gpc->lock, flags);
        return rc;
 }
 
@@ -375,36 +452,51 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 {
        int r = -ENOENT;
 
-       mutex_lock(&kvm->lock);
 
        switch (data->type) {
        case KVM_XEN_ATTR_TYPE_LONG_MODE:
                if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
                        r = -EINVAL;
                } else {
+                       mutex_lock(&kvm->lock);
                        kvm->arch.xen.long_mode = !!data->u.long_mode;
+                       mutex_unlock(&kvm->lock);
                        r = 0;
                }
                break;
 
        case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+               mutex_lock(&kvm->lock);
                r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+               mutex_unlock(&kvm->lock);
                break;
 
        case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
                if (data->u.vector && data->u.vector < 0x10)
                        r = -EINVAL;
                else {
+                       mutex_lock(&kvm->lock);
                        kvm->arch.xen.upcall_vector = data->u.vector;
+                       mutex_unlock(&kvm->lock);
                        r = 0;
                }
                break;
 
+       case KVM_XEN_ATTR_TYPE_EVTCHN:
+               r = kvm_xen_setattr_evtchn(kvm, data);
+               break;
+
+       case KVM_XEN_ATTR_TYPE_XEN_VERSION:
+               mutex_lock(&kvm->lock);
+               kvm->arch.xen.xen_version = data->u.xen_version;
+               mutex_unlock(&kvm->lock);
+               r = 0;
+               break;
+
        default:
                break;
        }
 
-       mutex_unlock(&kvm->lock);
        return r;
 }
 
@@ -433,6 +525,11 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
                r = 0;
                break;
 
+       case KVM_XEN_ATTR_TYPE_XEN_VERSION:
+               data->u.xen_version = kvm->arch.xen.xen_version;
+               r = 0;
+               break;
+
        default:
                break;
        }
@@ -457,48 +554,34 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
                             offsetof(struct compat_vcpu_info, time));
 
                if (data->u.gpa == GPA_INVALID) {
-                       vcpu->arch.xen.vcpu_info_set = false;
+                       kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
                        r = 0;
                        break;
                }
 
-               /* It must fit within a single page */
-               if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) {
-                       r = -EINVAL;
-                       break;
-               }
-
-               r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+               r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
                                              &vcpu->arch.xen.vcpu_info_cache,
-                                             data->u.gpa,
+                                             NULL, KVM_HOST_USES_PFN, data->u.gpa,
                                              sizeof(struct vcpu_info));
-               if (!r) {
-                       vcpu->arch.xen.vcpu_info_set = true;
+               if (!r)
                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-               }
+
                break;
 
        case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
                if (data->u.gpa == GPA_INVALID) {
-                       vcpu->arch.xen.vcpu_time_info_set = false;
+                       kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+                                                    &vcpu->arch.xen.vcpu_time_info_cache);
                        r = 0;
                        break;
                }
 
-               /* It must fit within a single page */
-               if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) {
-                       r = -EINVAL;
-                       break;
-               }
-
-               r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+               r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
                                              &vcpu->arch.xen.vcpu_time_info_cache,
-                                             data->u.gpa,
+                                             NULL, KVM_HOST_USES_PFN, data->u.gpa,
                                              sizeof(struct pvclock_vcpu_time_info));
-               if (!r) {
-                       vcpu->arch.xen.vcpu_time_info_set = true;
+               if (!r)
                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-               }
                break;
 
        case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
@@ -507,24 +590,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
                        break;
                }
                if (data->u.gpa == GPA_INVALID) {
-                       vcpu->arch.xen.runstate_set = false;
+                       kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+                                                    &vcpu->arch.xen.runstate_cache);
                        r = 0;
                        break;
                }
 
-               /* It must fit within a single page */
-               if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) {
-                       r = -EINVAL;
-                       break;
-               }
-
-               r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+               r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
                                              &vcpu->arch.xen.runstate_cache,
-                                             data->u.gpa,
+                                             NULL, KVM_HOST_USES_PFN, data->u.gpa,
                                              sizeof(struct vcpu_runstate_info));
-               if (!r) {
-                       vcpu->arch.xen.runstate_set = true;
-               }
                break;
 
        case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
@@ -622,6 +697,46 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
                r = 0;
                break;
 
+       case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
+               if (data->u.vcpu_id >= KVM_MAX_VCPUS)
+                       r = -EINVAL;
+               else {
+                       vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
+                       r = 0;
+               }
+               break;
+
+       case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+               if (data->u.timer.port) {
+                       if (data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
+                               r = -EINVAL;
+                               break;
+                       }
+                       vcpu->arch.xen.timer_virq = data->u.timer.port;
+                       kvm_xen_init_timer(vcpu);
+
+                       /* Restart the timer if it's set */
+                       if (data->u.timer.expires_ns)
+                               kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
+                                                   data->u.timer.expires_ns -
+                                                   get_kvmclock_ns(vcpu->kvm));
+               } else if (kvm_xen_timer_enabled(vcpu)) {
+                       kvm_xen_stop_timer(vcpu);
+                       vcpu->arch.xen.timer_virq = 0;
+               }
+
+               r = 0;
+               break;
+
+       case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
+               if (data->u.vector && data->u.vector < 0x10)
+                       r = -EINVAL;
+               else {
+                       vcpu->arch.xen.upcall_vector = data->u.vector;
+                       r = 0;
+               }
+               break;
+
        default:
                break;
        }
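
As a usage sketch of the new TIMER attribute (field and constant names follow the uapi added by this series; vcpu_fd, timer_virq and saved_expires_ns are illustrative), userspace restores a vCPU's Xen timer roughly as:

	struct kvm_xen_vcpu_attr att = {
		.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
		.u.timer.port = timer_virq,
		.u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
		.u.timer.expires_ns = saved_expires_ns,	/* 0 = bind virq, don't arm */
	};

	if (ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &att) < 0)
		perror("KVM_XEN_VCPU_SET_ATTR(TIMER)");
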
@@ -639,7 +754,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 
        switch (data->type) {
        case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
-               if (vcpu->arch.xen.vcpu_info_set)
+               if (vcpu->arch.xen.vcpu_info_cache.active)
                        data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
                else
                        data->u.gpa = GPA_INVALID;
@@ -647,7 +762,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
                break;
 
        case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
-               if (vcpu->arch.xen.vcpu_time_info_set)
+               if (vcpu->arch.xen.vcpu_time_info_cache.active)
                        data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
                else
                        data->u.gpa = GPA_INVALID;
@@ -659,7 +774,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
                        r = -EOPNOTSUPP;
                        break;
                }
-               if (vcpu->arch.xen.runstate_set) {
+               if (vcpu->arch.xen.runstate_cache.active) {
                        data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
                        r = 0;
                }
@@ -697,6 +812,23 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
                r = -EINVAL;
                break;
 
+       case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
+               data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
+               r = 0;
+               break;
+
+       case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+               data->u.timer.port = vcpu->arch.xen.timer_virq;
+               data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+               data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
+               r = 0;
+               break;
+
+       case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
+               data->u.vector = vcpu->arch.xen.upcall_vector;
+               r = 0;
+               break;
+
        default:
                break;
        }
@@ -777,7 +909,11 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 
 int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
 {
-       if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
+       /* Only some feature flags need to be *enabled* by userspace */
+       u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
+               KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+
+       if (xhc->flags & ~permitted_flags)
                return -EINVAL;
 
        /*
@@ -802,18 +938,6 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
        return 0;
 }
 
-void kvm_xen_init_vm(struct kvm *kvm)
-{
-}
-
-void kvm_xen_destroy_vm(struct kvm *kvm)
-{
-       kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
-
-       if (kvm->arch.xen_hvm_config.msr)
-               static_branch_slow_dec_deferred(&kvm_xen_enabled);
-}
-
 static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
 {
        kvm_rax_write(vcpu, result);
@@ -830,10 +954,268 @@ static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
        return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
 }
 
+static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
+                              evtchn_port_t *ports)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+       unsigned long *pending_bits;
+       unsigned long flags;
+       bool ret = true;
+       int idx, i;
+
+       read_lock_irqsave(&gpc->lock, flags);
+       idx = srcu_read_lock(&kvm->srcu);
+       if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+               goto out_rcu;
+
+       ret = false;
+       if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+               struct shared_info *shinfo = gpc->khva;
+               pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+       } else {
+               struct compat_shared_info *shinfo = gpc->khva;
+               pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+       }
+
+       for (i = 0; i < nr_ports; i++) {
+               if (test_bit(ports[i], pending_bits)) {
+                       ret = true;
+                       break;
+               }
+       }
+
+ out_rcu:
+       srcu_read_unlock(&kvm->srcu, idx);
+       read_unlock_irqrestore(&gpc->lock, flags);
+
+       return ret;
+}
+
+static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
+                                u64 param, u64 *r)
+{
+       int idx, i;
+       struct sched_poll sched_poll;
+       evtchn_port_t port, *ports;
+       gpa_t gpa;
+
+       if (!longmode || !lapic_in_kernel(vcpu) ||
+           !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
+               return false;
+
+       idx = srcu_read_lock(&vcpu->kvm->srcu);
+       gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
+       srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+       if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll,
+                                       sizeof(sched_poll))) {
+               *r = -EFAULT;
+               return true;
+       }
+
+       if (unlikely(sched_poll.nr_ports > 1)) {
+               /* Xen (unofficially) limits number of pollers to 128 */
+               if (sched_poll.nr_ports > 128) {
+                       *r = -EINVAL;
+                       return true;
+               }
+
+               ports = kmalloc_array(sched_poll.nr_ports,
+                                     sizeof(*ports), GFP_KERNEL);
+               if (!ports) {
+                       *r = -ENOMEM;
+                       return true;
+               }
+       } else
+               ports = &port;
+
+       for (i = 0; i < sched_poll.nr_ports; i++) {
+               idx = srcu_read_lock(&vcpu->kvm->srcu);
+               gpa = kvm_mmu_gva_to_gpa_system(vcpu,
+                                               (gva_t)(sched_poll.ports + i),
+                                               NULL);
+               srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+               if (!gpa || kvm_vcpu_read_guest(vcpu, gpa,
+                                               &ports[i], sizeof(port))) {
+                       *r = -EFAULT;
+                       goto out;
+               }
+       }
+
+       if (sched_poll.nr_ports == 1)
+               vcpu->arch.xen.poll_evtchn = port;
+       else
+               vcpu->arch.xen.poll_evtchn = -1;
+
+       set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+
+       if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
+               vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+
+               if (sched_poll.timeout)
+                       mod_timer(&vcpu->arch.xen.poll_timer,
+                                 jiffies + nsecs_to_jiffies(sched_poll.timeout));
+
+               kvm_vcpu_halt(vcpu);
+
+               if (sched_poll.timeout)
+                       del_timer(&vcpu->arch.xen.poll_timer);
+
+               vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+               kvm_clear_request(KVM_REQ_UNHALT, vcpu);
+       }
+
+       vcpu->arch.xen.poll_evtchn = 0;
+       *r = 0;
+out:
+       /* Really, this is only needed in case of timeout */
+       clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+
+       if (unlikely(sched_poll.nr_ports > 1))
+               kfree(ports);
+       return true;
+}
+
+static void cancel_evtchn_poll(struct timer_list *t)
+{
+       struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);
+
+       kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+       kvm_vcpu_kick(vcpu);
+}
+
+static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
+                                  int cmd, u64 param, u64 *r)
+{
+       switch (cmd) {
+       case SCHEDOP_poll:
+               if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
+                       return true;
+               fallthrough;
+       case SCHEDOP_yield:
+               kvm_vcpu_on_spin(vcpu, true);
+               *r = 0;
+               return true;
+       default:
+               break;
+       }
+
+       return false;
+}
+
+struct compat_vcpu_set_singleshot_timer {
+    uint64_t timeout_abs_ns;
+    uint32_t flags;
+} __attribute__((packed));
+
+static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
+                                 int vcpu_id, u64 param, u64 *r)
+{
+       struct vcpu_set_singleshot_timer oneshot;
+       s64 delta;
+       gpa_t gpa;
+       int idx;
+
+       if (!kvm_xen_timer_enabled(vcpu))
+               return false;
+
+       switch (cmd) {
+       case VCPUOP_set_singleshot_timer:
+               if (vcpu->arch.xen.vcpu_id != vcpu_id) {
+                       *r = -EINVAL;
+                       return true;
+               }
+               idx = srcu_read_lock(&vcpu->kvm->srcu);
+               gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
+               srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+               /*
+                * The only difference for 32-bit compat is the 4 bytes of
+                * padding after the interesting part of the structure. So
+                * for a faithful emulation of Xen we have to *try* to copy
+                * the padding and return -EFAULT if we can't. Otherwise we
+                * might as well just have copied the 12-byte 32-bit struct.
+                */
+               BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
+                            offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
+               BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
+                            sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
+               BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
+                            offsetof(struct vcpu_set_singleshot_timer, flags));
+               BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
+                            sizeof_field(struct vcpu_set_singleshot_timer, flags));
+
+               if (!gpa ||
+                   kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) :
+                                       sizeof(struct compat_vcpu_set_singleshot_timer))) {
+                       *r = -EFAULT;
+                       return true;
+               }
+
+               delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
+               if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) {
+                       *r = -ETIME;
+                       return true;
+               }
+
+               kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
+               *r = 0;
+               return true;
+
+       case VCPUOP_stop_singleshot_timer:
+               if (vcpu->arch.xen.vcpu_id != vcpu_id) {
+                       *r = -EINVAL;
+                       return true;
+               }
+               kvm_xen_stop_timer(vcpu);
+               *r = 0;
+               return true;
+       }
+
+       return false;
+}
+
+static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
+                                      u64 *r)
+{
+       if (!kvm_xen_timer_enabled(vcpu))
+               return false;
+
+       if (timeout) {
+               uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
+               int64_t delta = timeout - guest_now;
+
+               /* Xen has a 'Linux workaround' in do_set_timer_op() which
+                * checks for negative absolute timeout values (caused by
+                * integer overflow), and for values about 13 days in the
+                * future (2^50ns) which would be caused by jiffies
+                * overflow. For those cases, it sets the timeout 100ms in
+                * the future (not *too* soon, since if a guest really did
+                * set a long timeout on purpose we don't want to keep
+                * churning CPU time by waking it up).
+                */
+               if (unlikely((int64_t)timeout < 0 ||
+                            (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
+                       delta = 100 * NSEC_PER_MSEC;
+                       timeout = guest_now + delta;
+               }
+
+               kvm_xen_start_timer(vcpu, timeout, delta);
+       } else {
+               kvm_xen_stop_timer(vcpu);
+       }
+
+       *r = 0;
+       return true;
+}
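
As a quick sanity check of the "about 13 days" figure in the comment above: for a positive delta the workaround fires once any bit at or above bit 50 is set, i.e. delta >= 2^50 ns, and

	2^50 ns = 1125899906842624 ns ≈ 1.126e6 s ≈ 13.03 days

which matches the jiffies-overflow bound being described.
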
+
 int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
 {
        bool longmode;
-       u64 input, params[6];
+       u64 input, params[6], r = -ENOSYS;
+       bool handled = false;
 
        input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
 
@@ -864,6 +1246,40 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
        trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
                                params[3], params[4], params[5]);
 
+       switch (input) {
+       case __HYPERVISOR_xen_version:
+               if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
+                       r = vcpu->kvm->arch.xen.xen_version;
+                       handled = true;
+               }
+               break;
+       case __HYPERVISOR_event_channel_op:
+               if (params[0] == EVTCHNOP_send)
+                       handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r);
+               break;
+       case __HYPERVISOR_sched_op:
+               handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0],
+                                                params[1], &r);
+               break;
+       case __HYPERVISOR_vcpu_op:
+               handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1],
+                                               params[2], &r);
+               break;
+       case __HYPERVISOR_set_timer_op: {
+               u64 timeout = params[0];
+               /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */
+               if (!longmode)
+                       timeout |= params[1] << 32;
+               handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
+               break;
+       }
+       default:
+               break;
+       }
+
+       if (handled)
+               return kvm_xen_hypercall_set_result(vcpu, r);
+
        vcpu->run->exit_reason = KVM_EXIT_XEN;
        vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
        vcpu->run->xen.u.hcall.longmode = longmode;
@@ -890,14 +1306,28 @@ static inline int max_evtchn_port(struct kvm *kvm)
                return COMPAT_EVTCHN_2L_NR_CHANNELS;
 }
 
+static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
+{
+       int poll_evtchn = vcpu->arch.xen.poll_evtchn;
+
+       if ((poll_evtchn == port || poll_evtchn == -1) &&
+           test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) {
+               kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+               kvm_vcpu_kick(vcpu);
+       }
+}
+
 /*
- * This follows the kvm_set_irq() API, so it returns:
+ * The return value from this function is propagated to the kvm_set_irq() API,
+ * so it returns:
  *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
  *  = 0   Interrupt was coalesced (previous irq is still pending)
  *  > 0   Number of CPUs interrupt was delivered to
+ *
+ * It is also called directly from kvm_arch_set_irq_inatomic(), where the
+ * only check on its return value is a comparison with -EWOULDBLOCK'.
  */
-int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
-                           struct kvm *kvm)
+int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 {
        struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
        struct kvm_vcpu *vcpu;
@@ -905,23 +1335,29 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
        unsigned long flags;
        int port_word_bit;
        bool kick_vcpu = false;
-       int idx;
-       int rc;
+       int vcpu_idx, idx, rc;
 
-       vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
-       if (!vcpu)
-               return -1;
+       vcpu_idx = READ_ONCE(xe->vcpu_idx);
+       if (vcpu_idx >= 0)
+               vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+       else {
+               vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
+               if (!vcpu)
+                       return -EINVAL;
+               WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu));
+       }
 
-       if (!vcpu->arch.xen.vcpu_info_set)
-               return -1;
+       if (!vcpu->arch.xen.vcpu_info_cache.active)
+               return -EINVAL;
 
-       if (e->xen_evtchn.port >= max_evtchn_port(kvm))
-               return -1;
+       if (xe->port >= max_evtchn_port(kvm))
+               return -EINVAL;
 
        rc = -EWOULDBLOCK;
-       read_lock_irqsave(&gpc->lock, flags);
 
        idx = srcu_read_lock(&kvm->srcu);
+
+       read_lock_irqsave(&gpc->lock, flags);
        if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
                goto out_rcu;
 
@@ -929,12 +1365,12 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
                struct shared_info *shinfo = gpc->khva;
                pending_bits = (unsigned long *)&shinfo->evtchn_pending;
                mask_bits = (unsigned long *)&shinfo->evtchn_mask;
-               port_word_bit = e->xen_evtchn.port / 64;
+               port_word_bit = xe->port / 64;
        } else {
                struct compat_shared_info *shinfo = gpc->khva;
                pending_bits = (unsigned long *)&shinfo->evtchn_pending;
                mask_bits = (unsigned long *)&shinfo->evtchn_mask;
-               port_word_bit = e->xen_evtchn.port / 32;
+               port_word_bit = xe->port / 32;
        }
 
        /*
@@ -944,39 +1380,68 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
         * already set, then we kick the vCPU in question to write to the
         * *real* evtchn_pending_sel in its own guest vcpu_info struct.
         */
-       if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) {
+       if (test_and_set_bit(xe->port, pending_bits)) {
                rc = 0; /* It was already raised */
-       } else if (test_bit(e->xen_evtchn.port, mask_bits)) {
-               rc = -1; /* Masked */
+       } else if (test_bit(xe->port, mask_bits)) {
+               rc = -ENOTCONN; /* Masked */
+               kvm_xen_check_poller(vcpu, xe->port);
        } else {
-               rc = 1; /* Delivered. But was the vCPU waking already? */
-               if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
-                       kick_vcpu = true;
+               rc = 1; /* Delivered to the bitmap in shared_info. */
+               /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
+               read_unlock_irqrestore(&gpc->lock, flags);
+               gpc = &vcpu->arch.xen.vcpu_info_cache;
+
+               read_lock_irqsave(&gpc->lock, flags);
+               if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) {
+                       /*
+                        * Could not access the vcpu_info. Set the bit in-kernel
+                        * and prod the vCPU to deliver it for itself.
+                        */
+                       if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
+                               kick_vcpu = true;
+                       goto out_rcu;
+               }
+
+               if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+                       struct vcpu_info *vcpu_info = gpc->khva;
+                       if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
+                               WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+                               kick_vcpu = true;
+                       }
+               } else {
+                       struct compat_vcpu_info *vcpu_info = gpc->khva;
+                       if (!test_and_set_bit(port_word_bit,
+                                             (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
+                               WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+                               kick_vcpu = true;
+                       }
+               }
+
+               /* For the per-vCPU lapic vector, deliver it as MSI. */
+               if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
+                       kvm_xen_inject_vcpu_vector(vcpu);
+                       kick_vcpu = false;
+               }
        }
 
  out_rcu:
-       srcu_read_unlock(&kvm->srcu, idx);
        read_unlock_irqrestore(&gpc->lock, flags);
+       srcu_read_unlock(&kvm->srcu, idx);
 
        if (kick_vcpu) {
-               kvm_make_request(KVM_REQ_EVENT, vcpu);
+               kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
                kvm_vcpu_kick(vcpu);
        }
 
        return rc;
 }
 
-/* This is the version called from kvm_set_irq() as the .set function */
-static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
-                        int irq_source_id, int level, bool line_status)
+static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 {
        bool mm_borrowed = false;
        int rc;
 
-       if (!level)
-               return -1;
-
-       rc = kvm_xen_set_evtchn_fast(e, kvm);
+       rc = kvm_xen_set_evtchn_fast(xe, kvm);
        if (rc != -EWOULDBLOCK)
                return rc;
 
@@ -1020,7 +1485,7 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm
                struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
                int idx;
 
-               rc = kvm_xen_set_evtchn_fast(e, kvm);
+               rc = kvm_xen_set_evtchn_fast(xe, kvm);
                if (rc != -EWOULDBLOCK)
                        break;
 
@@ -1037,11 +1502,27 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm
        return rc;
 }
 
+/* This is the version called from kvm_set_irq() as the .set function */
+static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+                        int irq_source_id, int level, bool line_status)
+{
+       if (!level)
+               return -EINVAL;
+
+       return kvm_xen_set_evtchn(&e->xen_evtchn, kvm);
+}
+
+/*
+ * Set up an event channel interrupt from the KVM IRQ routing table.
+ * Used for e.g. PIRQ from passed through physical devices.
+ */
 int kvm_xen_setup_evtchn(struct kvm *kvm,
                         struct kvm_kernel_irq_routing_entry *e,
                         const struct kvm_irq_routing_entry *ue)
 
 {
+       struct kvm_vcpu *vcpu;
+
        if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
                return -EINVAL;
 
@@ -1049,10 +1530,328 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
        if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
                return -EINVAL;
 
+       /*
+        * Xen gives us interesting mappings from vCPU index to APIC ID,
+        * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
+        * to find it. Do that once at setup time, instead of every time.
+        * But beware that on live update / live migration, the routing
+        * table might be reinstated before the vCPU threads have finished
+        * recreating their vCPUs.
+        */
+       vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
+       if (vcpu)
+               e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu);
+       else
+               e->xen_evtchn.vcpu_idx = -1;
+
        e->xen_evtchn.port = ue->u.xen_evtchn.port;
-       e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
+       e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
        e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
        e->set = evtchn_set_fn;
 
        return 0;
 }
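
For context, a minimal userspace sketch (not part of this patch) of installing such a
routing entry with KVM_SET_GSI_ROUTING; vm_fd, the GSI number, the event channel port
and the Xen vCPU id are illustrative assumptions:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int route_gsi_to_xen_evtchn(int vm_fd, unsigned int gsi,
				   unsigned int port, unsigned int xen_vcpu_id)
{
	struct kvm_irq_routing *table;
	int ret;

	/* One routing table containing a single KVM_IRQ_ROUTING_XEN_EVTCHN entry. */
	table = calloc(1, sizeof(*table) + sizeof(struct kvm_irq_routing_entry));
	if (!table)
		return -1;

	table->nr = 1;
	table->entries[0].gsi = gsi;
	table->entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
	table->entries[0].u.xen_evtchn.port = port;
	table->entries[0].u.xen_evtchn.vcpu = xen_vcpu_id;
	table->entries[0].u.xen_evtchn.priority =
		KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;

	ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, table);
	free(table);
	return ret;
}
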
+
+/*
+ * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
+ */
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe)
+{
+       struct kvm_xen_evtchn e;
+       int ret;
+
+       if (!uxe->port || uxe->port >= max_evtchn_port(kvm))
+               return -EINVAL;
+
+       /* We only support 2 level event channels for now */
+       if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+               return -EINVAL;
+
+       e.port = uxe->port;
+       e.vcpu_id = uxe->vcpu;
+       e.vcpu_idx = -1;
+       e.priority = uxe->priority;
+
+       ret = kvm_xen_set_evtchn(&e, kvm);
+
+       /*
+        * None of that 'return 1 if it actually got delivered' nonsense.
+        * We don't care if it was masked (-ENOTCONN) either.
+        */
+       if (ret > 0 || ret == -ENOTCONN)
+               ret = 0;
+
+       return ret;
+}
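
A hedged userspace sketch of the new ioctl (vm_fd, the port and the Xen vCPU id are
illustrative assumptions; the VMM is expected to have enabled
KVM_XEN_HVM_CONFIG_EVTCHN_SEND first):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int xen_evtchn_send(int vm_fd, unsigned int port, unsigned int xen_vcpu_id)
{
	struct kvm_irq_routing_xen_evtchn evt = {
		.port = port,
		.vcpu = xen_vcpu_id,
		.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
	};

	/* Per the code above, this returns 0 even if the target port is masked. */
	return ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &evt);
}
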
+
+/*
+ * Support for *outbound* event channel events via the EVTCHNOP_send hypercall.
+ */
+struct evtchnfd {
+       u32 send_port;
+       u32 type;
+       union {
+               struct kvm_xen_evtchn port;
+               struct {
+                       u32 port; /* zero */
+                       struct eventfd_ctx *ctx;
+               } eventfd;
+       } deliver;
+};
+
+/*
+ * Update target vCPU or priority for a registered sending channel.
+ */
+static int kvm_xen_eventfd_update(struct kvm *kvm,
+                                 struct kvm_xen_hvm_attr *data)
+{
+       u32 port = data->u.evtchn.send_port;
+       struct evtchnfd *evtchnfd;
+
+       if (!port || port >= max_evtchn_port(kvm))
+               return -EINVAL;
+
+       mutex_lock(&kvm->lock);
+       evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
+       mutex_unlock(&kvm->lock);
+
+       if (!evtchnfd)
+               return -ENOENT;
+
+       /* For an UPDATE, nothing may change except the priority/vcpu */
+       if (evtchnfd->type != data->u.evtchn.type)
+               return -EINVAL;
+
+       /*
+        * Port cannot change, and if it's zero that was an eventfd
+        * which can't be changed either.
+        */
+       if (!evtchnfd->deliver.port.port ||
+           evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
+               return -EINVAL;
+
+       /* We only support 2 level event channels for now */
+       if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+               return -EINVAL;
+
+       mutex_lock(&kvm->lock);
+       evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
+       if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
+               evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
+               evtchnfd->deliver.port.vcpu_idx = -1;
+       }
+       mutex_unlock(&kvm->lock);
+       return 0;
+}
+
+/*
+ * Configure the target (eventfd or local port delivery) for sending on
+ * a given event channel.
+ */
+static int kvm_xen_eventfd_assign(struct kvm *kvm,
+                                 struct kvm_xen_hvm_attr *data)
+{
+       u32 port = data->u.evtchn.send_port;
+       struct eventfd_ctx *eventfd = NULL;
+       struct evtchnfd *evtchnfd = NULL;
+       int ret = -EINVAL;
+
+       if (!port || port >= max_evtchn_port(kvm))
+               return -EINVAL;
+
+       evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
+       if (!evtchnfd)
+               return -ENOMEM;
+
+       switch(data->u.evtchn.type) {
+       case EVTCHNSTAT_ipi:
+               /* IPI must map back to the same port# */
+               if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
+                       goto out; /* -EINVAL */
+               break;
+
+       case EVTCHNSTAT_interdomain:
+               if (data->u.evtchn.deliver.port.port) {
+                       if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
+                               goto out; /* -EINVAL */
+               } else {
+                       eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
+                       if (IS_ERR(eventfd)) {
+                               ret = PTR_ERR(eventfd);
+                               goto out;
+                       }
+               }
+               break;
+
+       case EVTCHNSTAT_virq:
+       case EVTCHNSTAT_closed:
+       case EVTCHNSTAT_unbound:
+       case EVTCHNSTAT_pirq:
+       default: /* Unknown event channel type */
+               goto out; /* -EINVAL */
+       }
+
+       evtchnfd->send_port = data->u.evtchn.send_port;
+       evtchnfd->type = data->u.evtchn.type;
+       if (eventfd) {
+               evtchnfd->deliver.eventfd.ctx = eventfd;
+       } else {
+               /* We only support 2 level event channels for now */
+               if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+                       goto out; /* -EINVAL */
+
+               evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
+               evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
+               evtchnfd->deliver.port.vcpu_idx = -1;
+               evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
+       }
+
+       mutex_lock(&kvm->lock);
+       ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
+                       GFP_KERNEL);
+       mutex_unlock(&kvm->lock);
+       if (ret >= 0)
+               return 0;
+
+       if (ret == -ENOSPC)
+               ret = -EEXIST;
+out:
+       if (eventfd)
+               eventfd_ctx_put(eventfd);
+       kfree(evtchnfd);
+       return ret;
+}
+
+static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
+{
+       struct evtchnfd *evtchnfd;
+
+       mutex_lock(&kvm->lock);
+       evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
+       mutex_unlock(&kvm->lock);
+
+       if (!evtchnfd)
+               return -ENOENT;
+
+       if (kvm)
+               synchronize_srcu(&kvm->srcu);
+       if (!evtchnfd->deliver.port.port)
+               eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+       kfree(evtchnfd);
+       return 0;
+}
+
+static int kvm_xen_eventfd_reset(struct kvm *kvm)
+{
+       struct evtchnfd *evtchnfd;
+       int i;
+
+       mutex_lock(&kvm->lock);
+       idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+               idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
+               synchronize_srcu(&kvm->srcu);
+               if (!evtchnfd->deliver.port.port)
+                       eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+               kfree(evtchnfd);
+       }
+       mutex_unlock(&kvm->lock);
+
+       return 0;
+}
+
+static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+       u32 port = data->u.evtchn.send_port;
+
+       if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
+               return kvm_xen_eventfd_reset(kvm);
+
+       if (!port || port >= max_evtchn_port(kvm))
+               return -EINVAL;
+
+       if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
+               return kvm_xen_eventfd_deassign(kvm, port);
+       if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
+               return kvm_xen_eventfd_update(kvm, data);
+       if (data->u.evtchn.flags)
+               return -EINVAL;
+
+       return kvm_xen_eventfd_assign(kvm, data);
+}
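
A hedged sketch of the userspace side of this attribute, registering an interdomain
channel whose guest sends are signalled on an eventfd (vm_fd and efd are illustrative
assumptions; the EVTCHNSTAT_interdomain value is an assumption taken from Xen's public
event_channel.h):

#include <sys/ioctl.h>
#include <linux/kvm.h>

#define EVTCHNSTAT_interdomain	2	/* assumption: from Xen's event_channel.h */

static int evtchn_send_to_eventfd(int vm_fd, unsigned int send_port, int efd)
{
	struct kvm_xen_hvm_attr attr = {
		.type = KVM_XEN_ATTR_TYPE_EVTCHN,
		.u.evtchn.send_port = send_port,
		.u.evtchn.type = EVTCHNSTAT_interdomain,
		.u.evtchn.flags = 0,			/* 0 == assign */
		.u.evtchn.deliver.eventfd.port = 0,	/* zero selects eventfd delivery */
		.u.evtchn.deliver.eventfd.fd = efd,
	};

	return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr);
}
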
+
+static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
+{
+       struct evtchnfd *evtchnfd;
+       struct evtchn_send send;
+       gpa_t gpa;
+       int idx;
+
+       idx = srcu_read_lock(&vcpu->kvm->srcu);
+       gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
+       srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+       if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) {
+               *r = -EFAULT;
+               return true;
+       }
+
+       /* The evtchn_ports idr is protected by vcpu->kvm->srcu */
+       evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
+       if (!evtchnfd)
+               return false;
+
+       if (evtchnfd->deliver.port.port) {
+               int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm);
+               if (ret < 0 && ret != -ENOTCONN)
+                       return false;
+       } else {
+               eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1);
+       }
+
+       *r = 0;
+       return true;
+}
+
+void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
+       vcpu->arch.xen.poll_evtchn = 0;
+       timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
+}
+
+void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+       if (kvm_xen_timer_enabled(vcpu))
+               kvm_xen_stop_timer(vcpu);
+
+       kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+                                    &vcpu->arch.xen.runstate_cache);
+       kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+                                    &vcpu->arch.xen.vcpu_info_cache);
+       kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+                                    &vcpu->arch.xen.vcpu_time_info_cache);
+       del_timer_sync(&vcpu->arch.xen.poll_timer);
+}
+
+void kvm_xen_init_vm(struct kvm *kvm)
+{
+       idr_init(&kvm->arch.xen.evtchn_ports);
+}
+
+void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+       struct evtchnfd *evtchnfd;
+       int i;
+
+       kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
+
+       idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+               if (!evtchnfd->deliver.port.port)
+                       eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+               kfree(evtchnfd);
+       }
+       idr_destroy(&kvm->arch.xen.evtchn_ports);
+
+       if (kvm->arch.xen_hvm_config.msr)
+               static_branch_slow_dec_deferred(&kvm_xen_enabled);
+}
index adbcc9e..ee5c4ae 100644 (file)
 extern struct static_key_false_deferred kvm_xen_enabled;
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
 int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
 int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
 int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
 int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *evt);
 int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
 int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
 void kvm_xen_init_vm(struct kvm *kvm);
 void kvm_xen_destroy_vm(struct kvm *kvm);
-
-int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
+void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu);
+void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu);
+int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe,
                            struct kvm *kvm);
 int kvm_xen_setup_evtchn(struct kvm *kvm,
                         struct kvm_kernel_irq_routing_entry *e,
@@ -46,11 +49,33 @@ static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
 static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
 {
        if (static_branch_unlikely(&kvm_xen_enabled.key) &&
-           vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector)
+           vcpu->arch.xen.vcpu_info_cache.active &&
+           vcpu->kvm->arch.xen.upcall_vector)
                return __kvm_xen_has_interrupt(vcpu);
 
        return 0;
 }
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+       return static_branch_unlikely(&kvm_xen_enabled.key) &&
+               vcpu->arch.xen.evtchn_pending_sel;
+}
+
+static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
+{
+       return !!vcpu->arch.xen.timer_virq;
+}
+
+static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+       if (kvm_xen_hypercall_enabled(vcpu->kvm) && kvm_xen_timer_enabled(vcpu))
+               return atomic_read(&vcpu->arch.xen.timer_pending);
+
+       return 0;
+}
+
+void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu);
 #else
 static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 {
@@ -65,6 +90,14 @@ static inline void kvm_xen_destroy_vm(struct kvm *kvm)
 {
 }
 
+static inline void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
 static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
 {
        return false;
@@ -79,6 +112,29 @@ static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
 {
        return 0;
 }
+
+static inline void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+       return false;
+}
+
+static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
+static inline void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
+{
+       return false;
+}
 #endif
 
 int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
index 6cc4b19..57f41ef 100644 (file)
@@ -100,6 +100,17 @@ config SCLP_OFB
          This option enables the Open-for-Business interface to the s390
          Service Element.
 
+config S390_UV_UAPI
+       def_tristate m
+       prompt "Ultravisor userspace API"
+       depends on S390
+       help
+         Selecting this option exposes parts of the UV interface to
+         userspace by providing a misc character device at /dev/uv.
+         Using IOCTLs, userspace can interact with the UV.
+         The device is only available if the Ultravisor
+         Facility (158) is present.
+
 config S390_TAPE
        def_tristate m
        prompt "S/390 tape device support"
index c6fdb81..ce32270 100644 (file)
@@ -48,6 +48,7 @@ obj-$(CONFIG_MONREADER) += monreader.o
 obj-$(CONFIG_MONWRITER) += monwriter.o
 obj-$(CONFIG_S390_VMUR) += vmur.o
 obj-$(CONFIG_CRASH_DUMP) += sclp_sdias.o zcore.o
+obj-$(CONFIG_S390_UV_UAPI) += uvdevice.o
 
 hmcdrv-objs := hmcdrv_mod.o hmcdrv_dev.o hmcdrv_ftp.o hmcdrv_cache.o diag_ftp.o sclp_ftp.o
 obj-$(CONFIG_HMC_DRV) += hmcdrv.o
diff --git a/drivers/s390/char/uvdevice.c b/drivers/s390/char/uvdevice.c
new file mode 100644 (file)
index 0000000..66505d7
--- /dev/null
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  Copyright IBM Corp. 2022
+ *  Author(s): Steffen Eiden <seiden@linux.ibm.com>
+ *
+ *  This file provides a Linux misc device to give userspace access to some
+ *  Ultravisor (UV) functions. The device only accepts IOCTLs and will only
+ *  be present if the Ultravisor facility (158) is present.
+ *
+ *  When userspace sends a valid IOCTL, uvdevice copies the input data to
+ *  kernel space and performs some basic validity checks to avoid kernel/system
+ *  corruption. Checks that the Ultravisor performs itself are not duplicated
+ *  by the uvdevice, to keep changes minimal when new functionality is added
+ *  to existing UV calls.
+ *  After the checks, uvdevice builds the corresponding Ultravisor Call
+ *  Control Block and sends the request to the Ultravisor.
+ *  It then copies the response, including the return codes, back to userspace.
+ *  It is the responsibility of userspace to check for any error reported
+ *  by the UV and to interpret the UV response. The uvdevice acts as a
+ *  communication channel between userspace and the Ultravisor.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/uvdevice.h>
+#include <asm/uv.h>
+
+static int uvio_build_uvcb_attest(struct uv_cb_attest *uvcb_attest, u8 *arcb,
+                                 u8 *meas, u8 *add_data, struct uvio_attest *uvio_attest)
+{
+       void __user *user_buf_arcb = (void __user *)uvio_attest->arcb_addr;
+
+       if (copy_from_user(arcb, user_buf_arcb, uvio_attest->arcb_len))
+               return -EFAULT;
+
+       uvcb_attest->header.len = sizeof(*uvcb_attest);
+       uvcb_attest->header.cmd = UVC_CMD_RETR_ATTEST;
+       uvcb_attest->arcb_addr = (u64)arcb;
+       uvcb_attest->cont_token = 0;
+       uvcb_attest->user_data_len = uvio_attest->user_data_len;
+       memcpy(uvcb_attest->user_data, uvio_attest->user_data, sizeof(uvcb_attest->user_data));
+       uvcb_attest->meas_len = uvio_attest->meas_len;
+       uvcb_attest->meas_addr = (u64)meas;
+       uvcb_attest->add_data_len = uvio_attest->add_data_len;
+       uvcb_attest->add_data_addr = (u64)add_data;
+
+       return 0;
+}
+
+static int uvio_copy_attest_result_to_user(struct uv_cb_attest *uvcb_attest,
+                                          struct uvio_ioctl_cb *uv_ioctl,
+                                          u8 *measurement, u8 *add_data,
+                                          struct uvio_attest *uvio_attest)
+{
+       struct uvio_attest __user *user_uvio_attest = (void __user *)uv_ioctl->argument_addr;
+       void __user *user_buf_add = (void __user *)uvio_attest->add_data_addr;
+       void __user *user_buf_meas = (void __user *)uvio_attest->meas_addr;
+       void __user *user_buf_uid = &user_uvio_attest->config_uid;
+
+       if (copy_to_user(user_buf_meas, measurement, uvio_attest->meas_len))
+               return -EFAULT;
+       if (add_data && copy_to_user(user_buf_add, add_data, uvio_attest->add_data_len))
+               return -EFAULT;
+       if (copy_to_user(user_buf_uid, uvcb_attest->config_uid, sizeof(uvcb_attest->config_uid)))
+               return -EFAULT;
+       return 0;
+}
+
+static int get_uvio_attest(struct uvio_ioctl_cb *uv_ioctl, struct uvio_attest *uvio_attest)
+{
+       u8 __user *user_arg_buf = (u8 __user *)uv_ioctl->argument_addr;
+
+       if (copy_from_user(uvio_attest, user_arg_buf, sizeof(*uvio_attest)))
+               return -EFAULT;
+
+       if (uvio_attest->arcb_len > UVIO_ATT_ARCB_MAX_LEN)
+               return -EINVAL;
+       if (uvio_attest->arcb_len == 0)
+               return -EINVAL;
+       if (uvio_attest->meas_len > UVIO_ATT_MEASUREMENT_MAX_LEN)
+               return -EINVAL;
+       if (uvio_attest->meas_len == 0)
+               return -EINVAL;
+       if (uvio_attest->add_data_len > UVIO_ATT_ADDITIONAL_MAX_LEN)
+               return -EINVAL;
+       if (uvio_attest->reserved136)
+               return -EINVAL;
+       return 0;
+}
+
+/**
+ * uvio_attestation() - Perform a Retrieve Attestation Measurement UVC.
+ *
+ * @uv_ioctl: ioctl control block
+ *
+ * uvio_attestation() does a Retrieve Attestation Measurement Ultravisor Call.
+ * It verifies that the given userspace addresses are valid and request sizes
+ * are sane. Every other check is made by the Ultravisor (UV) and won't result
+ * in a negative return value. It copies the input to kernelspace, builds the
+ * request, sends the UV-call, and copies the result to userspace.
+ *
+ * The Attestation Request has two inputs and two outputs.
+ * ARCB and User Data are inputs to the UV, generated by userspace.
+ * Measurement and Additional Data are outputs to userspace, generated by the UV.
+ *
+ * The Attestation Request Control Block (ARCB) is a cryptographically verified
+ * and secured request to UV and User Data is some plaintext data which is
+ * going to be included in the Attestation Measurement calculation.
+ *
+ * Measurement is a cryptographic measurement of the caller's properties,
+ * optional data configured by the ARCB, and the user data. If specified by the
+ * ARCB, UV will add some Additional Data to the measurement calculation.
+ * This Additional Data is then returned as well.
+ *
+ * If the Retrieve Attestation Measurement UV facility is not present,
+ * the UV will return an "invalid command" rc. This is not fenced in the
+ * driver and does not result in a negative return value.
+ *
+ * Context: might sleep
+ *
+ * Return: 0 on success or a negative error code on error.
+ */
+static int uvio_attestation(struct uvio_ioctl_cb *uv_ioctl)
+{
+       struct uv_cb_attest *uvcb_attest = NULL;
+       struct uvio_attest *uvio_attest = NULL;
+       u8 *measurement = NULL;
+       u8 *add_data = NULL;
+       u8 *arcb = NULL;
+       int ret;
+
+       ret = -EINVAL;
+       if (uv_ioctl->argument_len != sizeof(*uvio_attest))
+               goto out;
+
+       ret = -ENOMEM;
+       uvio_attest = kzalloc(sizeof(*uvio_attest), GFP_KERNEL);
+       if (!uvio_attest)
+               goto out;
+
+       ret = get_uvio_attest(uv_ioctl, uvio_attest);
+       if (ret)
+               goto out;
+
+       ret = -ENOMEM;
+       arcb = kvzalloc(uvio_attest->arcb_len, GFP_KERNEL);
+       measurement = kvzalloc(uvio_attest->meas_len, GFP_KERNEL);
+       if (!arcb || !measurement)
+               goto out;
+
+       if (uvio_attest->add_data_len) {
+               add_data = kvzalloc(uvio_attest->add_data_len, GFP_KERNEL);
+               if (!add_data)
+                       goto out;
+       }
+
+       uvcb_attest = kzalloc(sizeof(*uvcb_attest), GFP_KERNEL);
+       if (!uvcb_attest)
+               goto out;
+
+       ret = uvio_build_uvcb_attest(uvcb_attest, arcb,  measurement, add_data, uvio_attest);
+       if (ret)
+               goto out;
+
+       uv_call_sched(0, (u64)uvcb_attest);
+
+       uv_ioctl->uv_rc = uvcb_attest->header.rc;
+       uv_ioctl->uv_rrc = uvcb_attest->header.rrc;
+
+       ret = uvio_copy_attest_result_to_user(uvcb_attest, uv_ioctl, measurement, add_data,
+                                             uvio_attest);
+out:
+       kvfree(arcb);
+       kvfree(measurement);
+       kvfree(add_data);
+       kfree(uvio_attest);
+       kfree(uvcb_attest);
+       return ret;
+}
+
+static int uvio_copy_and_check_ioctl(struct uvio_ioctl_cb *ioctl, void __user *argp)
+{
+       if (copy_from_user(ioctl, argp, sizeof(*ioctl)))
+               return -EFAULT;
+       if (ioctl->flags != 0)
+               return -EINVAL;
+       if (memchr_inv(ioctl->reserved14, 0, sizeof(ioctl->reserved14)))
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * IOCTL entry point for the Ultravisor device.
+ */
+static long uvio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+       void __user *argp = (void __user *)arg;
+       struct uvio_ioctl_cb uv_ioctl = { };
+       long ret;
+
+       switch (cmd) {
+       case UVIO_IOCTL_ATT:
+               ret = uvio_copy_and_check_ioctl(&uv_ioctl, argp);
+               if (ret)
+                       return ret;
+               ret = uvio_attestation(&uv_ioctl);
+               break;
+       default:
+               ret = -ENOIOCTLCMD;
+               break;
+       }
+       if (ret)
+               return ret;
+
+       if (copy_to_user(argp, &uv_ioctl, sizeof(uv_ioctl)))
+               ret = -EFAULT;
+
+       return ret;
+}
+
+static const struct file_operations uvio_dev_fops = {
+       .owner = THIS_MODULE,
+       .unlocked_ioctl = uvio_ioctl,
+       .llseek = no_llseek,
+};
+
+static struct miscdevice uvio_dev_miscdev = {
+       .minor = MISC_DYNAMIC_MINOR,
+       .name = UVIO_DEVICE_NAME,
+       .fops = &uvio_dev_fops,
+};
+
+static void __exit uvio_dev_exit(void)
+{
+       misc_deregister(&uvio_dev_miscdev);
+}
+
+static int __init uvio_dev_init(void)
+{
+       if (!test_facility(158))
+               return -ENXIO;
+       return misc_register(&uvio_dev_miscdev);
+}
+
+module_init(uvio_dev_init);
+module_exit(uvio_dev_exit);
+
+MODULE_AUTHOR("IBM Corporation");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Ultravisor UAPI driver");
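
A hedged sketch of the corresponding userspace caller (buffer sizes, the ARCB contents
and the error handling are illustrative assumptions; the struct layouts come from the
<asm/uvdevice.h> header added together with this driver):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <asm/uvdevice.h>

static int retrieve_attestation(const void *arcb, __u32 arcb_len,
				void *meas, __u32 meas_len)
{
	struct uvio_attest attest = {
		.arcb_addr = (__u64)arcb,
		.arcb_len = arcb_len,
		.meas_addr = (__u64)meas,
		.meas_len = meas_len,
	};
	struct uvio_ioctl_cb cb = {
		.argument_addr = (__u64)&attest,
		.argument_len = sizeof(attest),
	};
	int fd, rc;

	fd = open("/dev/uv", O_RDONLY);
	if (fd < 0)
		return -1;
	rc = ioctl(fd, UVIO_IOCTL_ATT, &cb);
	close(fd);
	if (rc)
		return rc;	/* kernel-side error (EFAULT, EINVAL, ...) */

	/* Interpreting the Ultravisor rc/rrc is left to the caller, as noted above. */
	return cb.uv_rc;
}
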
index 51c1938..cd6d8f2 100644 (file)
@@ -76,8 +76,6 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
 int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
 int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
 
-bool kvm_timer_is_pending(struct kvm_vcpu *vcpu);
-
 u64 kvm_phys_timer_read(void);
 
 void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu);
index 0e2509d..1188f11 100644 (file)
@@ -40,4 +40,12 @@ static inline void smccc_set_retval(struct kvm_vcpu *vcpu,
        vcpu_set_reg(vcpu, 3, a3);
 }
 
+struct kvm_one_reg;
+
+void kvm_arm_init_hypercalls(struct kvm *kvm);
+int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu);
+int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);
+int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
+int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
+
 #endif
index 2019341..c0b868c 100644 (file)
@@ -20,13 +20,19 @@ struct kvm_pmc {
        struct perf_event *perf_event;
 };
 
+struct kvm_pmu_events {
+       u32 events_host;
+       u32 events_guest;
+};
+
 struct kvm_pmu {
-       int irq_num;
+       struct irq_work overflow_work;
+       struct kvm_pmu_events events;
        struct kvm_pmc pmc[ARMV8_PMU_MAX_COUNTERS];
        DECLARE_BITMAP(chained, ARMV8_PMU_MAX_COUNTER_PAIRS);
+       int irq_num;
        bool created;
        bool irq_level;
-       struct irq_work overflow_work;
 };
 
 struct arm_pmu_entry {
@@ -66,6 +72,25 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu,
                            struct kvm_device_attr *attr);
 int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu);
+
+struct kvm_pmu_events *kvm_get_pmu_events(void);
+void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu);
+void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu);
+
+#define kvm_vcpu_has_pmu(vcpu)                                 \
+       (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
+
+/*
+ * Updates the vcpu's view of the pmu events for this cpu.
+ * Must be called before every vcpu run after disabling interrupts, to ensure
+ * that an interrupt cannot fire and update the structure.
+ */
+#define kvm_pmu_update_vcpu_events(vcpu)                               \
+       do {                                                            \
+               if (!has_vhe() && kvm_vcpu_has_pmu(vcpu))               \
+                       vcpu->arch.pmu.events = *kvm_get_pmu_events();  \
+       } while (0)
+
 #else
 struct kvm_pmu {
 };
@@ -127,6 +152,11 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
        return 0;
 }
 
+#define kvm_vcpu_has_pmu(vcpu)         ({ false; })
+static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {}
+static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {}
+static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {}
+
 #endif
 
 #endif
index 68b96c3..6e55b92 100644 (file)
@@ -39,11 +39,4 @@ static inline int kvm_psci_version(struct kvm_vcpu *vcpu)
 
 int kvm_psci_call(struct kvm_vcpu *vcpu);
 
-struct kvm_one_reg;
-
-int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu);
-int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);
-int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
-int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
-
 #endif /* __KVM_ARM_PSCI_H__ */
index bb30a68..2d8f2e9 100644 (file)
@@ -231,6 +231,9 @@ struct vgic_dist {
 
        /* Implementation revision as reported in the GICD_IIDR */
        u32                     implementation_rev;
+#define KVM_VGIC_IMP_REV_2     2 /* GICv2 restorable groups */
+#define KVM_VGIC_IMP_REV_3     3 /* GICv3 GICR_CTLR.{IW,CES,RWP} */
+#define KVM_VGIC_IMP_REV_LATEST        KVM_VGIC_IMP_REV_3
 
        /* Userspace can write to GICv2 IGROUPR */
        bool                    v2_groups_user_writable;
@@ -344,11 +347,12 @@ struct vgic_cpu {
        struct vgic_io_device   rd_iodev;
        struct vgic_redist_region *rdreg;
        u32 rdreg_index;
+       atomic_t syncr_busy;
 
        /* Contains the attributes and gpa of the LPI pending tables. */
        u64 pendbaser;
-
-       bool lpis_enabled;
+       /* GICR_CTLR.{ENABLE_LPIS,RWP} */
+       atomic_t ctlr;
 
        /* Cache guest priority bits */
        u32 num_pri_bits;
index 4640393..c20f2d5 100644 (file)
@@ -614,7 +614,8 @@ struct kvm_hv_sint {
 
 struct kvm_xen_evtchn {
        u32 port;
-       u32 vcpu;
+       u32 vcpu_id;
+       int vcpu_idx;
        u32 priority;
 };
 
@@ -727,6 +728,7 @@ struct kvm {
         * and is accessed atomically.
         */
        atomic_t online_vcpus;
+       int max_vcpus;
        int created_vcpus;
        int last_boosted_vcpu;
        struct list_head vm_list;
index 6a184d2..5088bd9 100644 (file)
@@ -444,6 +444,9 @@ struct kvm_run {
 #define KVM_SYSTEM_EVENT_SHUTDOWN       1
 #define KVM_SYSTEM_EVENT_RESET          2
 #define KVM_SYSTEM_EVENT_CRASH          3
+#define KVM_SYSTEM_EVENT_WAKEUP         4
+#define KVM_SYSTEM_EVENT_SUSPEND        5
+#define KVM_SYSTEM_EVENT_SEV_TERM       6
                        __u32 type;
                        __u32 ndata;
                        union {
@@ -646,6 +649,7 @@ struct kvm_vapic_addr {
 #define KVM_MP_STATE_OPERATING         7
 #define KVM_MP_STATE_LOAD              8
 #define KVM_MP_STATE_AP_RESET_HOLD     9
+#define KVM_MP_STATE_SUSPENDED         10
 
 struct kvm_mp_state {
        __u32 mp_state;
@@ -1150,8 +1154,9 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_MEM_OP_EXTENSION 211
 #define KVM_CAP_PMU_CAPABILITY 212
 #define KVM_CAP_DISABLE_QUIRKS2 213
-/* #define KVM_CAP_VM_TSC_CONTROL 214 */
+#define KVM_CAP_VM_TSC_CONTROL 214
 #define KVM_CAP_SYSTEM_EVENT_DATA 215
+#define KVM_CAP_ARM_SYSTEM_SUSPEND 216
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1240,6 +1245,7 @@ struct kvm_x86_mce {
 #define KVM_XEN_HVM_CONFIG_SHARED_INFO         (1 << 2)
 #define KVM_XEN_HVM_CONFIG_RUNSTATE            (1 << 3)
 #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL       (1 << 4)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND         (1 << 5)
 
 struct kvm_xen_hvm_config {
        __u32 flags;
@@ -1478,7 +1484,8 @@ struct kvm_s390_ucas_mapping {
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
 /* Available with KVM_CAP_PPC_GET_PVINFO */
 #define KVM_PPC_GET_PVINFO       _IOW(KVMIO,  0xa1, struct kvm_ppc_pvinfo)
-/* Available with KVM_CAP_TSC_CONTROL */
+/* Available with KVM_CAP_TSC_CONTROL for a vCPU, or with
+ * KVM_CAP_VM_TSC_CONTROL to set defaults for a VM */
 #define KVM_SET_TSC_KHZ           _IO(KVMIO,  0xa2)
 #define KVM_GET_TSC_KHZ           _IO(KVMIO,  0xa3)
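
A hedged sketch of using the VM-wide variant (kvm_fd/vm_fd are illustrative
assumptions; the per-vCPU form is unchanged):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_vm_default_tsc_khz(int kvm_fd, int vm_fd, unsigned long khz)
{
	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TSC_CONTROL) <= 0)
		return -1;	/* VM-scoped TSC control not supported */

	/* Sets the default TSC frequency inherited by subsequently created vCPUs. */
	return ioctl(vm_fd, KVM_SET_TSC_KHZ, khz);
}
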
 /* Available with KVM_CAP_PCI_2_3 */
@@ -1694,6 +1701,32 @@ struct kvm_xen_hvm_attr {
                struct {
                        __u64 gfn;
                } shared_info;
+               struct {
+                       __u32 send_port;
+                       __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
+                       __u32 flags;
+#define KVM_XEN_EVTCHN_DEASSIGN                (1 << 0)
+#define KVM_XEN_EVTCHN_UPDATE          (1 << 1)
+#define KVM_XEN_EVTCHN_RESET           (1 << 2)
+                       /*
+                        * Events sent by the guest are either looped back to
+                        * the guest itself (potentially on a different port#)
+                        * or signalled via an eventfd.
+                        */
+                       union {
+                               struct {
+                                       __u32 port;
+                                       __u32 vcpu;
+                                       __u32 priority;
+                               } port;
+                               struct {
+                                       __u32 port; /* Zero for eventfd */
+                                       __s32 fd;
+                               } eventfd;
+                               __u32 padding[4];
+                       } deliver;
+               } evtchn;
+               __u32 xen_version;
                __u64 pad[8];
        } u;
 };
@@ -1702,11 +1735,17 @@ struct kvm_xen_hvm_attr {
 #define KVM_XEN_ATTR_TYPE_LONG_MODE            0x0
 #define KVM_XEN_ATTR_TYPE_SHARED_INFO          0x1
 #define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR                0x2
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_ATTR_TYPE_EVTCHN               0x3
+#define KVM_XEN_ATTR_TYPE_XEN_VERSION          0x4
 
 /* Per-vCPU Xen attributes */
 #define KVM_XEN_VCPU_GET_ATTR  _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr)
 #define KVM_XEN_VCPU_SET_ATTR  _IOW(KVMIO,  0xcb, struct kvm_xen_vcpu_attr)
 
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_HVM_EVTCHN_SEND        _IOW(KVMIO,  0xd0, struct kvm_irq_routing_xen_evtchn)
+
 #define KVM_GET_SREGS2             _IOR(KVMIO,  0xcc, struct kvm_sregs2)
 #define KVM_SET_SREGS2             _IOW(KVMIO,  0xcd, struct kvm_sregs2)
 
@@ -1724,6 +1763,13 @@ struct kvm_xen_vcpu_attr {
                        __u64 time_blocked;
                        __u64 time_offline;
                } runstate;
+               __u32 vcpu_id;
+               struct {
+                       __u32 port;
+                       __u32 priority;
+                       __u64 expires_ns;
+               } timer;
+               __u8 vector;
        } u;
 };
 
@@ -1734,6 +1780,10 @@ struct kvm_xen_vcpu_attr {
 #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT        0x3
 #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA   0x4
 #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID         0x6
+#define KVM_XEN_VCPU_ATTR_TYPE_TIMER           0x7
+#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR   0x8
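
A hedged sketch of restoring a guest's Xen timer with the new attribute (vcpu_fd, the
timer VIRQ port and the expiry are illustrative assumptions; requires
KVM_XEN_HVM_CONFIG_EVTCHN_SEND):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int xen_vcpu_set_timer(int vcpu_fd, unsigned int timer_port, __u64 expires_ns)
{
	struct kvm_xen_vcpu_attr attr = {
		.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
		.u.timer.port = timer_port,
		.u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
		.u.timer.expires_ns = expires_ns,
	};

	return ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &attr);
}
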
 
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
index 9300cb6..526cde2 100644 (file)
@@ -77,6 +77,11 @@ config CC_HAS_ASM_GOTO_OUTPUT
        depends on CC_HAS_ASM_GOTO
        def_bool $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null)
 
+config CC_HAS_ASM_GOTO_TIED_OUTPUT
+       depends on CC_HAS_ASM_GOTO_OUTPUT
+       # Detect buggy gcc and clang, fixed in gcc-11 and clang-14.
+       def_bool $(success,echo 'int foo(int *x) { asm goto (".long (%l[bar]) - .\n": "+m"(*x) ::: bar); return *x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null)
+
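
For illustration, a hedged standalone sketch of the construct being probed (an asm goto
with a tied "+m" output operand), which older gcc (< 11) and clang (< 14) miscompile;
hence the separate Kconfig symbol above:

static int asm_goto_tied_output(int *x)
{
	asm goto("" : "+m"(*x) : : : bar);
	return *x;
bar:
	return 0;
}
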
 config TOOLS_SUPPORT_RELR
        def_bool $(success,env "CC=$(CC)" "LD=$(LD)" "NM=$(NM)" "OBJCOPY=$(OBJCOPY)" $(srctree)/scripts/tools-support-relr.sh)
 
index 8caabdd..e6906f7 100644 (file)
@@ -111,7 +111,8 @@ static bool is_ignored_symbol(const char *name, char type)
                ".L",                   /* local labels, .LBB,.Ltmpxxx,.L__unnamed_xx,.LASANPC, etc. */
                "__crc_",               /* modversions */
                "__efistub_",           /* arm64 EFI stub namespace */
-               "__kvm_nvhe_",          /* arm64 non-VHE KVM namespace */
+               "__kvm_nvhe_$",         /* arm64 local symbols in non-VHE KVM namespace */
+               "__kvm_nvhe_.L",        /* arm64 local symbols in non-VHE KVM namespace */
                "__AArch64ADRPThunk_",  /* arm64 lld */
                "__ARMV5PILongThunk_",  /* arm lld */
                "__ARMV7PILongThunk_",
diff --git a/tools/include/linux/arm-smccc.h b/tools/include/linux/arm-smccc.h
new file mode 100644 (file)
index 0000000..63ce9be
--- /dev/null
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2015, Linaro Limited
+ */
+#ifndef __LINUX_ARM_SMCCC_H
+#define __LINUX_ARM_SMCCC_H
+
+#include <linux/const.h>
+
+/*
+ * This file provides common defines for ARM SMC Calling Convention as
+ * specified in
+ * https://developer.arm.com/docs/den0028/latest
+ *
+ * This code is up-to-date with version DEN 0028 C
+ */
+
+#define ARM_SMCCC_STD_CALL             _AC(0,U)
+#define ARM_SMCCC_FAST_CALL            _AC(1,U)
+#define ARM_SMCCC_TYPE_SHIFT           31
+
+#define ARM_SMCCC_SMC_32               0
+#define ARM_SMCCC_SMC_64               1
+#define ARM_SMCCC_CALL_CONV_SHIFT      30
+
+#define ARM_SMCCC_OWNER_MASK           0x3F
+#define ARM_SMCCC_OWNER_SHIFT          24
+
+#define ARM_SMCCC_FUNC_MASK            0xFFFF
+
+#define ARM_SMCCC_IS_FAST_CALL(smc_val)        \
+       ((smc_val) & (ARM_SMCCC_FAST_CALL << ARM_SMCCC_TYPE_SHIFT))
+#define ARM_SMCCC_IS_64(smc_val) \
+       ((smc_val) & (ARM_SMCCC_SMC_64 << ARM_SMCCC_CALL_CONV_SHIFT))
+#define ARM_SMCCC_FUNC_NUM(smc_val)    ((smc_val) & ARM_SMCCC_FUNC_MASK)
+#define ARM_SMCCC_OWNER_NUM(smc_val) \
+       (((smc_val) >> ARM_SMCCC_OWNER_SHIFT) & ARM_SMCCC_OWNER_MASK)
+
+#define ARM_SMCCC_CALL_VAL(type, calling_convention, owner, func_num) \
+       (((type) << ARM_SMCCC_TYPE_SHIFT) | \
+       ((calling_convention) << ARM_SMCCC_CALL_CONV_SHIFT) | \
+       (((owner) & ARM_SMCCC_OWNER_MASK) << ARM_SMCCC_OWNER_SHIFT) | \
+       ((func_num) & ARM_SMCCC_FUNC_MASK))
+
+#define ARM_SMCCC_OWNER_ARCH           0
+#define ARM_SMCCC_OWNER_CPU            1
+#define ARM_SMCCC_OWNER_SIP            2
+#define ARM_SMCCC_OWNER_OEM            3
+#define ARM_SMCCC_OWNER_STANDARD       4
+#define ARM_SMCCC_OWNER_STANDARD_HYP   5
+#define ARM_SMCCC_OWNER_VENDOR_HYP     6
+#define ARM_SMCCC_OWNER_TRUSTED_APP    48
+#define ARM_SMCCC_OWNER_TRUSTED_APP_END        49
+#define ARM_SMCCC_OWNER_TRUSTED_OS     50
+#define ARM_SMCCC_OWNER_TRUSTED_OS_END 63
+
+#define ARM_SMCCC_FUNC_QUERY_CALL_UID  0xff01
+
+#define ARM_SMCCC_QUIRK_NONE           0
+#define ARM_SMCCC_QUIRK_QCOM_A6                1 /* Save/restore register a6 */
+
+#define ARM_SMCCC_VERSION_1_0          0x10000
+#define ARM_SMCCC_VERSION_1_1          0x10001
+#define ARM_SMCCC_VERSION_1_2          0x10002
+#define ARM_SMCCC_VERSION_1_3          0x10003
+
+#define ARM_SMCCC_1_3_SVE_HINT         0x10000
+
+#define ARM_SMCCC_VERSION_FUNC_ID                                      \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          0, 0)
+
+#define ARM_SMCCC_ARCH_FEATURES_FUNC_ID                                        \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          0, 1)
+
+#define ARM_SMCCC_ARCH_SOC_ID                                          \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          0, 2)
+
+#define ARM_SMCCC_ARCH_WORKAROUND_1                                    \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          0, 0x8000)
+
+#define ARM_SMCCC_ARCH_WORKAROUND_2                                    \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          0, 0x7fff)
+
+#define ARM_SMCCC_ARCH_WORKAROUND_3                                    \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          0, 0x3fff)
+
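
As a hedged sanity check of the ARM_SMCCC_CALL_VAL encoding above (assuming this new
header is on the include path, e.g. -Itools/include): bit 31 selects a fast call,
bit 30 the 64-bit convention, bits 29:24 the owner, and bits 15:0 the function number:

#include <linux/arm-smccc.h>

_Static_assert(ARM_SMCCC_VERSION_FUNC_ID == 0x80000000U,
	       "SMCCC_VERSION: fast SMC32 call, owner 0, function 0");
_Static_assert(ARM_SMCCC_ARCH_WORKAROUND_1 == 0x80008000U,
	       "ARCH_WORKAROUND_1: fast SMC32 call, owner 0, function 0x8000");
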
+#define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID                          \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          ARM_SMCCC_OWNER_VENDOR_HYP,                  \
+                          ARM_SMCCC_FUNC_QUERY_CALL_UID)
+
+/* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0     0xb66fb428U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1     0xe911c52eU
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2     0x564bcaa9U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3     0x743a004dU
+
+/* KVM "vendor specific" services */
+#define ARM_SMCCC_KVM_FUNC_FEATURES            0
+#define ARM_SMCCC_KVM_FUNC_PTP                 1
+#define ARM_SMCCC_KVM_FUNC_FEATURES_2          127
+#define ARM_SMCCC_KVM_NUM_FUNCS                        128
+
+#define ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID                      \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          ARM_SMCCC_OWNER_VENDOR_HYP,                  \
+                          ARM_SMCCC_KVM_FUNC_FEATURES)
+
+#define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED   1
+
+/*
+ * ptp_kvm is a feature used for time sync between vm and host.
+ * ptp_kvm module in guest kernel will get service from host using
+ * this hypercall ID.
+ */
+#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID                           \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          ARM_SMCCC_OWNER_VENDOR_HYP,                  \
+                          ARM_SMCCC_KVM_FUNC_PTP)
+
+/* ptp_kvm counter type ID */
+#define KVM_PTP_VIRT_COUNTER                   0
+#define KVM_PTP_PHYS_COUNTER                   1
+
+/* Paravirtualised time calls (defined by ARM DEN0057A) */
+#define ARM_SMCCC_HV_PV_TIME_FEATURES                          \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                 \
+                          ARM_SMCCC_SMC_64,                    \
+                          ARM_SMCCC_OWNER_STANDARD_HYP,        \
+                          0x20)
+
+#define ARM_SMCCC_HV_PV_TIME_ST                                        \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                 \
+                          ARM_SMCCC_SMC_64,                    \
+                          ARM_SMCCC_OWNER_STANDARD_HYP,        \
+                          0x21)
+
+/* TRNG entropy source calls (defined by ARM DEN0098) */
+#define ARM_SMCCC_TRNG_VERSION                                 \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                 \
+                          ARM_SMCCC_SMC_32,                    \
+                          ARM_SMCCC_OWNER_STANDARD,            \
+                          0x50)
+
+#define ARM_SMCCC_TRNG_FEATURES                                        \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                 \
+                          ARM_SMCCC_SMC_32,                    \
+                          ARM_SMCCC_OWNER_STANDARD,            \
+                          0x51)
+
+#define ARM_SMCCC_TRNG_GET_UUID                                        \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                 \
+                          ARM_SMCCC_SMC_32,                    \
+                          ARM_SMCCC_OWNER_STANDARD,            \
+                          0x52)
+
+#define ARM_SMCCC_TRNG_RND32                                   \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                 \
+                          ARM_SMCCC_SMC_32,                    \
+                          ARM_SMCCC_OWNER_STANDARD,            \
+                          0x53)
+
+#define ARM_SMCCC_TRNG_RND64                                   \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                 \
+                          ARM_SMCCC_SMC_64,                    \
+                          ARM_SMCCC_OWNER_STANDARD,            \
+                          0x53)
+
+/*
+ * Return codes defined in ARM DEN 0070A
+ * ARM DEN 0070A is now merged/consolidated into ARM DEN 0028 C
+ */
+#define SMCCC_RET_SUCCESS                      0
+#define SMCCC_RET_NOT_SUPPORTED                        -1
+#define SMCCC_RET_NOT_REQUIRED                 -2
+#define SMCCC_RET_INVALID_PARAMETER            -3
+
+#endif /*__LINUX_ARM_SMCCC_H*/
index 0aedcd7..de11992 100644 (file)
@@ -11,6 +11,7 @@ TARGETS += cpufreq
 TARGETS += cpu-hotplug
 TARGETS += damon
 TARGETS += drivers/dma-buf
+TARGETS += drivers/s390x/uvdevice
 TARGETS += efivarfs
 TARGETS += exec
 TARGETS += filesystems
index ca74f2e..09e23b5 100644 (file)
@@ -1,2 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 /dma-buf/udmabuf
+/s390x/uvdevice/test_uvdevice
diff --git a/tools/testing/selftests/drivers/s390x/uvdevice/Makefile b/tools/testing/selftests/drivers/s390x/uvdevice/Makefile
new file mode 100644 (file)
index 0000000..5e701d2
--- /dev/null
@@ -0,0 +1,22 @@
+include ../../../../../build/Build.include
+
+UNAME_M := $(shell uname -m)
+
+ifneq ($(UNAME_M),s390x)
+nothing:
+.PHONY: all clean run_tests install
+.SILENT:
+else
+
+TEST_GEN_PROGS := test_uvdevice
+
+top_srcdir ?= ../../../../../..
+KSFT_KHDR_INSTALL := 1
+khdr_dir = $(top_srcdir)/usr/include
+LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include
+
+CFLAGS += -Wall -Werror -static -I$(khdr_dir) -I$(LINUX_TOOL_ARCH_INCLUDE)
+
+include ../../../lib.mk
+
+endif
diff --git a/tools/testing/selftests/drivers/s390x/uvdevice/config b/tools/testing/selftests/drivers/s390x/uvdevice/config
new file mode 100644 (file)
index 0000000..f28a04b
--- /dev/null
@@ -0,0 +1 @@
+CONFIG_S390_UV_UAPI=y
diff --git a/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c b/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c
new file mode 100644 (file)
index 0000000..ea0cdc3
--- /dev/null
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  selftest for the Ultravisor UAPI device
+ *
+ *  Copyright IBM Corp. 2022
+ *  Author(s): Steffen Eiden <seiden@linux.ibm.com>
+ */
+
+#include <stdint.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <asm/uvdevice.h>
+
+#include "../../../kselftest_harness.h"
+
+#define UV_PATH  "/dev/uv"
+#define BUFFER_SIZE 0x200
+FIXTURE(uvio_fixture) {
+       int uv_fd;
+       struct uvio_ioctl_cb uvio_ioctl;
+       uint8_t buffer[BUFFER_SIZE];
+       __u64 fault_page;
+};
+
+FIXTURE_VARIANT(uvio_fixture) {
+       unsigned long ioctl_cmd;
+       uint32_t arg_size;
+};
+
+FIXTURE_VARIANT_ADD(uvio_fixture, att) {
+       .ioctl_cmd = UVIO_IOCTL_ATT,
+       .arg_size = sizeof(struct uvio_attest),
+};
+
+FIXTURE_SETUP(uvio_fixture)
+{
+       self->uv_fd = open(UV_PATH, O_ACCMODE);
+
+       self->uvio_ioctl.argument_addr = (__u64)self->buffer;
+       self->uvio_ioctl.argument_len = variant->arg_size;
+       self->fault_page =
+               (__u64)mmap(NULL, (size_t)getpagesize(), PROT_NONE, MAP_ANONYMOUS, -1, 0);
+}
+
+FIXTURE_TEARDOWN(uvio_fixture)
+{
+       if (self->uv_fd)
+               close(self->uv_fd);
+       munmap((void *)self->fault_page, (size_t)getpagesize());
+}
+
+TEST_F(uvio_fixture, fault_ioctl_arg)
+{
+       int rc, errno_cache;
+
+       rc = ioctl(self->uv_fd, variant->ioctl_cmd, NULL);
+       errno_cache = errno;
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno_cache, EFAULT);
+
+       rc = ioctl(self->uv_fd, variant->ioctl_cmd, self->fault_page);
+       errno_cache = errno;
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno_cache, EFAULT);
+}
+
+TEST_F(uvio_fixture, fault_uvio_arg)
+{
+       int rc, errno_cache;
+
+       self->uvio_ioctl.argument_addr = 0;
+       rc = ioctl(self->uv_fd, variant->ioctl_cmd, &self->uvio_ioctl);
+       errno_cache = errno;
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno_cache, EFAULT);
+
+       self->uvio_ioctl.argument_addr = self->fault_page;
+       rc = ioctl(self->uv_fd, variant->ioctl_cmd, &self->uvio_ioctl);
+       errno_cache = errno;
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno_cache, EFAULT);
+}
+
+/*
+ * Test to verify that IOCTLs with invalid values in the ioctl_control block
+ * are rejected.
+ */
+TEST_F(uvio_fixture, inval_ioctl_cb)
+{
+       int rc, errno_cache;
+
+       self->uvio_ioctl.argument_len = 0;
+       rc = ioctl(self->uv_fd, variant->ioctl_cmd, &self->uvio_ioctl);
+       errno_cache = errno;
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno_cache, EINVAL);
+
+       self->uvio_ioctl.argument_len = (uint32_t)-1;
+       rc = ioctl(self->uv_fd, variant->ioctl_cmd, &self->uvio_ioctl);
+       errno_cache = errno;
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno_cache, EINVAL);
+       self->uvio_ioctl.argument_len = variant->arg_size;
+
+       self->uvio_ioctl.flags = (uint32_t)-1;
+       rc = ioctl(self->uv_fd, variant->ioctl_cmd, &self->uvio_ioctl);
+       errno_cache = errno;
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno_cache, EINVAL);
+       self->uvio_ioctl.flags = 0;
+
+       memset(self->uvio_ioctl.reserved14, 0xff, sizeof(self->uvio_ioctl.reserved14));
+       rc = ioctl(self->uv_fd, variant->ioctl_cmd, &self->uvio_ioctl);
+       errno_cache = errno;
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno_cache, EINVAL);
+
+       memset(&self->uvio_ioctl, 0x11, sizeof(self->uvio_ioctl));
+       rc = ioctl(self->uv_fd, variant->ioctl_cmd, &self->uvio_ioctl);
+       ASSERT_EQ(rc, -1);
+}
+
+TEST_F(uvio_fixture, inval_ioctl_cmd)
+{
+       int rc, errno_cache;
+       uint8_t nr = _IOC_NR(variant->ioctl_cmd);
+       unsigned long cmds[] = {
+               _IOWR('a', nr, struct uvio_ioctl_cb),
+               _IOWR(UVIO_TYPE_UVC, nr, int),
+               _IO(UVIO_TYPE_UVC, nr),
+               _IOR(UVIO_TYPE_UVC, nr, struct uvio_ioctl_cb),
+               _IOW(UVIO_TYPE_UVC, nr, struct uvio_ioctl_cb),
+       };
+
+       for (size_t i = 0; i < ARRAY_SIZE(cmds); i++) {
+               rc = ioctl(self->uv_fd, cmds[i], &self->uvio_ioctl);
+               errno_cache = errno;
+               ASSERT_EQ(rc, -1);
+               ASSERT_EQ(errno_cache, ENOTTY);
+       }
+}
+
+struct test_attest_buffer {
+       uint8_t arcb[0x180];
+       uint8_t meas[64];
+       uint8_t add[32];
+};
+
+FIXTURE(attest_fixture) {
+       int uv_fd;
+       struct uvio_ioctl_cb uvio_ioctl;
+       struct uvio_attest uvio_attest;
+       struct test_attest_buffer attest_buffer;
+       __u64 fault_page;
+};
+
+FIXTURE_SETUP(attest_fixture)
+{
+       self->uv_fd = open(UV_PATH, O_ACCMODE);
+
+       self->uvio_ioctl.argument_addr = (__u64)&self->uvio_attest;
+       self->uvio_ioctl.argument_len = sizeof(self->uvio_attest);
+
+       self->uvio_attest.arcb_addr = (__u64)&self->attest_buffer.arcb;
+       self->uvio_attest.arcb_len = sizeof(self->attest_buffer.arcb);
+
+       self->uvio_attest.meas_addr = (__u64)&self->attest_buffer.meas;
+       self->uvio_attest.meas_len = sizeof(self->attest_buffer.meas);
+
+       self->uvio_attest.add_data_addr = (__u64)&self->attest_buffer.add;
+       self->uvio_attest.add_data_len = sizeof(self->attest_buffer.add);
+       self->fault_page =
+               (__u64)mmap(NULL, (size_t)getpagesize(), PROT_NONE, MAP_ANONYMOUS, -1, 0);
+}
+
+FIXTURE_TEARDOWN(attest_fixture)
+{
+       if (self->uv_fd)
+               close(self->uv_fd);
+       munmap((void *)self->fault_page, (size_t)getpagesize());
+}
+
+static void att_inval_sizes_test(uint32_t *size, uint32_t max_size, bool test_zero,
+                                struct __test_metadata *_metadata,
+                                FIXTURE_DATA(attest_fixture) *self)
+{
+       int rc, errno_cache;
+       uint32_t tmp = *size;
+
+       if (test_zero) {
+               *size = 0;
+               rc = ioctl(self->uv_fd, UVIO_IOCTL_ATT, &self->uvio_ioctl);
+               errno_cache = errno;
+               ASSERT_EQ(rc, -1);
+               ASSERT_EQ(errno_cache, EINVAL);
+       }
+       *size = max_size + 1;
+       rc = ioctl(self->uv_fd, UVIO_IOCTL_ATT, &self->uvio_ioctl);
+       errno_cache = errno;
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno_cache, EINVAL);
+       *size = tmp;
+}
+
+/*
+ * Test to verify that attestation IOCTLs with invalid values in the UVIO
+ * attestation control block are rejected.
+ */
+TEST_F(attest_fixture, att_inval_request)
+{
+       int rc, errno_cache;
+
+       att_inval_sizes_test(&self->uvio_attest.add_data_len, UVIO_ATT_ADDITIONAL_MAX_LEN,
+                            false, _metadata, self);
+       att_inval_sizes_test(&self->uvio_attest.meas_len, UVIO_ATT_MEASUREMENT_MAX_LEN,
+                            true, _metadata, self);
+       att_inval_sizes_test(&self->uvio_attest.arcb_len, UVIO_ATT_ARCB_MAX_LEN,
+                            true, _metadata, self);
+
+       self->uvio_attest.reserved136 = (uint16_t)-1;
+       rc = ioctl(self->uv_fd, UVIO_IOCTL_ATT, &self->uvio_ioctl);
+       errno_cache = errno;
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno_cache, EINVAL);
+
+       memset(&self->uvio_attest, 0x11, sizeof(self->uvio_attest));
+       rc = ioctl(self->uv_fd, UVIO_IOCTL_ATT, &self->uvio_ioctl);
+       ASSERT_EQ(rc, -1);
+}
+
+static void att_inval_addr_test(__u64 *addr, struct __test_metadata *_metadata,
+                               FIXTURE_DATA(attest_fixture) *self)
+{
+       int rc, errno_cache;
+       __u64 tmp = *addr;
+
+       *addr = 0;
+       rc = ioctl(self->uv_fd, UVIO_IOCTL_ATT, &self->uvio_ioctl);
+       errno_cache = errno;
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno_cache, EFAULT);
+       *addr = self->fault_page;
+       rc = ioctl(self->uv_fd, UVIO_IOCTL_ATT, &self->uvio_ioctl);
+       errno_cache = errno;
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno_cache, EFAULT);
+       *addr = tmp;
+}
+
+TEST_F(attest_fixture, att_inval_addr)
+{
+       att_inval_addr_test(&self->uvio_attest.arcb_addr, _metadata, self);
+       att_inval_addr_test(&self->uvio_attest.add_data_addr, _metadata, self);
+       att_inval_addr_test(&self->uvio_attest.meas_addr, _metadata, self);
+}
+
+static void __attribute__((constructor)) __constructor_order_last(void)
+{
+       if (!__constructor_order)
+               __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD;
+}
+
+int main(int argc, char **argv)
+{
+       int fd = open(UV_PATH, O_ACCMODE);
+
+       if (fd < 0)
+               ksft_exit_skip("No uv-device or cannot access " UV_PATH  "\n"
+                              "Enable CONFIG_S390_UV_UAPI and check the access rights on "
+                              UV_PATH ".\n");
+       close(fd);
+       return test_harness_run(argc, argv);
+}
index 0b0e440..4509a3a 100644 (file)
@@ -2,7 +2,8 @@
 /aarch64/arch_timer
 /aarch64/debug-exceptions
 /aarch64/get-reg-list
-/aarch64/psci_cpu_on_test
+/aarch64/hypercalls
+/aarch64/psci_test
 /aarch64/vcpu_width_config
 /aarch64/vgic_init
 /aarch64/vgic_irq
@@ -16,6 +17,7 @@
 /x86_64/debug_regs
 /x86_64/evmcs_test
 /x86_64/emulator_error_test
+/x86_64/fix_hypercall_test
 /x86_64/get_msr_index_features
 /x86_64/kvm_clock_test
 /x86_64/kvm_pv_test
@@ -53,7 +55,7 @@
 /x86_64/xen_shinfo_test
 /x86_64/xen_vmcall_test
 /x86_64/xss_msr_test
-/x86_64/vmx_pmu_msrs_test
+/x86_64/vmx_pmu_caps_test
 /access_tracking_perf_test
 /demand_paging_test
 /dirty_log_test
index 681b173..81470a9 100644 (file)
@@ -48,6 +48,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
 TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
 TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
 TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
+TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
@@ -65,6 +66,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
 TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test
+TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync
 TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
 TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test
 TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
@@ -81,7 +83,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xapic_state_test
 TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
 TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
 TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
-TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_msrs_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_caps_test
 TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test
 TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests
@@ -105,7 +107,8 @@ TEST_GEN_PROGS_x86_64 += system_counter_offset_test
 TEST_GEN_PROGS_aarch64 += aarch64/arch_timer
 TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
-TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test
+TEST_GEN_PROGS_aarch64 += aarch64/hypercalls
+TEST_GEN_PROGS_aarch64 += aarch64/psci_test
 TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config
 TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
 TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq
index 0b571f3..d3a7dbf 100644 (file)
@@ -294,6 +294,11 @@ static void print_reg(struct vcpu_config *c, __u64 id)
                            "%s: Unexpected bits set in FW reg id: 0x%llx", config_name(c), id);
                printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff);
                break;
+       case KVM_REG_ARM_FW_FEAT_BMAP:
+               TEST_ASSERT(id == KVM_REG_ARM_FW_FEAT_BMAP_REG(id & 0xffff),
+                           "%s: Unexpected bits set in the bitmap feature FW reg id: 0x%llx", config_name(c), id);
+               printf("\tKVM_REG_ARM_FW_FEAT_BMAP_REG(%lld),\n", id & 0xffff);
+               break;
        case KVM_REG_ARM64_SVE:
                if (has_cap(c, KVM_CAP_ARM_SVE))
                        printf("\t%s,\n", sve_id_to_str(c, id));
@@ -692,6 +697,9 @@ static __u64 base_regs[] = {
        KVM_REG_ARM_FW_REG(1),          /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1 */
        KVM_REG_ARM_FW_REG(2),          /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 */
        KVM_REG_ARM_FW_REG(3),          /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3 */
+       KVM_REG_ARM_FW_FEAT_BMAP_REG(0),        /* KVM_REG_ARM_STD_BMAP */
+       KVM_REG_ARM_FW_FEAT_BMAP_REG(1),        /* KVM_REG_ARM_STD_HYP_BMAP */
+       KVM_REG_ARM_FW_FEAT_BMAP_REG(2),        /* KVM_REG_ARM_VENDOR_HYP_BMAP */
        ARM64_SYS_REG(3, 3, 14, 3, 1),  /* CNTV_CTL_EL0 */
        ARM64_SYS_REG(3, 3, 14, 3, 2),  /* CNTV_CVAL_EL0 */
        ARM64_SYS_REG(3, 3, 14, 0, 2),
diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c
new file mode 100644 (file)
index 0000000..41e0210
--- /dev/null
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* hypercalls: Check the ARM64 pseudo-firmware bitmap register interface.
+ *
+ * The test validates the basic hypercall functionality that is exposed
+ * via the pseudo-firmware bitmap registers. This includes the registers'
+ * read/write behavior before and after the VM has started, and whether
+ * the hypercalls are properly masked or unmasked to the guest when they
+ * are disabled or enabled from KVM userspace, respectively.
+ */
+
+#include <errno.h>
+#include <linux/arm-smccc.h>
+#include <asm/kvm.h>
+#include <kvm_util.h>
+
+#include "processor.h"
+
+#define FW_REG_ULIMIT_VAL(max_feat_bit) (GENMASK(max_feat_bit, 0))
+
+/* Last valid bits of the bitmapped firmware registers */
+#define KVM_REG_ARM_STD_BMAP_BIT_MAX           0
+#define KVM_REG_ARM_STD_HYP_BMAP_BIT_MAX       0
+#define KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_MAX    1
+
+struct kvm_fw_reg_info {
+       uint64_t reg;           /* Register definition */
+       uint64_t max_feat_bit;  /* Bit that represents the upper limit of the feature-map */
+};
+
+#define FW_REG_INFO(r)                 \
+       {                                       \
+               .reg = r,                       \
+               .max_feat_bit = r##_BIT_MAX,    \
+       }
+
+static const struct kvm_fw_reg_info fw_reg_info[] = {
+       FW_REG_INFO(KVM_REG_ARM_STD_BMAP),
+       FW_REG_INFO(KVM_REG_ARM_STD_HYP_BMAP),
+       FW_REG_INFO(KVM_REG_ARM_VENDOR_HYP_BMAP),
+};
+
+enum test_stage {
+       TEST_STAGE_REG_IFACE,
+       TEST_STAGE_HVC_IFACE_FEAT_DISABLED,
+       TEST_STAGE_HVC_IFACE_FEAT_ENABLED,
+       TEST_STAGE_HVC_IFACE_FALSE_INFO,
+       TEST_STAGE_END,
+};
+
+static int stage = TEST_STAGE_REG_IFACE;
+
+struct test_hvc_info {
+       uint32_t func_id;
+       uint64_t arg1;
+};
+
+#define TEST_HVC_INFO(f, a1)   \
+       {                       \
+               .func_id = f,   \
+               .arg1 = a1,     \
+       }
+
+static const struct test_hvc_info hvc_info[] = {
+       /* KVM_REG_ARM_STD_BMAP */
+       TEST_HVC_INFO(ARM_SMCCC_TRNG_VERSION, 0),
+       TEST_HVC_INFO(ARM_SMCCC_TRNG_FEATURES, ARM_SMCCC_TRNG_RND64),
+       TEST_HVC_INFO(ARM_SMCCC_TRNG_GET_UUID, 0),
+       TEST_HVC_INFO(ARM_SMCCC_TRNG_RND32, 0),
+       TEST_HVC_INFO(ARM_SMCCC_TRNG_RND64, 0),
+
+       /* KVM_REG_ARM_STD_HYP_BMAP */
+       TEST_HVC_INFO(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_HV_PV_TIME_FEATURES),
+       TEST_HVC_INFO(ARM_SMCCC_HV_PV_TIME_FEATURES, ARM_SMCCC_HV_PV_TIME_ST),
+       TEST_HVC_INFO(ARM_SMCCC_HV_PV_TIME_ST, 0),
+
+       /* KVM_REG_ARM_VENDOR_HYP_BMAP */
+       TEST_HVC_INFO(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID,
+                       ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID),
+       TEST_HVC_INFO(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, 0),
+       TEST_HVC_INFO(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID, KVM_PTP_VIRT_COUNTER),
+};
+
+/* Feed false hypercall info to test the KVM behavior */
+static const struct test_hvc_info false_hvc_info[] = {
+       /* Feature support check against a different family of hypercalls */
+       TEST_HVC_INFO(ARM_SMCCC_TRNG_FEATURES, ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID),
+       TEST_HVC_INFO(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_TRNG_RND64),
+       TEST_HVC_INFO(ARM_SMCCC_HV_PV_TIME_FEATURES, ARM_SMCCC_TRNG_RND64),
+};
+
+static void guest_test_hvc(const struct test_hvc_info *hc_info)
+{
+       unsigned int i;
+       struct arm_smccc_res res;
+       unsigned int hvc_info_arr_sz;
+
+       hvc_info_arr_sz = hc_info == hvc_info ?
+                         ARRAY_SIZE(hvc_info) : ARRAY_SIZE(false_hvc_info);
+
+       for (i = 0; i < hvc_info_arr_sz; i++, hc_info++) {
+               memset(&res, 0, sizeof(res));
+               smccc_hvc(hc_info->func_id, hc_info->arg1, 0, 0, 0, 0, 0, 0, &res);
+
+               switch (stage) {
+               case TEST_STAGE_HVC_IFACE_FEAT_DISABLED:
+               case TEST_STAGE_HVC_IFACE_FALSE_INFO:
+                       GUEST_ASSERT_3(res.a0 == SMCCC_RET_NOT_SUPPORTED,
+                                       res.a0, hc_info->func_id, hc_info->arg1);
+                       break;
+               case TEST_STAGE_HVC_IFACE_FEAT_ENABLED:
+                       GUEST_ASSERT_3(res.a0 != SMCCC_RET_NOT_SUPPORTED,
+                                       res.a0, hc_info->func_id, hc_info->arg1);
+                       break;
+               default:
+                       GUEST_ASSERT_1(0, stage);
+               }
+       }
+}
+
+static void guest_code(void)
+{
+       while (stage != TEST_STAGE_END) {
+               switch (stage) {
+               case TEST_STAGE_REG_IFACE:
+                       break;
+               case TEST_STAGE_HVC_IFACE_FEAT_DISABLED:
+               case TEST_STAGE_HVC_IFACE_FEAT_ENABLED:
+                       guest_test_hvc(hvc_info);
+                       break;
+               case TEST_STAGE_HVC_IFACE_FALSE_INFO:
+                       guest_test_hvc(false_hvc_info);
+                       break;
+               default:
+                       GUEST_ASSERT_1(0, stage);
+               }
+
+               GUEST_SYNC(stage);
+       }
+
+       GUEST_DONE();
+}
+
+static int set_fw_reg(struct kvm_vm *vm, uint64_t id, uint64_t val)
+{
+       struct kvm_one_reg reg = {
+               .id = id,
+               .addr = (uint64_t)&val,
+       };
+
+       return _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, &reg);
+}
+
+static void get_fw_reg(struct kvm_vm *vm, uint64_t id, uint64_t *addr)
+{
+       struct kvm_one_reg reg = {
+               .id = id,
+               .addr = (uint64_t)addr,
+       };
+
+       vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, &reg);
+}
+
+struct st_time {
+       uint32_t rev;
+       uint32_t attr;
+       uint64_t st_time;
+};
+
+#define STEAL_TIME_SIZE                ((sizeof(struct st_time) + 63) & ~63)
+#define ST_GPA_BASE            (1 << 30)
+
+static void steal_time_init(struct kvm_vm *vm)
+{
+       uint64_t st_ipa = (ulong)ST_GPA_BASE;
+       unsigned int gpages;
+       struct kvm_device_attr dev = {
+               .group = KVM_ARM_VCPU_PVTIME_CTRL,
+               .attr = KVM_ARM_VCPU_PVTIME_IPA,
+               .addr = (uint64_t)&st_ipa,
+       };
+
+       gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE);
+       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0);
+
+       vcpu_ioctl(vm, 0, KVM_SET_DEVICE_ATTR, &dev);
+}
+
+static void test_fw_regs_before_vm_start(struct kvm_vm *vm)
+{
+       uint64_t val;
+       unsigned int i;
+       int ret;
+
+       for (i = 0; i < ARRAY_SIZE(fw_reg_info); i++) {
+               const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i];
+
+               /* First 'read' should be an upper limit of the features supported */
+               get_fw_reg(vm, reg_info->reg, &val);
+               TEST_ASSERT(val == FW_REG_ULIMIT_VAL(reg_info->max_feat_bit),
+                       "Expected all the features to be set for reg: 0x%lx; expected: 0x%lx; read: 0x%lx\n",
+                       reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), val);
+
+               /* Test a 'write' by disabling all the features of the register map */
+               ret = set_fw_reg(vm, reg_info->reg, 0);
+               TEST_ASSERT(ret == 0,
+                       "Failed to clear all the features of reg: 0x%lx; errno: %d\n",
+                       reg_info->reg, errno);
+
+               get_fw_reg(vm, reg_info->reg, &val);
+               TEST_ASSERT(val == 0,
+                       "Expected all the features to be cleared for reg: 0x%lx\n", reg_info->reg);
+
+               /*
+                * Test enabling a feature that's not supported.
+                * Avoid this check if all the bits are occupied.
+                */
+               if (reg_info->max_feat_bit < 63) {
+                       ret = set_fw_reg(vm, reg_info->reg, BIT(reg_info->max_feat_bit + 1));
+                       TEST_ASSERT(ret != 0 && errno == EINVAL,
+                       "Unexpected behavior or return value (%d) while setting an unsupported feature for reg: 0x%lx\n",
+                       errno, reg_info->reg);
+               }
+       }
+}
+
+static void test_fw_regs_after_vm_start(struct kvm_vm *vm)
+{
+       uint64_t val;
+       unsigned int i;
+       int ret;
+
+       for (i = 0; i < ARRAY_SIZE(fw_reg_info); i++) {
+               const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i];
+
+               /*
+                * Before starting the VM, the test clears all the bits.
+                * Check if that's still the case.
+                */
+               get_fw_reg(vm, reg_info->reg, &val);
+               TEST_ASSERT(val == 0,
+                       "Expected all the features to be cleared for reg: 0x%lx\n",
+                       reg_info->reg);
+
+               /*
+                * Since the VM has run at least once, KVM shouldn't allow modification of
+                * the registers and should return EBUSY. Set the registers and check for
+                * the expected errno.
+                */
+               ret = set_fw_reg(vm, reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit));
+               TEST_ASSERT(ret != 0 && errno == EBUSY,
+               "Unexpected behavior or return value (%d) while setting a feature while VM is running for reg: 0x%lx\n",
+               errno, reg_info->reg);
+       }
+}
+
+static struct kvm_vm *test_vm_create(void)
+{
+       struct kvm_vm *vm;
+
+       vm = vm_create_default(0, 0, guest_code);
+
+       ucall_init(vm, NULL);
+       steal_time_init(vm);
+
+       return vm;
+}
+
+static struct kvm_vm *test_guest_stage(struct kvm_vm *vm)
+{
+       struct kvm_vm *ret_vm = vm;
+
+       pr_debug("Stage: %d\n", stage);
+
+       switch (stage) {
+       case TEST_STAGE_REG_IFACE:
+               test_fw_regs_after_vm_start(vm);
+               break;
+       case TEST_STAGE_HVC_IFACE_FEAT_DISABLED:
+               /* Start a new VM so that all the features are now enabled by default */
+               kvm_vm_free(vm);
+               ret_vm = test_vm_create();
+               break;
+       case TEST_STAGE_HVC_IFACE_FEAT_ENABLED:
+       case TEST_STAGE_HVC_IFACE_FALSE_INFO:
+               break;
+       default:
+               TEST_FAIL("Unknown test stage: %d\n", stage);
+       }
+
+       stage++;
+       sync_global_to_guest(vm, stage);
+
+       return ret_vm;
+}
+
+static void test_run(void)
+{
+       struct kvm_vm *vm;
+       struct ucall uc;
+       bool guest_done = false;
+
+       vm = test_vm_create();
+
+       test_fw_regs_before_vm_start(vm);
+
+       while (!guest_done) {
+               vcpu_run(vm, 0);
+
+               switch (get_ucall(vm, 0, &uc)) {
+               case UCALL_SYNC:
+                       vm = test_guest_stage(vm);
+                       break;
+               case UCALL_DONE:
+                       guest_done = true;
+                       break;
+               case UCALL_ABORT:
+                       TEST_FAIL("%s at %s:%ld\n\tvalues: 0x%lx, 0x%lx; 0x%lx, stage: %u",
+                       (const char *)uc.args[0], __FILE__, uc.args[1],
+                       uc.args[2], uc.args[3], uc.args[4], stage);
+                       break;
+               default:
+                       TEST_FAIL("Unexpected guest exit\n");
+               }
+       }
+
+       kvm_vm_free(vm);
+}
+
+int main(void)
+{
+       setbuf(stdout, NULL);
+
+       test_run();
+       return 0;
+}
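
For context, the userspace half of the bitmap firmware register interface exercised above is plain KVM_GET_ONE_REG/KVM_SET_ONE_REG. The sketch below is illustrative only (a hypothetical helper, not part of this series); it assumes an already-created vCPU fd and the uapi definitions from <asm/kvm.h>, and clears the standard-service bitmap so the TRNG hypercalls read as not supported in the guest, just as set_fw_reg() does through the selftest wrappers.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <asm/kvm.h>

/*
 * Hypothetical sketch: clear every bit of KVM_REG_ARM_STD_BMAP (bit 0 is the
 * TRNG service family). This must happen before the vCPU first runs; after
 * that, KVM rejects writes with EBUSY, as test_fw_regs_after_vm_start() checks.
 */
static int hide_std_hypercalls(int vcpu_fd)
{
        uint64_t val = 0;
        struct kvm_one_reg reg = {
                .id = KVM_REG_ARM_STD_BMAP,
                .addr = (uint64_t)&val,
        };

        return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}
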
diff --git a/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c b/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c
deleted file mode 100644 (file)
index 4c5f681..0000000
+++ /dev/null
@@ -1,121 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * psci_cpu_on_test - Test that the observable state of a vCPU targeted by the
- * CPU_ON PSCI call matches what the caller requested.
- *
- * Copyright (c) 2021 Google LLC.
- *
- * This is a regression test for a race between KVM servicing the PSCI call and
- * userspace reading the vCPUs registers.
- */
-
-#define _GNU_SOURCE
-
-#include <linux/psci.h>
-
-#include "kvm_util.h"
-#include "processor.h"
-#include "test_util.h"
-
-#define VCPU_ID_SOURCE 0
-#define VCPU_ID_TARGET 1
-
-#define CPU_ON_ENTRY_ADDR 0xfeedf00dul
-#define CPU_ON_CONTEXT_ID 0xdeadc0deul
-
-static uint64_t psci_cpu_on(uint64_t target_cpu, uint64_t entry_addr,
-                           uint64_t context_id)
-{
-       register uint64_t x0 asm("x0") = PSCI_0_2_FN64_CPU_ON;
-       register uint64_t x1 asm("x1") = target_cpu;
-       register uint64_t x2 asm("x2") = entry_addr;
-       register uint64_t x3 asm("x3") = context_id;
-
-       asm("hvc #0"
-           : "=r"(x0)
-           : "r"(x0), "r"(x1), "r"(x2), "r"(x3)
-           : "memory");
-
-       return x0;
-}
-
-static uint64_t psci_affinity_info(uint64_t target_affinity,
-                                  uint64_t lowest_affinity_level)
-{
-       register uint64_t x0 asm("x0") = PSCI_0_2_FN64_AFFINITY_INFO;
-       register uint64_t x1 asm("x1") = target_affinity;
-       register uint64_t x2 asm("x2") = lowest_affinity_level;
-
-       asm("hvc #0"
-           : "=r"(x0)
-           : "r"(x0), "r"(x1), "r"(x2)
-           : "memory");
-
-       return x0;
-}
-
-static void guest_main(uint64_t target_cpu)
-{
-       GUEST_ASSERT(!psci_cpu_on(target_cpu, CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID));
-       uint64_t target_state;
-
-       do {
-               target_state = psci_affinity_info(target_cpu, 0);
-
-               GUEST_ASSERT((target_state == PSCI_0_2_AFFINITY_LEVEL_ON) ||
-                            (target_state == PSCI_0_2_AFFINITY_LEVEL_OFF));
-       } while (target_state != PSCI_0_2_AFFINITY_LEVEL_ON);
-
-       GUEST_DONE();
-}
-
-int main(void)
-{
-       uint64_t target_mpidr, obs_pc, obs_x0;
-       struct kvm_vcpu_init init;
-       struct kvm_vm *vm;
-       struct ucall uc;
-
-       vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
-       kvm_vm_elf_load(vm, program_invocation_name);
-       ucall_init(vm, NULL);
-
-       vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init);
-       init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2);
-
-       aarch64_vcpu_add_default(vm, VCPU_ID_SOURCE, &init, guest_main);
-
-       /*
-        * make sure the target is already off when executing the test.
-        */
-       init.features[0] |= (1 << KVM_ARM_VCPU_POWER_OFF);
-       aarch64_vcpu_add_default(vm, VCPU_ID_TARGET, &init, guest_main);
-
-       get_reg(vm, VCPU_ID_TARGET, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &target_mpidr);
-       vcpu_args_set(vm, VCPU_ID_SOURCE, 1, target_mpidr & MPIDR_HWID_BITMASK);
-       vcpu_run(vm, VCPU_ID_SOURCE);
-
-       switch (get_ucall(vm, VCPU_ID_SOURCE, &uc)) {
-       case UCALL_DONE:
-               break;
-       case UCALL_ABORT:
-               TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__,
-                         uc.args[1]);
-               break;
-       default:
-               TEST_FAIL("Unhandled ucall: %lu", uc.cmd);
-       }
-
-       get_reg(vm, VCPU_ID_TARGET, ARM64_CORE_REG(regs.pc), &obs_pc);
-       get_reg(vm, VCPU_ID_TARGET, ARM64_CORE_REG(regs.regs[0]), &obs_x0);
-
-       TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR,
-                   "unexpected target cpu pc: %lx (expected: %lx)",
-                   obs_pc, CPU_ON_ENTRY_ADDR);
-       TEST_ASSERT(obs_x0 == CPU_ON_CONTEXT_ID,
-                   "unexpected target context id: %lx (expected: %lx)",
-                   obs_x0, CPU_ON_CONTEXT_ID);
-
-       kvm_vm_free(vm);
-       return 0;
-}
diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c
new file mode 100644 (file)
index 0000000..88541de
--- /dev/null
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * psci_test - Tests of KVM's PSCI implementation. Checks that the observable
+ * state of a vCPU targeted by a CPU_ON call matches what the caller requested
+ * (a regression test for a race between KVM servicing the PSCI call and
+ * userspace reading the vCPU's registers), and that a guest SYSTEM_SUSPEND
+ * call is reported to userspace as a KVM_SYSTEM_EVENT_SUSPEND exit.
+ *
+ * Copyright (c) 2021 Google LLC.
+ */
+
+#define _GNU_SOURCE
+
+#include <linux/psci.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+#define VCPU_ID_SOURCE 0
+#define VCPU_ID_TARGET 1
+
+#define CPU_ON_ENTRY_ADDR 0xfeedf00dul
+#define CPU_ON_CONTEXT_ID 0xdeadc0deul
+
+static uint64_t psci_cpu_on(uint64_t target_cpu, uint64_t entry_addr,
+                           uint64_t context_id)
+{
+       struct arm_smccc_res res;
+
+       smccc_hvc(PSCI_0_2_FN64_CPU_ON, target_cpu, entry_addr, context_id,
+                 0, 0, 0, 0, &res);
+
+       return res.a0;
+}
+
+static uint64_t psci_affinity_info(uint64_t target_affinity,
+                                  uint64_t lowest_affinity_level)
+{
+       struct arm_smccc_res res;
+
+       smccc_hvc(PSCI_0_2_FN64_AFFINITY_INFO, target_affinity, lowest_affinity_level,
+                 0, 0, 0, 0, 0, &res);
+
+       return res.a0;
+}
+
+static uint64_t psci_system_suspend(uint64_t entry_addr, uint64_t context_id)
+{
+       struct arm_smccc_res res;
+
+       smccc_hvc(PSCI_1_0_FN64_SYSTEM_SUSPEND, entry_addr, context_id,
+                 0, 0, 0, 0, 0, &res);
+
+       return res.a0;
+}
+
+static uint64_t psci_features(uint32_t func_id)
+{
+       struct arm_smccc_res res;
+
+       smccc_hvc(PSCI_1_0_FN_PSCI_FEATURES, func_id, 0, 0, 0, 0, 0, 0, &res);
+
+       return res.a0;
+}
+
+static void vcpu_power_off(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       struct kvm_mp_state mp_state = {
+               .mp_state = KVM_MP_STATE_STOPPED,
+       };
+
+       vcpu_set_mp_state(vm, vcpuid, &mp_state);
+}
+
+static struct kvm_vm *setup_vm(void *guest_code)
+{
+       struct kvm_vcpu_init init;
+       struct kvm_vm *vm;
+
+       vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
+       kvm_vm_elf_load(vm, program_invocation_name);
+       ucall_init(vm, NULL);
+
+       vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init);
+       init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2);
+
+       aarch64_vcpu_add_default(vm, VCPU_ID_SOURCE, &init, guest_code);
+       aarch64_vcpu_add_default(vm, VCPU_ID_TARGET, &init, guest_code);
+
+       return vm;
+}
+
+static void enter_guest(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       struct ucall uc;
+
+       vcpu_run(vm, vcpuid);
+       if (get_ucall(vm, vcpuid, &uc) == UCALL_ABORT)
+               TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__,
+                         uc.args[1]);
+}
+
+static void assert_vcpu_reset(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       uint64_t obs_pc, obs_x0;
+
+       get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &obs_pc);
+       get_reg(vm, vcpuid, ARM64_CORE_REG(regs.regs[0]), &obs_x0);
+
+       TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR,
+                   "unexpected target cpu pc: %lx (expected: %lx)",
+                   obs_pc, CPU_ON_ENTRY_ADDR);
+       TEST_ASSERT(obs_x0 == CPU_ON_CONTEXT_ID,
+                   "unexpected target context id: %lx (expected: %lx)",
+                   obs_x0, CPU_ON_CONTEXT_ID);
+}
+
+static void guest_test_cpu_on(uint64_t target_cpu)
+{
+       uint64_t target_state;
+
+       GUEST_ASSERT(!psci_cpu_on(target_cpu, CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID));
+
+       do {
+               target_state = psci_affinity_info(target_cpu, 0);
+
+               GUEST_ASSERT((target_state == PSCI_0_2_AFFINITY_LEVEL_ON) ||
+                            (target_state == PSCI_0_2_AFFINITY_LEVEL_OFF));
+       } while (target_state != PSCI_0_2_AFFINITY_LEVEL_ON);
+
+       GUEST_DONE();
+}
+
+static void host_test_cpu_on(void)
+{
+       uint64_t target_mpidr;
+       struct kvm_vm *vm;
+       struct ucall uc;
+
+       vm = setup_vm(guest_test_cpu_on);
+
+       /*
+        * make sure the target is already off when executing the test.
+        */
+       vcpu_power_off(vm, VCPU_ID_TARGET);
+
+       get_reg(vm, VCPU_ID_TARGET, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &target_mpidr);
+       vcpu_args_set(vm, VCPU_ID_SOURCE, 1, target_mpidr & MPIDR_HWID_BITMASK);
+       enter_guest(vm, VCPU_ID_SOURCE);
+
+       if (get_ucall(vm, VCPU_ID_SOURCE, &uc) != UCALL_DONE)
+               TEST_FAIL("Unhandled ucall: %lu", uc.cmd);
+
+       assert_vcpu_reset(vm, VCPU_ID_TARGET);
+       kvm_vm_free(vm);
+}
+
+static void enable_system_suspend(struct kvm_vm *vm)
+{
+       struct kvm_enable_cap cap = {
+               .cap = KVM_CAP_ARM_SYSTEM_SUSPEND,
+       };
+
+       vm_enable_cap(vm, &cap);
+}
+
+static void guest_test_system_suspend(void)
+{
+       uint64_t ret;
+
+       /* assert that SYSTEM_SUSPEND is discoverable */
+       GUEST_ASSERT(!psci_features(PSCI_1_0_FN_SYSTEM_SUSPEND));
+       GUEST_ASSERT(!psci_features(PSCI_1_0_FN64_SYSTEM_SUSPEND));
+
+       ret = psci_system_suspend(CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID);
+       GUEST_SYNC(ret);
+}
+
+static void host_test_system_suspend(void)
+{
+       struct kvm_run *run;
+       struct kvm_vm *vm;
+
+       vm = setup_vm(guest_test_system_suspend);
+       enable_system_suspend(vm);
+
+       vcpu_power_off(vm, VCPU_ID_TARGET);
+       run = vcpu_state(vm, VCPU_ID_SOURCE);
+
+       enter_guest(vm, VCPU_ID_SOURCE);
+
+       TEST_ASSERT(run->exit_reason == KVM_EXIT_SYSTEM_EVENT,
+                   "Unhandled exit reason: %u (%s)",
+                   run->exit_reason, exit_reason_str(run->exit_reason));
+       TEST_ASSERT(run->system_event.type == KVM_SYSTEM_EVENT_SUSPEND,
+                   "Unhandled system event: %u (expected: %u)",
+                   run->system_event.type, KVM_SYSTEM_EVENT_SUSPEND);
+
+       kvm_vm_free(vm);
+}
+
+int main(void)
+{
+       if (!kvm_check_cap(KVM_CAP_ARM_SYSTEM_SUSPEND)) {
+               print_skip("KVM_CAP_ARM_SYSTEM_SUSPEND not supported");
+               exit(KSFT_SKIP);
+       }
+
+       host_test_cpu_on();
+       host_test_system_suspend();
+       return 0;
+}
index 8f9f469..59ece9d 100644 (file)
@@ -185,4 +185,26 @@ static inline void local_irq_disable(void)
        asm volatile("msr daifset, #3" : : : "memory");
 }
 
+/**
+ * struct arm_smccc_res - Result from SMC/HVC call
+ * @a0-a3: result values from registers 0 to 3
+ */
+struct arm_smccc_res {
+       unsigned long a0;
+       unsigned long a1;
+       unsigned long a2;
+       unsigned long a3;
+};
+
+/**
+ * smccc_hvc - Invoke a SMCCC function using the hvc conduit
+ * @function_id: the SMCCC function to be called
+ * @arg0-arg6: SMCCC function arguments, corresponding to registers x1-x7
+ * @res: pointer to write the return values from registers x0-x3
+ *
+ */
+void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1,
+              uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5,
+              uint64_t arg6, struct arm_smccc_res *res);
+
 #endif /* SELFTEST_KVM_PROCESSOR_H */
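
As an illustration of the new helper's calling convention (function ID in w0, arguments in x1-x7, results back in a0-a3), a guest-side probe might look like the sketch below. It is an assumption, not code from this series, and relies only on SMCCC constants already used by the hypercalls selftest (ARM_SMCCC_TRNG_VERSION, SMCCC_RET_NOT_SUPPORTED).

/* Sketch only: probe the TRNG service from guest code via the hvc conduit. */
static bool guest_has_trng(void)
{
        struct arm_smccc_res res;

        /* a0 carries the SMCCC return value; NOT_SUPPORTED means "absent" */
        smccc_hvc(ARM_SMCCC_TRNG_VERSION, 0, 0, 0, 0, 0, 0, 0, &res);
        return res.a0 != SMCCC_RET_NOT_SUPPORTED;
}
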
index eca5c62..4fcfd1c 100644 (file)
@@ -119,10 +119,12 @@ static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id,
 #define SATP_ASID_SHIFT                                44
 #define SATP_ASID_MASK                         _AC(0xFFFF, UL)
 
-#define SBI_EXT_EXPERIMENTAL_START     0x08000000
-#define SBI_EXT_EXPERIMENTAL_END       0x08FFFFFF
+#define SBI_EXT_EXPERIMENTAL_START             0x08000000
+#define SBI_EXT_EXPERIMENTAL_END               0x08FFFFFF
 
-#define KVM_RISCV_SELFTESTS_SBI_EXT    SBI_EXT_EXPERIMENTAL_END
+#define KVM_RISCV_SELFTESTS_SBI_EXT            SBI_EXT_EXPERIMENTAL_END
+#define KVM_RISCV_SELFTESTS_SBI_UCALL          0
+#define KVM_RISCV_SELFTESTS_SBI_UNEXP          1
 
 struct sbiret {
        long error;
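
The two new function IDs split the private selftests SBI extension into a ucall channel and an "unexpected trap" report. A guest-side sketch, assuming the sbi_ecall() helper these selftests already use (extension ID, function ID, then six arguments), is:

/* Sketch only: mirrors how guest_unexp_trap()/ucall() use the extension below. */
static inline void guest_report_unexpected_trap(void)
{
        /* The host's get_ucall() turns this into an "Unexpected trap" failure */
        sbi_ecall(KVM_RISCV_SELFTESTS_SBI_EXT, KVM_RISCV_SELFTESTS_SBI_UNEXP,
                  0, 0, 0, 0, 0, 0);
}
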
index 9343d82..6a04128 100644 (file)
@@ -500,3 +500,28 @@ void __attribute__((constructor)) init_guest_modes(void)
 {
        guest_modes_append_default();
 }
+
+void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1,
+              uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5,
+              uint64_t arg6, struct arm_smccc_res *res)
+{
+       asm volatile("mov   w0, %w[function_id]\n"
+                    "mov   x1, %[arg0]\n"
+                    "mov   x2, %[arg1]\n"
+                    "mov   x3, %[arg2]\n"
+                    "mov   x4, %[arg3]\n"
+                    "mov   x5, %[arg4]\n"
+                    "mov   x6, %[arg5]\n"
+                    "mov   x7, %[arg6]\n"
+                    "hvc   #0\n"
+                    "mov   %[res0], x0\n"
+                    "mov   %[res1], x1\n"
+                    "mov   %[res2], x2\n"
+                    "mov   %[res3], x3\n"
+                    : [res0] "=r"(res->a0), [res1] "=r"(res->a1),
+                      [res2] "=r"(res->a2), [res3] "=r"(res->a3)
+                    : [function_id] "r"(function_id), [arg0] "r"(arg0),
+                      [arg1] "r"(arg1), [arg2] "r"(arg2), [arg3] "r"(arg3),
+                      [arg4] "r"(arg4), [arg5] "r"(arg5), [arg6] "r"(arg6)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7");
+}
index 3961487..abc0ae5 100644 (file)
@@ -268,10 +268,11 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
                core.regs.t3, core.regs.t4, core.regs.t5, core.regs.t6);
 }
 
-static void __aligned(16) guest_hang(void)
+static void __aligned(16) guest_unexp_trap(void)
 {
-       while (1)
-               ;
+       sbi_ecall(KVM_RISCV_SELFTESTS_SBI_EXT,
+                 KVM_RISCV_SELFTESTS_SBI_UNEXP,
+                 0, 0, 0, 0, 0, 0);
 }
 
 void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
@@ -310,7 +311,7 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
 
        /* Setup default exception vector of guest */
        set_reg(vm, vcpuid, RISCV_CSR_REG(stvec),
-               (unsigned long)guest_hang);
+               (unsigned long)guest_unexp_trap);
 }
 
 void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
@@ -350,7 +351,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
                case 7:
                        id = RISCV_CORE_REG(regs.a7);
                        break;
-               };
+               }
                set_reg(vm, vcpuid, id, va_arg(ap, uint64_t));
        }
 
index 9e42d82..8550f42 100644 (file)
@@ -60,8 +60,9 @@ void ucall(uint64_t cmd, int nargs, ...)
                uc.args[i] = va_arg(va, uint64_t);
        va_end(va);
 
-       sbi_ecall(KVM_RISCV_SELFTESTS_SBI_EXT, 0, (vm_vaddr_t)&uc,
-                 0, 0, 0, 0, 0);
+       sbi_ecall(KVM_RISCV_SELFTESTS_SBI_EXT,
+                 KVM_RISCV_SELFTESTS_SBI_UCALL,
+                 (vm_vaddr_t)&uc, 0, 0, 0, 0, 0);
 }
 
 uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
@@ -73,14 +74,24 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
                memset(uc, 0, sizeof(*uc));
 
        if (run->exit_reason == KVM_EXIT_RISCV_SBI &&
-           run->riscv_sbi.extension_id == KVM_RISCV_SELFTESTS_SBI_EXT &&
-           run->riscv_sbi.function_id == 0) {
-               memcpy(&ucall, addr_gva2hva(vm, run->riscv_sbi.args[0]),
-                       sizeof(ucall));
-
-               vcpu_run_complete_io(vm, vcpu_id);
-               if (uc)
-                       memcpy(uc, &ucall, sizeof(ucall));
+           run->riscv_sbi.extension_id == KVM_RISCV_SELFTESTS_SBI_EXT) {
+               switch (run->riscv_sbi.function_id) {
+               case KVM_RISCV_SELFTESTS_SBI_UCALL:
+                       memcpy(&ucall, addr_gva2hva(vm,
+                              run->riscv_sbi.args[0]), sizeof(ucall));
+
+                       vcpu_run_complete_io(vm, vcpu_id);
+                       if (uc)
+                               memcpy(uc, &ucall, sizeof(ucall));
+
+                       break;
+               case KVM_RISCV_SELFTESTS_SBI_UNEXP:
+                       vcpu_dump(stderr, vm, vcpu_id, 2);
+                       TEST_ASSERT(0, "Unexpected trap taken by guest");
+                       break;
+               default:
+                       break;
+               }
        }
 
        return ucall.cmd;
index b04c2c1..49f26f5 100644 (file)
@@ -10,6 +10,8 @@
 #include <string.h>
 #include <sys/ioctl.h>
 
+#include <linux/bits.h>
+
 #include "test_util.h"
 #include "kvm_util.h"
 
@@ -194,6 +196,7 @@ static int err_memop_ioctl(struct test_vcpu vcpu, struct kvm_s390_mem_op *ksmo)
 #define SIDA_OFFSET(o) ._sida_offset = 1, .sida_offset = (o)
 #define AR(a) ._ar = 1, .ar = (a)
 #define KEY(a) .f_key = 1, .key = (a)
+#define INJECT .f_inject = 1
 
 #define CHECK_N_DO(f, ...) ({ f(__VA_ARGS__, CHECK_ONLY); f(__VA_ARGS__); })
 
@@ -430,9 +433,18 @@ static void test_copy_key_fetch_prot(void)
        TEST_ASSERT(rv == 4, "Should result in protection exception");          \
 })
 
+static void guest_error_key(void)
+{
+       GUEST_SYNC(STAGE_INITED);
+       set_storage_key_range(mem1, PAGE_SIZE, 0x18);
+       set_storage_key_range(mem1 + PAGE_SIZE, sizeof(mem1) - PAGE_SIZE, 0x98);
+       GUEST_SYNC(STAGE_SKEYS_SET);
+       GUEST_SYNC(STAGE_IDLED);
+}
+
 static void test_errors_key(void)
 {
-       struct test_default t = test_default_init(guest_copy_key_fetch_prot);
+       struct test_default t = test_default_init(guest_error_key);
 
        HOST_SYNC(t.vcpu, STAGE_INITED);
        HOST_SYNC(t.vcpu, STAGE_SKEYS_SET);
@@ -446,6 +458,37 @@ static void test_errors_key(void)
        kvm_vm_free(t.kvm_vm);
 }
 
+static void test_termination(void)
+{
+       struct test_default t = test_default_init(guest_error_key);
+       uint64_t prefix;
+       uint64_t teid;
+       uint64_t teid_mask = BIT(63 - 56) | BIT(63 - 60) | BIT(63 - 61);
+       uint64_t psw[2];
+
+       HOST_SYNC(t.vcpu, STAGE_INITED);
+       HOST_SYNC(t.vcpu, STAGE_SKEYS_SET);
+
+       /* vcpu, mismatching keys after first page */
+       ERR_PROT_MOP(t.vcpu, LOGICAL, WRITE, mem1, t.size, GADDR_V(mem1), KEY(1), INJECT);
+       /*
+        * The memop injected a program exception and the test needs to check the
+        * Translation-Exception Identification (TEID). It is necessary to run
+        * the guest in order to be able to read the TEID from guest memory.
+        * Set the guest program new PSW, so the guest state is not clobbered.
+        */
+       prefix = t.run->s.regs.prefix;
+       psw[0] = t.run->psw_mask;
+       psw[1] = t.run->psw_addr;
+       MOP(t.vm, ABSOLUTE, WRITE, psw, sizeof(psw), GADDR(prefix + 464));
+       HOST_SYNC(t.vcpu, STAGE_IDLED);
+       MOP(t.vm, ABSOLUTE, READ, &teid, sizeof(teid), GADDR(prefix + 168));
+       /* Bits 56, 60, 61 form a code, 0 being the only one allowing for termination */
+       ASSERT_EQ(teid & teid_mask, 0);
+
+       kvm_vm_free(t.kvm_vm);
+}
+
 static void test_errors_key_storage_prot_override(void)
 {
        struct test_default t = test_default_init(guest_copy_key_fetch_prot);
@@ -668,6 +711,7 @@ int main(int argc, char *argv[])
                test_copy_key_fetch_prot();
                test_copy_key_fetch_prot_override();
                test_errors_key();
+               test_termination();
                test_errors_key_storage_prot_override();
                test_errors_key_fetch_prot_override_not_enabled();
                test_errors_key_fetch_prot_override_enabled();
index 62f2eb9..8c4e811 100644 (file)
@@ -118,17 +118,10 @@ struct st_time {
 
 static int64_t smccc(uint32_t func, uint64_t arg)
 {
-       unsigned long ret;
+       struct arm_smccc_res res;
 
-       asm volatile(
-               "mov    w0, %w1\n"
-               "mov    x1, %2\n"
-               "hvc    #0\n"
-               "mov    %0, x0\n"
-       : "=r" (ret) : "r" (func), "r" (arg) :
-         "x0", "x1", "x2", "x3");
-
-       return ret;
+       smccc_hvc(func, arg, 0, 0, 0, 0, 0, 0, &res);
+       return res.a0;
 }
 
 static void check_status(struct st_time *st)
diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
new file mode 100644 (file)
index 0000000..1f5c321
--- /dev/null
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020, Google LLC.
+ *
+ * Tests for KVM paravirtual feature disablement
+ */
+#include <asm/kvm_para.h>
+#include <linux/kvm_para.h>
+#include <linux/stringify.h>
+#include <stdint.h>
+
+#include "apic.h"
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+static bool ud_expected;
+
+static void guest_ud_handler(struct ex_regs *regs)
+{
+       GUEST_ASSERT(ud_expected);
+       GUEST_DONE();
+}
+
+extern unsigned char svm_hypercall_insn;
+static uint64_t svm_do_sched_yield(uint8_t apic_id)
+{
+       uint64_t ret;
+
+       asm volatile("mov %1, %%rax\n\t"
+                    "mov %2, %%rbx\n\t"
+                    "svm_hypercall_insn:\n\t"
+                    "vmmcall\n\t"
+                    "mov %%rax, %0\n\t"
+                    : "=r"(ret)
+                    : "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id)
+                    : "rax", "rbx", "memory");
+
+       return ret;
+}
+
+extern unsigned char vmx_hypercall_insn;
+static uint64_t vmx_do_sched_yield(uint8_t apic_id)
+{
+       uint64_t ret;
+
+       asm volatile("mov %1, %%rax\n\t"
+                    "mov %2, %%rbx\n\t"
+                    "vmx_hypercall_insn:\n\t"
+                    "vmcall\n\t"
+                    "mov %%rax, %0\n\t"
+                    : "=r"(ret)
+                    : "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id)
+                    : "rax", "rbx", "memory");
+
+       return ret;
+}
+
+static void assert_hypercall_insn(unsigned char *exp_insn, unsigned char *obs_insn)
+{
+       uint32_t exp = 0, obs = 0;
+
+       memcpy(&exp, exp_insn, sizeof(exp));
+       memcpy(&obs, obs_insn, sizeof(obs));
+
+       GUEST_ASSERT_EQ(exp, obs);
+}
+
+static void guest_main(void)
+{
+       unsigned char *native_hypercall_insn, *hypercall_insn;
+       uint8_t apic_id;
+
+       apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID));
+
+       if (is_intel_cpu()) {
+               native_hypercall_insn = &vmx_hypercall_insn;
+               hypercall_insn = &svm_hypercall_insn;
+               svm_do_sched_yield(apic_id);
+       } else if (is_amd_cpu()) {
+               native_hypercall_insn = &svm_hypercall_insn;
+               hypercall_insn = &vmx_hypercall_insn;
+               vmx_do_sched_yield(apic_id);
+       } else {
+               GUEST_ASSERT(0);
+               /* unreachable */
+               return;
+       }
+
+       GUEST_ASSERT(!ud_expected);
+       assert_hypercall_insn(native_hypercall_insn, hypercall_insn);
+       GUEST_DONE();
+}
+
+static void setup_ud_vector(struct kvm_vm *vm)
+{
+       vm_init_descriptor_tables(vm);
+       vcpu_init_descriptor_tables(vm, VCPU_ID);
+       vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
+}
+
+static void enter_guest(struct kvm_vm *vm)
+{
+       struct kvm_run *run;
+       struct ucall uc;
+
+       run = vcpu_state(vm, VCPU_ID);
+
+       vcpu_run(vm, VCPU_ID);
+       switch (get_ucall(vm, VCPU_ID, &uc)) {
+       case UCALL_SYNC:
+               pr_info("%s: %016lx\n", (const char *)uc.args[2], uc.args[3]);
+               break;
+       case UCALL_DONE:
+               return;
+       case UCALL_ABORT:
+               TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]);
+       default:
+               TEST_FAIL("Unhandled ucall: %ld\nexit_reason: %u (%s)",
+                         uc.cmd, run->exit_reason, exit_reason_str(run->exit_reason));
+       }
+}
+
+static void test_fix_hypercall(void)
+{
+       struct kvm_vm *vm;
+
+       vm = vm_create_default(VCPU_ID, 0, guest_main);
+       setup_ud_vector(vm);
+
+       ud_expected = false;
+       sync_global_to_guest(vm, ud_expected);
+
+       virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+
+       enter_guest(vm);
+}
+
+static void test_fix_hypercall_disabled(void)
+{
+       struct kvm_enable_cap cap = {0};
+       struct kvm_vm *vm;
+
+       vm = vm_create_default(VCPU_ID, 0, guest_main);
+       setup_ud_vector(vm);
+
+       cap.cap = KVM_CAP_DISABLE_QUIRKS2;
+       cap.args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN;
+       vm_enable_cap(vm, &cap);
+
+       ud_expected = true;
+       sync_global_to_guest(vm, ud_expected);
+
+       virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+
+       enter_guest(vm);
+}
+
+int main(void)
+{
+       if (!(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
+               print_skip("KVM_X86_QUIRK_HYPERCALL_INSN not supported");
+               exit(KSFT_SKIP);
+       }
+
+       test_fix_hypercall();
+       test_fix_hypercall_disabled();
+}
diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c
new file mode 100644 (file)
index 0000000..f0083d8
--- /dev/null
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tsc_scaling_sync
+ *
+ * Copyright © 2021 Amazon.com, Inc. or its affiliates.
+ *
+ * Test that guest TSCs stay in sync across vCPUs under KVM_SET_TSC_KHZ scaling.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#include <stdint.h>
+#include <time.h>
+#include <sched.h>
+#include <signal.h>
+#include <pthread.h>
+
+#define NR_TEST_VCPUS 20
+
+static struct kvm_vm *vm;
+pthread_spinlock_t create_lock;
+
+#define TEST_TSC_KHZ    2345678UL
+#define TEST_TSC_OFFSET 200000000
+
+uint64_t tsc_sync;
+static void guest_code(void)
+{
+       uint64_t start_tsc, local_tsc, tmp;
+
+       start_tsc = rdtsc();
+       do {
+               tmp = READ_ONCE(tsc_sync);
+               local_tsc = rdtsc();
+               WRITE_ONCE(tsc_sync, local_tsc);
+               if (unlikely(local_tsc < tmp))
+                       GUEST_SYNC_ARGS(0, local_tsc, tmp, 0, 0);
+
+       } while (local_tsc - start_tsc < 5000 * TEST_TSC_KHZ);
+
+       GUEST_DONE();
+}
+
+
+static void *run_vcpu(void *_cpu_nr)
+{
+       unsigned long cpu = (unsigned long)_cpu_nr;
+       unsigned long failures = 0;
+       static bool first_cpu_done;
+
+       /* The kernel is fine, but vm_vcpu_add_default() needs locking */
+       pthread_spin_lock(&create_lock);
+
+       vm_vcpu_add_default(vm, cpu, guest_code);
+
+       if (!first_cpu_done) {
+               first_cpu_done = true;
+               vcpu_set_msr(vm, cpu, MSR_IA32_TSC, TEST_TSC_OFFSET);
+       }
+
+       pthread_spin_unlock(&create_lock);
+
+       for (;;) {
+               volatile struct kvm_run *run = vcpu_state(vm, cpu);
+               struct ucall uc;
+
+               vcpu_run(vm, cpu);
+               TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+                           "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+                           run->exit_reason,
+                           exit_reason_str(run->exit_reason));
+
+               switch (get_ucall(vm, cpu, &uc)) {
+               case UCALL_DONE:
+                       goto out;
+
+               case UCALL_SYNC:
+                       printf("Guest %ld sync %lx %lx %ld\n", cpu, uc.args[2], uc.args[3], uc.args[2] - uc.args[3]);
+                       failures++;
+                       break;
+
+               default:
+                       TEST_FAIL("Unknown ucall %lu", uc.cmd);
+               }
+       }
+ out:
+       return (void *)failures;
+}
+
+int main(int argc, char *argv[])
+{
+       if (!kvm_check_cap(KVM_CAP_VM_TSC_CONTROL)) {
+               print_skip("KVM_CAP_VM_TSC_CONTROL not available");
+               exit(KSFT_SKIP);
+       }
+
+       vm = vm_create_default_with_vcpus(0, DEFAULT_STACK_PGS * NR_TEST_VCPUS, 0, guest_code, NULL);
+       vm_ioctl(vm, KVM_SET_TSC_KHZ, (void *) TEST_TSC_KHZ);
+
+       pthread_spin_init(&create_lock, PTHREAD_PROCESS_PRIVATE);
+       pthread_t cpu_threads[NR_TEST_VCPUS];
+       unsigned long cpu;
+       for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++)
+               pthread_create(&cpu_threads[cpu], NULL, run_vcpu, (void *)cpu);
+
+       unsigned long failures = 0;
+       for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) {
+               void *this_cpu_failures;
+               pthread_join(cpu_threads[cpu], &this_cpu_failures);
+               failures += (unsigned long)this_cpu_failures;
+       }
+
+       TEST_ASSERT(!failures, "TSC sync failed");
+       pthread_spin_destroy(&create_lock);
+       kvm_vm_free(vm);
+       return 0;
+}
@@ -1,15 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * VMX-pmu related msrs test
+ * Test for VMX-pmu perf capability msr
  *
  * Copyright (C) 2021 Intel Corporation
  *
- * Test to check the effect of various CPUID settings
- * on the MSR_IA32_PERF_CAPABILITIES MSR, and check that
- * whatever we write with KVM_SET_MSR is _not_ modified
- * in the guest and test it can be retrieved with KVM_GET_MSR.
- *
- * Test to check that invalid LBR formats are rejected.
+ * Test to check the effect of various CPUID settings on the
+ * MSR_IA32_PERF_CAPABILITIES MSR, that whatever we write with
+ * KVM_SET_MSR is _not_ modified by the guest and can be read
+ * back with KVM_GET_MSR, and that invalid LBR formats are
+ * rejected.
  */
 
 #define _GNU_SOURCE /* for program_invocation_short_name */
@@ -107,8 +106,11 @@ int main(int argc, char *argv[])
        ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), (u64)host_cap.lbr_format);
 
        /* testcase 3, check invalid LBR format is rejected */
-       ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_LBR_FMT);
+       /* Note: on Arch LBR capable platforms, LBR_FMT in the perf capabilities
+        * MSR is 0x3f, so use a truly invalid format, 0x30, for this test. */
+       ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, 0x30);
        TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");
 
+       printf("Completed perf capability tests.\n");
        kvm_vm_free(vm);
 }
index bcd3708..7a51bb6 100644 (file)
 
 #define EVTCHN_VECTOR  0x10
 
+#define EVTCHN_TEST1 15
+#define EVTCHN_TEST2 66
+#define EVTCHN_TIMER 13
+
 static struct kvm_vm *vm;
 
 #define XEN_HYPERCALL_MSR      0x40000000
 
 #define MIN_STEAL_TIME         50000
 
+#define __HYPERVISOR_set_timer_op      15
+#define __HYPERVISOR_sched_op          29
+#define __HYPERVISOR_event_channel_op  32
+
+#define SCHEDOP_poll                   3
+
+#define EVTCHNOP_send                  4
+
+#define EVTCHNSTAT_interdomain         2
+
+struct evtchn_send {
+       u32 port;
+};
+
+struct sched_poll {
+       u32 *ports;
+       unsigned int nr_ports;
+       u64 timeout;
+};
+
 struct pvclock_vcpu_time_info {
        u32   version;
        u32   pad0;
@@ -106,15 +130,25 @@ struct {
        struct kvm_irq_routing_entry entries[2];
 } irq_routes;
 
+bool guest_saw_irq;
+
 static void evtchn_handler(struct ex_regs *regs)
 {
        struct vcpu_info *vi = (void *)VCPU_INFO_VADDR;
        vi->evtchn_upcall_pending = 0;
        vi->evtchn_pending_sel = 0;
+       guest_saw_irq = true;
 
        GUEST_SYNC(0x20);
 }
 
+static void guest_wait_for_irq(void)
+{
+       while (!guest_saw_irq)
+               __asm__ __volatile__ ("rep nop" : : : "memory");
+       guest_saw_irq = false;
+}
+
 static void guest_code(void)
 {
        struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
@@ -127,6 +161,8 @@ static void guest_code(void)
        /* Trigger an interrupt injection */
        GUEST_SYNC(0);
 
+       guest_wait_for_irq();
+
        /* Test having the host set runstates manually */
        GUEST_SYNC(RUNSTATE_runnable);
        GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0);
@@ -167,14 +203,132 @@ static void guest_code(void)
        /* Now deliver an *unmasked* interrupt */
        GUEST_SYNC(8);
 
-       while (!si->evtchn_pending[1])
-               __asm__ __volatile__ ("rep nop" : : : "memory");
+       guest_wait_for_irq();
 
        /* Change memslots and deliver an interrupt */
        GUEST_SYNC(9);
 
-       for (;;)
-               __asm__ __volatile__ ("rep nop" : : : "memory");
+       guest_wait_for_irq();
+
+       /* Deliver event channel with KVM_XEN_HVM_EVTCHN_SEND */
+       GUEST_SYNC(10);
+
+       guest_wait_for_irq();
+
+       GUEST_SYNC(11);
+
+       /* Our turn. Deliver event channel (to ourselves) with
+        * EVTCHNOP_send hypercall. */
+       unsigned long rax;
+       struct evtchn_send s = { .port = 127 };
+       __asm__ __volatile__ ("vmcall" :
+                             "=a" (rax) :
+                             "a" (__HYPERVISOR_event_channel_op),
+                             "D" (EVTCHNOP_send),
+                             "S" (&s));
+
+       GUEST_ASSERT(rax == 0);
+
+       guest_wait_for_irq();
+
+       GUEST_SYNC(12);
+
+       /* Deliver "outbound" event channel to an eventfd which
+        * happens to be one of our own irqfds. */
+       s.port = 197;
+       __asm__ __volatile__ ("vmcall" :
+                             "=a" (rax) :
+                             "a" (__HYPERVISOR_event_channel_op),
+                             "D" (EVTCHNOP_send),
+                             "S" (&s));
+
+       GUEST_ASSERT(rax == 0);
+
+       guest_wait_for_irq();
+
+       GUEST_SYNC(13);
+
+       /* Set a timer 100ms in the future. */
+       __asm__ __volatile__ ("vmcall" :
+                             "=a" (rax) :
+                             "a" (__HYPERVISOR_set_timer_op),
+                             "D" (rs->state_entry_time + 100000000));
+       GUEST_ASSERT(rax == 0);
+
+       GUEST_SYNC(14);
+
+       /* Now wait for the timer */
+       guest_wait_for_irq();
+
+       GUEST_SYNC(15);
+
+       /* The host has 'restored' the timer. Just wait for it. */
+       guest_wait_for_irq();
+
+       GUEST_SYNC(16);
+
+       /* Poll for an event channel port which is already set */
+       u32 ports[1] = { EVTCHN_TIMER };
+       struct sched_poll p = {
+               .ports = ports,
+               .nr_ports = 1,
+               .timeout = 0,
+       };
+
+       __asm__ __volatile__ ("vmcall" :
+                             "=a" (rax) :
+                             "a" (__HYPERVISOR_sched_op),
+                             "D" (SCHEDOP_poll),
+                             "S" (&p));
+
+       GUEST_ASSERT(rax == 0);
+
+       GUEST_SYNC(17);
+
+       /* Poll for an unset port and wait for the timeout. */
+       p.timeout = 100000000;
+       __asm__ __volatile__ ("vmcall" :
+                             "=a" (rax) :
+                             "a" (__HYPERVISOR_sched_op),
+                             "D" (SCHEDOP_poll),
+                             "S" (&p));
+
+       GUEST_ASSERT(rax == 0);
+
+       GUEST_SYNC(18);
+
+       /* A timer will wake the masked port we're waiting on, while we poll */
+       p.timeout = 0;
+       __asm__ __volatile__ ("vmcall" :
+                             "=a" (rax) :
+                             "a" (__HYPERVISOR_sched_op),
+                             "D" (SCHEDOP_poll),
+                             "S" (&p));
+
+       GUEST_ASSERT(rax == 0);
+
+       GUEST_SYNC(19);
+
+       /* A timer will wake an *unmasked* port which should wake us with an
+        * actual interrupt, while we're polling on a different port. */
+       ports[0]++;
+       p.timeout = 0;
+       __asm__ __volatile__ ("vmcall" :
+                             "=a" (rax) :
+                             "a" (__HYPERVISOR_sched_op),
+                             "D" (SCHEDOP_poll),
+                             "S" (&p));
+
+       GUEST_ASSERT(rax == 0);
+
+       guest_wait_for_irq();
+
+       GUEST_SYNC(20);
+
+       /* Timer should have fired already */
+       guest_wait_for_irq();
+
+       GUEST_SYNC(21);
 }
 
 static int cmp_timespec(struct timespec *a, struct timespec *b)
@@ -190,9 +344,13 @@ static int cmp_timespec(struct timespec *a, struct timespec *b)
        else
                return 0;
 }
+struct vcpu_info *vinfo;
 
 static void handle_alrm(int sig)
 {
+       if (vinfo)
+               printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending);
+       vcpu_dump(stdout, vm, VCPU_ID, 0);
        TEST_FAIL("IRQ delivery timed out");
 }
 
@@ -212,6 +370,7 @@ int main(int argc, char *argv[])
 
        bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
        bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
+       bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND);
 
        clock_gettime(CLOCK_REALTIME, &min_ts);
 
@@ -232,6 +391,12 @@ int main(int argc, char *argv[])
                .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
                .msr = XEN_HYPERCALL_MSR,
        };
+
+       /* Let the kernel know that we *will* use it for sending all
+        * event channels, which lets it intercept SCHEDOP_poll */
+       if (do_evtchn_tests)
+               hvmc.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+
        vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
 
        struct kvm_xen_hvm_attr lm = {
@@ -294,7 +459,7 @@ int main(int argc, char *argv[])
 
                /* Unexpected, but not a KVM failure */
                if (irq_fd[0] == -1 || irq_fd[1] == -1)
-                       do_eventfd_tests = false;
+                       do_evtchn_tests = do_eventfd_tests = false;
        }
 
        if (do_eventfd_tests) {
@@ -302,13 +467,13 @@ int main(int argc, char *argv[])
 
                irq_routes.entries[0].gsi = 32;
                irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
-               irq_routes.entries[0].u.xen_evtchn.port = 15;
+               irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1;
                irq_routes.entries[0].u.xen_evtchn.vcpu = VCPU_ID;
                irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
 
                irq_routes.entries[1].gsi = 33;
                irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
-               irq_routes.entries[1].u.xen_evtchn.port = 66;
+               irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2;
                irq_routes.entries[1].u.xen_evtchn.vcpu = VCPU_ID;
                irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
 
@@ -329,7 +494,39 @@ int main(int argc, char *argv[])
                sigaction(SIGALRM, &sa, NULL);
        }
 
-       struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
+       struct kvm_xen_vcpu_attr tmr = {
+               .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
+               .u.timer.port = EVTCHN_TIMER,
+               .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
+               .u.timer.expires_ns = 0
+       };
+
+       if (do_evtchn_tests) {
+               struct kvm_xen_hvm_attr inj = {
+                       .type = KVM_XEN_ATTR_TYPE_EVTCHN,
+                       .u.evtchn.send_port = 127,
+                       .u.evtchn.type = EVTCHNSTAT_interdomain,
+                       .u.evtchn.flags = 0,
+                       .u.evtchn.deliver.port.port = EVTCHN_TEST1,
+                       .u.evtchn.deliver.port.vcpu = VCPU_ID + 1,
+                       .u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
+               };
+               vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
+
+               /* Test migration to a different vCPU */
+               inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE;
+               inj.u.evtchn.deliver.port.vcpu = VCPU_ID;
+               vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
+
+               inj.u.evtchn.send_port = 197;
+               inj.u.evtchn.deliver.eventfd.port = 0;
+               inj.u.evtchn.deliver.eventfd.fd = irq_fd[1];
+               inj.u.evtchn.flags = 0;
+               vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
+
+               vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
+       }
+       vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
        vinfo->evtchn_upcall_pending = 0;
 
        struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
@@ -422,7 +619,7 @@ int main(int argc, char *argv[])
                                        goto done;
                                if (verbose)
                                        printf("Testing masked event channel\n");
-                               shinfo->evtchn_mask[0] = 0x8000;
+                               shinfo->evtchn_mask[0] = 1UL << EVTCHN_TEST1;
                                eventfd_write(irq_fd[0], 1UL);
                                alarm(1);
                                break;
@@ -439,6 +636,9 @@ int main(int argc, char *argv[])
                                break;
 
                        case 9:
+                               TEST_ASSERT(!evtchn_irq_expected,
+                                           "Expected event channel IRQ but it didn't happen");
+                               shinfo->evtchn_pending[1] = 0;
                                if (verbose)
                                        printf("Testing event channel after memslot change\n");
                                vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
@@ -448,12 +648,153 @@ int main(int argc, char *argv[])
                                alarm(1);
                                break;
 
+                       case 10:
+                               TEST_ASSERT(!evtchn_irq_expected,
+                                           "Expected event channel IRQ but it didn't happen");
+                               if (!do_evtchn_tests)
+                                       goto done;
+
+                               shinfo->evtchn_pending[0] = 0;
+                               if (verbose)
+                                       printf("Testing injection with KVM_XEN_HVM_EVTCHN_SEND\n");
+
+                               struct kvm_irq_routing_xen_evtchn e;
+                               e.port = EVTCHN_TEST2;
+                               e.vcpu = VCPU_ID;
+                               e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+                               vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e);
+                               evtchn_irq_expected = true;
+                               alarm(1);
+                               break;
+
+                       case 11:
+                               TEST_ASSERT(!evtchn_irq_expected,
+                                           "Expected event channel IRQ but it didn't happen");
+                               shinfo->evtchn_pending[1] = 0;
+
+                               if (verbose)
+                                       printf("Testing guest EVTCHNOP_send direct to evtchn\n");
+                               evtchn_irq_expected = true;
+                               alarm(1);
+                               break;
+
+                       case 12:
+                               TEST_ASSERT(!evtchn_irq_expected,
+                                           "Expected event channel IRQ but it didn't happen");
+                               shinfo->evtchn_pending[0] = 0;
+
+                               if (verbose)
+                                       printf("Testing guest EVTCHNOP_send to eventfd\n");
+                               evtchn_irq_expected = true;
+                               alarm(1);
+                               break;
+
+                       case 13:
+                               TEST_ASSERT(!evtchn_irq_expected,
+                                           "Expected event channel IRQ but it didn't happen");
+                               shinfo->evtchn_pending[1] = 0;
+
+                               if (verbose)
+                                       printf("Testing guest oneshot timer\n");
+                               break;
+
+                       case 14:
+                               memset(&tmr, 0, sizeof(tmr));
+                               tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER;
+                               vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr);
+                               TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER,
+                                           "Timer port not returned");
+                               TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
+                                           "Timer priority not returned");
+                               TEST_ASSERT(tmr.u.timer.expires_ns > rs->state_entry_time,
+                                           "Timer expiry not returned");
+                               evtchn_irq_expected = true;
+                               alarm(1);
+                               break;
+
+                       case 15:
+                               TEST_ASSERT(!evtchn_irq_expected,
+                                           "Expected event channel IRQ but it didn't happen");
+                               shinfo->evtchn_pending[0] = 0;
+
+                               if (verbose)
+                                       printf("Testing restored oneshot timer\n");
+
+                               tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
+                               vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
+                               evtchn_irq_expected = true;
+                               alarm(1);
+                               break;
+
+                       case 16:
+                               TEST_ASSERT(!evtchn_irq_expected,
+                                           "Expected event channel IRQ but it didn't happen");
+
+                               if (verbose)
+                                       printf("Testing SCHEDOP_poll with already pending event\n");
+                               shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 1UL << EVTCHN_TIMER;
+                               alarm(1);
+                               break;
+
+                       case 17:
+                               if (verbose)
+                                       printf("Testing SCHEDOP_poll timeout\n");
+                               shinfo->evtchn_pending[0] = 0;
+                               alarm(1);
+                               break;
+
+                       case 18:
+                               if (verbose)
+                                       printf("Testing SCHEDOP_poll wake on masked event\n");
+
+                               tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
+                               vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
+                               alarm(1);
+                               break;
+
+                       case 19:
+                               shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 0;
+                               if (verbose)
+                                       printf("Testing SCHEDOP_poll wake on unmasked event\n");
+
+                               evtchn_irq_expected = true;
+                               tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
+                               vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
+
+                               /* Read it back and check the pending time is reported correctly */
+                               tmr.u.timer.expires_ns = 0;
+                               vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr);
+                               TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000,
+                                           "Timer not reported pending");
+                               alarm(1);
+                               break;
+
+                       case 20:
+                               TEST_ASSERT(!evtchn_irq_expected,
+                                           "Expected event channel IRQ but it didn't happen");
+                               /* Read timer and check it is no longer pending */
+                               vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr);
+                               TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending");
+
+                               shinfo->evtchn_pending[0] = 0;
+                               if (verbose)
+                                       printf("Testing timer in the past\n");
+
+                               evtchn_irq_expected = true;
+                               tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL;
+                               vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
+                               alarm(1);
+                               break;
+
+                       case 21:
+                               TEST_ASSERT(!evtchn_irq_expected,
+                                           "Expected event channel IRQ but it didn't happen");
+                               goto done;
+
                        case 0x20:
                                TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ");
                                evtchn_irq_expected = false;
-                               if (shinfo->evtchn_pending[1] &&
-                                   shinfo->evtchn_pending[0])
-                                       goto done;
                                break;
                        }
                        break;
@@ -466,6 +807,7 @@ int main(int argc, char *argv[])
        }
 
  done:
+       alarm(0);
        clock_gettime(CLOCK_REALTIME, &max_ts);
 
        /*
index 5ab1221..64ec222 100644 (file)
@@ -1092,6 +1092,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
        spin_lock_init(&kvm->gpc_lock);
 
        INIT_LIST_HEAD(&kvm->devices);
+       kvm->max_vcpus = KVM_MAX_VCPUS;
 
        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
@@ -3753,7 +3754,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
                return -EINVAL;
 
        mutex_lock(&kvm->lock);
-       if (kvm->created_vcpus == KVM_MAX_VCPUS) {
+       if (kvm->created_vcpus >= kvm->max_vcpus) {
                mutex_unlock(&kvm->lock);
                return -EINVAL;
        }
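
The kvm_main.c hunks above replace the compile-time KVM_MAX_VCPUS check with a per-VM kvm->max_vcpus field: the field is seeded with the global maximum at VM creation, and vCPU creation is rejected once the per-VM cap is reached, which lets other code lower the cap for an individual VM. The following is a minimal, self-contained sketch of that pattern only; the struct and function names here are hypothetical stand-ins, not the kernel's, and the program is userspace C purely for illustration.

	/* Illustrative sketch of the per-VM vCPU cap pattern; all names are hypothetical. */
	#include <stdio.h>

	#define MAX_VCPUS_DEFAULT 1024		/* stand-in for a global KVM_MAX_VCPUS-style limit */

	struct toy_vm {
		unsigned int created_vcpus;
		unsigned int max_vcpus;		/* per-VM cap, defaults to the global maximum */
	};

	static void toy_vm_init(struct toy_vm *vm)
	{
		vm->created_vcpus = 0;
		/* Like the kvm_create_vm() hunk: start from the global maximum. */
		vm->max_vcpus = MAX_VCPUS_DEFAULT;
	}

	static int toy_vm_create_vcpu(struct toy_vm *vm)
	{
		/* Like the kvm_vm_ioctl_create_vcpu() hunk: reject once the per-VM cap is hit. */
		if (vm->created_vcpus >= vm->max_vcpus)
			return -1;
		vm->created_vcpus++;
		return 0;
	}

	int main(void)
	{
		struct toy_vm vm;
		int i;

		toy_vm_init(&vm);
		vm.max_vcpus = 2;		/* some per-VM code path may lower the cap */

		for (i = 0; i < 3; i++)
			printf("create vcpu %d -> %s\n", i,
			       toy_vm_create_vcpu(&vm) ? "rejected" : "ok");
		return 0;
	}

Run as a plain userspace program, the third creation attempt is rejected, mirroring how the new check caps a VM at its own max_vcpus rather than at the build-time constant.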