Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 22 Jan 2022 07:40:01 +0000 (09:40 +0200)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 22 Jan 2022 07:40:01 +0000 (09:40 +0200)
Pull more kvm updates from Paolo Bonzini:
 "Generic:

   - selftest compilation fix for non-x86

   - KVM: avoid warning on s390 in mark_page_dirty

 x86:

   - fix page write-protection bug and improve comments

   - use binary search to look up the PMU event filter, add test

   - enable_pmu module parameter support for Intel CPUs

   - switch blocked_vcpu_on_cpu_lock to raw spinlock

   - cleanups of blocked vCPU logic

   - partially allow KVM_SET_CPUID{,2} after KVM_RUN (5.16 regression)

   - various small fixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (46 commits)
  docs: kvm: fix WARNINGs from api.rst
  selftests: kvm/x86: Fix the warning in lib/x86_64/processor.c
  selftests: kvm/x86: Fix the warning in pmu_event_filter_test.c
  kvm: selftests: Do not indent with spaces
  kvm: selftests: sync uapi/linux/kvm.h with Linux header
  selftests: kvm: add amx_test to .gitignore
  KVM: SVM: Nullify vcpu_(un)blocking() hooks if AVIC is disabled
  KVM: SVM: Move svm_hardware_setup() and its helpers below svm_x86_ops
  KVM: SVM: Drop AVIC's intermediate avic_set_running() helper
  KVM: VMX: Don't do full kick when handling posted interrupt wakeup
  KVM: VMX: Fold fallback path into triggering posted IRQ helper
  KVM: VMX: Pass desired vector instead of bool for triggering posted IRQ
  KVM: VMX: Don't do full kick when triggering posted interrupt "fails"
  KVM: SVM: Skip AVIC and IRTE updates when loading blocking vCPU
  KVM: SVM: Use kvm_vcpu_is_blocking() in AVIC load to handle preemption
  KVM: SVM: Remove unnecessary APICv/AVIC update in vCPU unblocking path
  KVM: SVM: Don't bother checking for "running" AVIC when kicking for IPIs
  KVM: SVM: Signal AVIC doorbell iff vCPU is in guest mode
  KVM: x86: Remove defunct pre_block/post_block kvm_x86_ops hooks
  KVM: x86: Unexport LAPIC's switch_to_{hv,sw}_timer() helpers
  ...

36 files changed:
Documentation/virt/kvm/api.rst
arch/x86/include/asm/kvm-x86-ops.h
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/cpuid.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/spte.c
arch/x86/kvm/mmu/spte.h
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/pmu.c
arch/x86/kvm/svm/avic.c
arch/x86/kvm/svm/pmu.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/vmx/capabilities.h
arch/x86/kvm/vmx/pmu_intel.c
arch/x86/kvm/vmx/posted_intr.c
arch/x86/kvm/vmx/posted_intr.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
include/linux/kvm_host.h
tools/include/uapi/linux/kvm.h
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/include/x86_64/processor.h
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/lib/x86_64/processor.c
tools/testing/selftests/kvm/x86_64/cpuid_test.c [moved from tools/testing/selftests/kvm/x86_64/get_cpuid_test.c with 83% similarity]
tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
virt/kvm/kvm_main.c

Documentation/virt/kvm/api.rst
index d3791a1..bb8cfdd 100644
@@ -5545,8 +5545,8 @@ the trailing ``'\0'``, is indicated by ``name_size`` in the header.
 The Stats Data block contains an array of 64-bit values in the same order
 as the descriptors in Descriptors block.
 
-4.42 KVM_GET_XSAVE2
-------------------
+4.134 KVM_GET_XSAVE2
+--------------------
 
 :Capability: KVM_CAP_XSAVE2
 :Architectures: x86
@@ -7363,7 +7363,7 @@ trap and emulate MSRs that are outside of the scope of KVM as well as
 limit the attack surface on KVM's MSR emulation code.
 
 8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID
------------------------------
+-------------------------------------
 
 Architectures: x86
 
arch/x86/include/asm/kvm-x86-ops.h
index f658bb4..631d504 100644
@@ -55,6 +55,7 @@ KVM_X86_OP_NULL(tlb_remote_flush)
 KVM_X86_OP_NULL(tlb_remote_flush_with_range)
 KVM_X86_OP(tlb_flush_gva)
 KVM_X86_OP(tlb_flush_guest)
+KVM_X86_OP(vcpu_pre_run)
 KVM_X86_OP(run)
 KVM_X86_OP_NULL(handle_exit)
 KVM_X86_OP_NULL(skip_emulated_instruction)
@@ -98,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff)
 KVM_X86_OP_NULL(request_immediate_exit)
 KVM_X86_OP(sched_in)
 KVM_X86_OP_NULL(update_cpu_dirty_logging)
-KVM_X86_OP_NULL(pre_block)
-KVM_X86_OP_NULL(post_block)
 KVM_X86_OP_NULL(vcpu_blocking)
 KVM_X86_OP_NULL(vcpu_unblocking)
 KVM_X86_OP_NULL(update_pi_irte)
arch/x86/include/asm/kvm_host.h
index 0677b9e..1384517 100644
@@ -1381,6 +1381,7 @@ struct kvm_x86_ops {
         */
        void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
 
+       int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
        enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
        int (*handle_exit)(struct kvm_vcpu *vcpu,
                enum exit_fastpath_completion exit_fastpath);
@@ -1454,18 +1455,6 @@ struct kvm_x86_ops {
        const struct kvm_pmu_ops *pmu_ops;
        const struct kvm_x86_nested_ops *nested_ops;
 
-       /*
-        * Architecture specific hooks for vCPU blocking due to
-        * HLT instruction.
-        * Returns for .pre_block():
-        *    - 0 means continue to block the vCPU.
-        *    - 1 means we cannot block the vCPU since some event
-        *        happens during this period, such as, 'ON' bit in
-        *        posted-interrupts descriptor is set.
-        */
-       int (*pre_block)(struct kvm_vcpu *vcpu);
-       void (*post_block)(struct kvm_vcpu *vcpu);
-
        void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
        void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
 
arch/x86/kvm/cpuid.c
index c55e57b..3902c28 100644
@@ -119,6 +119,28 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
        return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
 }
 
+/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
+static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+                                int nent)
+{
+       struct kvm_cpuid_entry2 *orig;
+       int i;
+
+       if (nent != vcpu->arch.cpuid_nent)
+               return -EINVAL;
+
+       for (i = 0; i < nent; i++) {
+               orig = &vcpu->arch.cpuid_entries[i];
+               if (e2[i].function != orig->function ||
+                   e2[i].index != orig->index ||
+                   e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
+                   e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
+                       return -EINVAL;
+       }
+
+       return 0;
+}
+
 static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
 {
        u32 function;
@@ -145,14 +167,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
        }
 }
 
-static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
+                                             struct kvm_cpuid_entry2 *entries, int nent)
 {
        u32 base = vcpu->arch.kvm_cpuid_base;
 
        if (!base)
                return NULL;
 
-       return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0);
+       return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0);
+}
+
+static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+{
+       return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
+                                            vcpu->arch.cpuid_nent);
 }
 
 void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
@@ -167,11 +196,12 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
                vcpu->arch.pv_cpuid.features = best->eax;
 }
 
-void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
+                                      int nent)
 {
        struct kvm_cpuid_entry2 *best;
 
-       best = kvm_find_cpuid_entry(vcpu, 1, 0);
+       best = cpuid_entry2_find(entries, nent, 1, 0);
        if (best) {
                /* Update OSXSAVE bit */
                if (boot_cpu_has(X86_FEATURE_XSAVE))
@@ -182,33 +212,38 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
                           vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
        }
 
-       best = kvm_find_cpuid_entry(vcpu, 7, 0);
+       best = cpuid_entry2_find(entries, nent, 7, 0);
        if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
                cpuid_entry_change(best, X86_FEATURE_OSPKE,
                                   kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
 
-       best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
+       best = cpuid_entry2_find(entries, nent, 0xD, 0);
        if (best)
                best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
 
-       best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
+       best = cpuid_entry2_find(entries, nent, 0xD, 1);
        if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
                     cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
                best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
-       best = kvm_find_kvm_cpuid_features(vcpu);
+       best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
        if (kvm_hlt_in_guest(vcpu->kvm) && best &&
                (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
                best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
 
        if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
-               best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+               best = cpuid_entry2_find(entries, nent, 0x1, 0);
                if (best)
                        cpuid_entry_change(best, X86_FEATURE_MWAIT,
                                           vcpu->arch.ia32_misc_enable_msr &
                                           MSR_IA32_MISC_ENABLE_MWAIT);
        }
 }
+
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+{
+       __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
+}
 EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
 
 static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
@@ -298,6 +333,22 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
 {
        int r;
 
+       __kvm_update_cpuid_runtime(vcpu, e2, nent);
+
+       /*
+        * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+        * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+        * tracked in kvm_mmu_page_role.  As a result, KVM may miss guest page
+        * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
+        * the core vCPU model on the fly. It would've been better to forbid any
+        * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
+        * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
+        * KVM_SET_CPUID{,2} again. To support this legacy behavior, check
+        * whether the supplied CPUID data is equal to what's already set.
+        */
+       if (vcpu->arch.last_vmentry_cpu != -1)
+               return kvm_cpuid_check_equal(vcpu, e2, nent);
+
        r = kvm_check_cpuid(vcpu, e2, nent);
        if (r)
                return r;
@@ -307,7 +358,6 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
        vcpu->arch.cpuid_nent = nent;
 
        kvm_update_kvm_cpuid_base(vcpu);
-       kvm_update_cpuid_runtime(vcpu);
        kvm_vcpu_after_set_cpuid(vcpu);
 
        return 0;
@@ -795,10 +845,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                perf_get_x86_pmu_capability(&cap);
 
                /*
-                * Only support guest architectural pmu on a host
-                * with architectural pmu.
+                * The guest architectural PMU is only supported if the host has
+                * an architectural PMU and the module parameter allows it.
                 */
-               if (!cap.version)
+               if (!cap.version || !enable_pmu)
                        memset(&cap, 0, sizeof(cap));
 
                eax.split.version_id = min(cap.version, 2);
@@ -886,6 +936,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                                --array->nent;
                                continue;
                        }
+
+                       if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+                               entry->ecx &= ~BIT_ULL(2);
                        entry->edx = 0;
                }
                break;
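
Viewed from userspace, the kvm_cpuid_check_equal() path above means a VMM that
reuses a vCPU fd may repeat KVM_SET_CPUID{,2} after KVM_RUN only with
byte-identical entries; anything else now fails with -EINVAL. A minimal sketch
of that legacy pattern (illustrative only, not code from this series; the
/dev/kvm, VM and vCPU fd setup is assumed to exist elsewhere):

    #include <err.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /*
     * Re-issue KVM_SET_CPUID2 with the entries that were set before KVM_RUN,
     * e.g. across a vCPU "unplug"/"replug" cycle that keeps the same fd.
     * Identical data is accepted; any changed field is rejected with -EINVAL.
     */
    static void reset_cpuid(int vcpu_fd, const struct kvm_cpuid_entry2 *entries,
                            int nent)
    {
            struct kvm_cpuid2 *cpuid;

            cpuid = calloc(1, sizeof(*cpuid) + nent * sizeof(entries[0]));
            if (!cpuid)
                    err(1, "calloc");
            cpuid->nent = nent;
            memcpy(cpuid->entries, entries, nent * sizeof(entries[0]));

            if (ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid) < 0)
                    err(1, "KVM_SET_CPUID2");
            free(cpuid);
    }
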
arch/x86/kvm/lapic.c
index c5028e6..baca9fa 100644
@@ -1950,7 +1950,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
 {
        restart_apic_timer(vcpu->arch.apic);
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
 
 void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
 {
@@ -1962,7 +1961,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
                start_sw_timer(apic);
        preempt_enable();
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
 
 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
 {
arch/x86/kvm/mmu/mmu.c
index 1d275e9..593093b 100644
@@ -5756,6 +5756,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
                                continue;
 
                        flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+
                                                        PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
                                                        start, end - 1, true, flush);
                }
@@ -5825,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
        }
 
        /*
-        * We can flush all the TLBs out of the mmu lock without TLB
-        * corruption since we just change the spte from writable to
-        * readonly so that we only need to care the case of changing
-        * spte from present to present (changing the spte from present
-        * to nonpresent will flush all the TLBs immediately), in other
-        * words, the only case we care is mmu_spte_update() where we
-        * have checked Host-writable | MMU-writable instead of
-        * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
-        * anymore.
+        * Flush TLBs if any SPTEs had to be write-protected to ensure that
+        * guest writes are reflected in the dirty bitmap before the memslot
+        * update completes, i.e. before enabling dirty logging is visible to
+        * userspace.
+        *
+        * Perform the TLB flush outside the mmu_lock to reduce the amount of
+        * time the lock is held. However, this does mean that another CPU can
+        * now grab mmu_lock and encounter a write-protected SPTE while CPUs
+        * still have a writable mapping for the associated GFN in their TLB.
+        *
+        * This is safe but requires KVM to be careful when making decisions
+        * based on the write-protection status of an SPTE. Specifically, KVM
+        * also write-protects SPTEs to monitor changes to guest page tables
+        * during shadow paging, and must guarantee no CPUs can write to those
+        * pages before the lock is dropped. As mentioned in the previous
+        * paragraph, a write-protected SPTE is no guarantee that CPUs cannot
+        * perform writes. So to determine if a TLB flush is truly required, KVM
+        * will clear a separate software-only bit (MMU-writable) and skip the
+        * flush if-and-only-if this bit was already clear.
+        *
+        * See DEFAULT_SPTE_MMU_WRITEABLE for more details.
         */
        if (flush)
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
arch/x86/kvm/mmu/spte.c
index 351b04a..73cfe62 100644
@@ -216,6 +216,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
 
        new_spte &= ~PT_WRITABLE_MASK;
        new_spte &= ~shadow_host_writable_mask;
+       new_spte &= ~shadow_mmu_writable_mask;
 
        new_spte = mark_spte_for_access_track(new_spte);
 
arch/x86/kvm/mmu/spte.h
index a4af2a4..be6a007 100644
@@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
-/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
-#define DEFAULT_SPTE_HOST_WRITEABLE    BIT_ULL(9)
-#define DEFAULT_SPTE_MMU_WRITEABLE     BIT_ULL(10)
-
 /*
  * The mask/shift to use for saving the original R/X bits when marking the PTE
  * as not-present for access tracking purposes. We do not save the W bit as the
@@ -79,6 +75,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
 static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
 
 /*
+ * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
+ * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
+ * that map guest pages in read-only memslots and read-only VMAs.
+ *
+ * Invariants:
+ *  - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
+ *
+ *
+ * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
+ * allows writes to the guest page mapped by the SPTE. This bit is cleared when
+ * the guest page mapped by the SPTE contains a page table that is being
+ * monitored for shadow paging. In this case the SPTE can only be made writable
+ * by unsyncing the shadow page under the mmu_lock.
+ *
+ * Invariants:
+ *  - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
+ *  - If MMU-writable is set, Host-writable must be set.
+ *
+ * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
+ * to track writes for dirty logging. For such SPTEs, KVM will locklessly set
+ * PT_WRITABLE_MASK upon the next write from the guest and record the write in
+ * the dirty log (see fast_page_fault()).
+ */
+
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITEABLE    BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITEABLE     BIT_ULL(10)
+
+/*
  * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
  * to not overlap the A/D type mask or the saved access bits of access-tracked
  * SPTEs when A/D bits are disabled.
@@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
 
 static inline bool spte_can_locklessly_be_made_writable(u64 spte)
 {
-       return (spte & shadow_host_writable_mask) &&
-              (spte & shadow_mmu_writable_mask);
+       if (spte & shadow_mmu_writable_mask) {
+               WARN_ON_ONCE(!(spte & shadow_host_writable_mask));
+               return true;
+       }
+
+       WARN_ON_ONCE(spte & PT_WRITABLE_MASK);
+       return false;
 }
 
 static inline u64 get_mmio_spte_generation(u64 spte)
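
The invariants documented in the new comment block reduce to a chain of
implications: Writable => MMU-writable => Host-writable. A standalone
restatement for reference (illustrative only, not kernel code; the bit numbers
follow the DEFAULT_SPTE_* defines above and the non-EPT W bit):

    #include <assert.h>
    #include <stdint.h>

    #define SPTE_WRITABLE       (1ULL << 1)   /* PT_WRITABLE_MASK (non-EPT) */
    #define SPTE_HOST_WRITABLE  (1ULL << 9)   /* DEFAULT_SPTE_HOST_WRITEABLE */
    #define SPTE_MMU_WRITABLE   (1ULL << 10)  /* DEFAULT_SPTE_MMU_WRITEABLE */

    /* Writable implies MMU-writable, which in turn implies Host-writable. */
    static void check_writable_invariants(uint64_t spte)
    {
            if (spte & SPTE_WRITABLE)
                    assert(spte & SPTE_MMU_WRITABLE);
            if (spte & SPTE_MMU_WRITABLE)
                    assert(spte & SPTE_HOST_WRITABLE);
    }

This is the same relationship that spte_can_locklessly_be_made_writable() now
enforces with WARN_ON_ONCE() in the hunk above.
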
arch/x86/kvm/mmu/tdp_mmu.c
index 7b1bc81..bc9e355 100644
@@ -1442,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;
 
-               if (!is_writable_pte(iter.old_spte))
-                       break;
-
                new_spte = iter.old_spte &
                        ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
 
+               if (new_spte == iter.old_spte)
+                       break;
+
                tdp_mmu_set_spte(kvm, &iter, new_spte);
                spte_set = true;
        }
arch/x86/kvm/pmu.c
index 261b39c..f614f95 100644
@@ -13,6 +13,8 @@
 #include <linux/types.h>
 #include <linux/kvm_host.h>
 #include <linux/perf_event.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
 #include <asm/perf_event.h>
 #include "x86.h"
 #include "cpuid.h"
@@ -109,6 +111,9 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
                .config = config,
        };
 
+       if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
+               return;
+
        attr.sample_period = get_sample_period(pmc, pmc->counter);
 
        if (in_tx)
@@ -169,12 +174,16 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
        return true;
 }
 
+static int cmp_u64(const void *a, const void *b)
+{
+       return *(__u64 *)a - *(__u64 *)b;
+}
+
 void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 {
        unsigned config, type = PERF_TYPE_RAW;
        struct kvm *kvm = pmc->vcpu->kvm;
        struct kvm_pmu_event_filter *filter;
-       int i;
        bool allow_event = true;
 
        if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@@ -189,16 +198,13 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 
        filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
        if (filter) {
-               for (i = 0; i < filter->nevents; i++)
-                       if (filter->events[i] ==
-                           (eventsel & AMD64_RAW_EVENT_MASK_NB))
-                               break;
-               if (filter->action == KVM_PMU_EVENT_ALLOW &&
-                   i == filter->nevents)
-                       allow_event = false;
-               if (filter->action == KVM_PMU_EVENT_DENY &&
-                   i < filter->nevents)
-                       allow_event = false;
+               __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
+
+               if (bsearch(&key, filter->events, filter->nevents,
+                           sizeof(__u64), cmp_u64))
+                       allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
+               else
+                       allow_event = filter->action == KVM_PMU_EVENT_DENY;
        }
        if (!allow_event)
                return;
@@ -573,6 +579,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
        /* Ensure nevents can't be changed between the user copies. */
        *filter = tmp;
 
+       /*
+        * Sort the in-kernel list so that we can search it with bsearch.
+        */
+       sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
+
        mutex_lock(&kvm->lock);
        filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
                                     mutex_is_locked(&kvm->lock));
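
The scheme above sorts the filter once at KVM_SET_PMU_EVENT_FILTER time and
turns the per-reprogram lookup into an O(log n) binary search. The same idea as
a self-contained userspace sketch, using the C library's qsort()/bsearch()
rather than the kernel's sort()/bsearch() helpers (the event values are
arbitrary examples, not real filter contents):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Return -1/0/1 so the comparison is well-defined over the full u64 range. */
    static int cmp_u64(const void *a, const void *b)
    {
            uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;

            return (x > y) - (x < y);
    }

    int main(void)
    {
            uint64_t events[] = { 0xc4, 0x3c, 0x2e4f, 0xc0 };
            size_t nevents = sizeof(events) / sizeof(events[0]);
            uint64_t key = 0xc0;    /* eventsel & AMD64_RAW_EVENT_MASK_NB */

            /* Sort once, when the filter is installed... */
            qsort(events, nevents, sizeof(events[0]), cmp_u64);

            /* ...then each reprogram does a single binary search. */
            printf("match: %d\n",
                   bsearch(&key, events, nevents, sizeof(events[0]),
                           cmp_u64) != NULL);
            return 0;
    }
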
arch/x86/kvm/svm/avic.c
index 0e5b492..90364d0 100644
@@ -295,13 +295,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
        struct kvm_vcpu *vcpu;
        unsigned long i;
 
+       /*
+        * Wake any target vCPUs that are blocking, i.e. waiting for a wake
+        * event.  There's no need to signal doorbells, as hardware has handled
+        * vCPUs that were in guest at the time of the IPI, and vCPUs that have
+        * since entered the guest will have processed pending IRQs at VMRUN.
+        */
        kvm_for_each_vcpu(i, vcpu, kvm) {
-               bool m = kvm_apic_match_dest(vcpu, source,
-                                            icrl & APIC_SHORT_MASK,
-                                            GET_APIC_DEST_FIELD(icrh),
-                                            icrl & APIC_DEST_MASK);
-
-               if (m && !avic_vcpu_is_running(vcpu))
+               if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
+                                       GET_APIC_DEST_FIELD(icrh),
+                                       icrl & APIC_DEST_MASK))
                        kvm_vcpu_wake_up(vcpu);
        }
 }
@@ -672,9 +675,22 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
                return -1;
 
        kvm_lapic_set_irr(vec, vcpu->arch.apic);
+
+       /*
+        * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
+        * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
+        * the read of guest_mode, which guarantees that either VMRUN will see
+        * and process the new vIRR entry, or that the below code will signal
+        * the doorbell if the vCPU is already running in the guest.
+        */
        smp_mb__after_atomic();
 
-       if (avic_vcpu_is_running(vcpu)) {
+       /*
+        * Signal the doorbell to tell hardware to inject the IRQ if the vCPU
+        * is in the guest.  If the vCPU is not in the guest, hardware will
+        * automatically process AVIC interrupts at VMRUN.
+        */
+       if (vcpu->mode == IN_GUEST_MODE) {
                int cpu = READ_ONCE(vcpu->cpu);
 
                /*
@@ -688,8 +704,13 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
                if (cpu != get_cpu())
                        wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
                put_cpu();
-       } else
+       } else {
+               /*
+                * Wake the vCPU if it was blocking.  KVM will then detect the
+                * pending IRQ when checking if the vCPU has a wake event.
+                */
                kvm_vcpu_wake_up(vcpu);
+       }
 
        return 0;
 }
@@ -957,6 +978,8 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        int h_physical_id = kvm_cpu_get_apicid(cpu);
        struct vcpu_svm *svm = to_svm(vcpu);
 
+       lockdep_assert_preemption_disabled();
+
        /*
         * Since the host physical APIC id is 8 bits,
         * we can support host APIC ID upto 255.
@@ -964,19 +987,25 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
                return;
 
+       /*
+        * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+        * is being scheduled in after being preempted.  The CPU entries in the
+        * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+        * If the vCPU was migrated, its new CPU value will be stuffed when the
+        * vCPU unblocks.
+        */
+       if (kvm_vcpu_is_blocking(vcpu))
+               return;
+
        entry = READ_ONCE(*(svm->avic_physical_id_cache));
        WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
 
        entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
        entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
-
-       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       if (svm->avic_is_running)
-               entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+       entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
 
        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
-       avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
-                                       svm->avic_is_running);
+       avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
 }
 
 void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -984,42 +1013,56 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
        u64 entry;
        struct vcpu_svm *svm = to_svm(vcpu);
 
+       lockdep_assert_preemption_disabled();
+
        entry = READ_ONCE(*(svm->avic_physical_id_cache));
-       if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
-               avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
+       /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
+       if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+               return;
+
+       avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
 
        entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
 }
 
-/*
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
-       int cpu = get_cpu();
-
-       WARN_ON(cpu != vcpu->cpu);
-       svm->avic_is_running = is_run;
+       if (!kvm_vcpu_apicv_active(vcpu))
+               return;
 
-       if (kvm_vcpu_apicv_active(vcpu)) {
-               if (is_run)
-                       avic_vcpu_load(vcpu, cpu);
-               else
-                       avic_vcpu_put(vcpu);
-       }
-       put_cpu();
+       preempt_disable();
+
+       /*
+        * Unload the AVIC when the vCPU is about to block, _before_
+        * the vCPU actually blocks.
+        *
+        * Any IRQs that arrive before IsRunning=0 will not cause an
+        * incomplete IPI vmexit on the source, therefore vIRR will also
+        * be checked by kvm_vcpu_check_block() before blocking.  The
+        * memory barrier implicit in set_current_state orders writing
+        * IsRunning=0 before reading the vIRR.  The processor needs a
+        * matching memory barrier on interrupt delivery between writing
+        * IRR and reading IsRunning; the lack of this barrier might be
+        * the cause of errata #1235.
+        */
+       avic_vcpu_put(vcpu);
+
+       preempt_enable();
 }
 
-void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
 {
-       avic_set_running(vcpu, false);
-}
+       int cpu;
 
-void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
-{
-       if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
-               kvm_vcpu_update_apicv(vcpu);
-       avic_set_running(vcpu, true);
+       if (!kvm_vcpu_apicv_active(vcpu))
+               return;
+
+       cpu = get_cpu();
+       WARN_ON(cpu != vcpu->cpu);
+
+       avic_vcpu_load(vcpu, cpu);
+
+       put_cpu();
 }
arch/x86/kvm/svm/pmu.c
index 12d8b30..5aa45f1 100644
@@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
 {
        struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
 
-       if (!pmu)
+       if (!enable_pmu)
                return NULL;
 
        switch (msr) {
arch/x86/kvm/svm/svm.c
index 46bcc70..2c99b18 100644
@@ -192,10 +192,6 @@ module_param(vgif, int, 0444);
 static int lbrv = true;
 module_param(lbrv, int, 0444);
 
-/* enable/disable PMU virtualization */
-bool pmu = true;
-module_param(pmu, bool, 0444);
-
 static int tsc_scaling = true;
 module_param(tsc_scaling, int, 0444);
 
@@ -873,47 +869,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
        }
 }
 
-/*
- * The default MMIO mask is a single bit (excluding the present bit),
- * which could conflict with the memory encryption bit. Check for
- * memory encryption support and override the default MMIO mask if
- * memory encryption is enabled.
- */
-static __init void svm_adjust_mmio_mask(void)
-{
-       unsigned int enc_bit, mask_bit;
-       u64 msr, mask;
-
-       /* If there is no memory encryption support, use existing mask */
-       if (cpuid_eax(0x80000000) < 0x8000001f)
-               return;
-
-       /* If memory encryption is not enabled, use existing mask */
-       rdmsrl(MSR_AMD64_SYSCFG, msr);
-       if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
-               return;
-
-       enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
-       mask_bit = boot_cpu_data.x86_phys_bits;
-
-       /* Increment the mask bit if it is the same as the encryption bit */
-       if (enc_bit == mask_bit)
-               mask_bit++;
-
-       /*
-        * If the mask bit location is below 52, then some bits above the
-        * physical addressing limit will always be reserved, so use the
-        * rsvd_bits() function to generate the mask. This mask, along with
-        * the present bit, will be used to generate a page fault with
-        * PFER.RSV = 1.
-        *
-        * If the mask bit location is 52 (or above), then clear the mask.
-        */
-       mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
-
-       kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
-}
-
 static void svm_hardware_teardown(void)
 {
        int cpu;
@@ -928,198 +883,6 @@ static void svm_hardware_teardown(void)
        iopm_base = 0;
 }
 
-static __init void svm_set_cpu_caps(void)
-{
-       kvm_set_cpu_caps();
-
-       supported_xss = 0;
-
-       /* CPUID 0x80000001 and 0x8000000A (SVM features) */
-       if (nested) {
-               kvm_cpu_cap_set(X86_FEATURE_SVM);
-
-               if (nrips)
-                       kvm_cpu_cap_set(X86_FEATURE_NRIPS);
-
-               if (npt_enabled)
-                       kvm_cpu_cap_set(X86_FEATURE_NPT);
-
-               if (tsc_scaling)
-                       kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
-
-               /* Nested VM can receive #VMEXIT instead of triggering #GP */
-               kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
-       }
-
-       /* CPUID 0x80000008 */
-       if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
-           boot_cpu_has(X86_FEATURE_AMD_SSBD))
-               kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
-
-       /* AMD PMU PERFCTR_CORE CPUID */
-       if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
-               kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
-
-       /* CPUID 0x8000001F (SME/SEV features) */
-       sev_set_cpu_caps();
-}
-
-static __init int svm_hardware_setup(void)
-{
-       int cpu;
-       struct page *iopm_pages;
-       void *iopm_va;
-       int r;
-       unsigned int order = get_order(IOPM_SIZE);
-
-       /*
-        * NX is required for shadow paging and for NPT if the NX huge pages
-        * mitigation is enabled.
-        */
-       if (!boot_cpu_has(X86_FEATURE_NX)) {
-               pr_err_ratelimited("NX (Execute Disable) not supported\n");
-               return -EOPNOTSUPP;
-       }
-       kvm_enable_efer_bits(EFER_NX);
-
-       iopm_pages = alloc_pages(GFP_KERNEL, order);
-
-       if (!iopm_pages)
-               return -ENOMEM;
-
-       iopm_va = page_address(iopm_pages);
-       memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
-       iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
-
-       init_msrpm_offsets();
-
-       supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
-
-       if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
-               kvm_enable_efer_bits(EFER_FFXSR);
-
-       if (tsc_scaling) {
-               if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
-                       tsc_scaling = false;
-               } else {
-                       pr_info("TSC scaling supported\n");
-                       kvm_has_tsc_control = true;
-                       kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
-                       kvm_tsc_scaling_ratio_frac_bits = 32;
-               }
-       }
-
-       tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
-
-       /* Check for pause filtering support */
-       if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
-               pause_filter_count = 0;
-               pause_filter_thresh = 0;
-       } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
-               pause_filter_thresh = 0;
-       }
-
-       if (nested) {
-               printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
-               kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
-       }
-
-       /*
-        * KVM's MMU doesn't support using 2-level paging for itself, and thus
-        * NPT isn't supported if the host is using 2-level paging since host
-        * CR4 is unchanged on VMRUN.
-        */
-       if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
-               npt_enabled = false;
-
-       if (!boot_cpu_has(X86_FEATURE_NPT))
-               npt_enabled = false;
-
-       /* Force VM NPT level equal to the host's paging level */
-       kvm_configure_mmu(npt_enabled, get_npt_level(),
-                         get_npt_level(), PG_LEVEL_1G);
-       pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
-
-       /* Note, SEV setup consumes npt_enabled. */
-       sev_hardware_setup();
-
-       svm_hv_hardware_setup();
-
-       svm_adjust_mmio_mask();
-
-       for_each_possible_cpu(cpu) {
-               r = svm_cpu_init(cpu);
-               if (r)
-                       goto err;
-       }
-
-       if (nrips) {
-               if (!boot_cpu_has(X86_FEATURE_NRIPS))
-                       nrips = false;
-       }
-
-       enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
-
-       if (enable_apicv) {
-               pr_info("AVIC enabled\n");
-
-               amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
-       }
-
-       if (vls) {
-               if (!npt_enabled ||
-                   !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
-                   !IS_ENABLED(CONFIG_X86_64)) {
-                       vls = false;
-               } else {
-                       pr_info("Virtual VMLOAD VMSAVE supported\n");
-               }
-       }
-
-       if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
-               svm_gp_erratum_intercept = false;
-
-       if (vgif) {
-               if (!boot_cpu_has(X86_FEATURE_VGIF))
-                       vgif = false;
-               else
-                       pr_info("Virtual GIF supported\n");
-       }
-
-       if (lbrv) {
-               if (!boot_cpu_has(X86_FEATURE_LBRV))
-                       lbrv = false;
-               else
-                       pr_info("LBR virtualization supported\n");
-       }
-
-       if (!pmu)
-               pr_info("PMU virtualization is disabled\n");
-
-       svm_set_cpu_caps();
-
-       /*
-        * It seems that on AMD processors PTE's accessed bit is
-        * being set by the CPU hardware before the NPF vmexit.
-        * This is not expected behaviour and our tests fail because
-        * of it.
-        * A workaround here is to disable support for
-        * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
-        * In this case userspace can know if there is support using
-        * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
-        * it
-        * If future AMD CPU models change the behaviour described above,
-        * this variable can be changed accordingly
-        */
-       allow_smaller_maxphyaddr = !npt_enabled;
-
-       return 0;
-
-err:
-       svm_hardware_teardown();
-       return r;
-}
-
 static void init_seg(struct vmcb_seg *seg)
 {
        seg->selector = 0;
@@ -1444,12 +1207,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
        if (err)
                goto error_free_vmsa_page;
 
-       /* We initialize this flag to true to make sure that the is_running
-        * bit would be set the first time the vcpu is loaded.
-        */
-       if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
-               svm->avic_is_running = true;
-
        svm->msrpm = svm_vcpu_alloc_msrpm();
        if (!svm->msrpm) {
                err = -ENOMEM;
@@ -3833,6 +3590,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
        svm_complete_interrupts(vcpu);
 }
 
+static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+       return 1;
+}
+
 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 {
        if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
@@ -4629,8 +4391,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .prepare_guest_switch = svm_prepare_guest_switch,
        .vcpu_load = svm_vcpu_load,
        .vcpu_put = svm_vcpu_put,
-       .vcpu_blocking = svm_vcpu_blocking,
-       .vcpu_unblocking = svm_vcpu_unblocking,
+       .vcpu_blocking = avic_vcpu_blocking,
+       .vcpu_unblocking = avic_vcpu_unblocking,
 
        .update_exception_bitmap = svm_update_exception_bitmap,
        .get_msr_feature = svm_get_msr_feature,
@@ -4662,6 +4424,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .tlb_flush_gva = svm_flush_tlb_gva,
        .tlb_flush_guest = svm_flush_tlb,
 
+       .vcpu_pre_run = svm_vcpu_pre_run,
        .run = svm_vcpu_run,
        .handle_exit = handle_exit,
        .skip_emulated_instruction = skip_emulated_instruction,
@@ -4742,6 +4505,243 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
 };
 
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+       unsigned int enc_bit, mask_bit;
+       u64 msr, mask;
+
+       /* If there is no memory encryption support, use existing mask */
+       if (cpuid_eax(0x80000000) < 0x8000001f)
+               return;
+
+       /* If memory encryption is not enabled, use existing mask */
+       rdmsrl(MSR_AMD64_SYSCFG, msr);
+       if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+               return;
+
+       enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+       mask_bit = boot_cpu_data.x86_phys_bits;
+
+       /* Increment the mask bit if it is the same as the encryption bit */
+       if (enc_bit == mask_bit)
+               mask_bit++;
+
+       /*
+        * If the mask bit location is below 52, then some bits above the
+        * physical addressing limit will always be reserved, so use the
+        * rsvd_bits() function to generate the mask. This mask, along with
+        * the present bit, will be used to generate a page fault with
+        * PFER.RSV = 1.
+        *
+        * If the mask bit location is 52 (or above), then clear the mask.
+        */
+       mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+       kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
+}
+
+static __init void svm_set_cpu_caps(void)
+{
+       kvm_set_cpu_caps();
+
+       supported_xss = 0;
+
+       /* CPUID 0x80000001 and 0x8000000A (SVM features) */
+       if (nested) {
+               kvm_cpu_cap_set(X86_FEATURE_SVM);
+
+               if (nrips)
+                       kvm_cpu_cap_set(X86_FEATURE_NRIPS);
+
+               if (npt_enabled)
+                       kvm_cpu_cap_set(X86_FEATURE_NPT);
+
+               if (tsc_scaling)
+                       kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
+
+               /* Nested VM can receive #VMEXIT instead of triggering #GP */
+               kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
+       }
+
+       /* CPUID 0x80000008 */
+       if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
+           boot_cpu_has(X86_FEATURE_AMD_SSBD))
+               kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+       /* AMD PMU PERFCTR_CORE CPUID */
+       if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+               kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
+
+       /* CPUID 0x8000001F (SME/SEV features) */
+       sev_set_cpu_caps();
+}
+
+static __init int svm_hardware_setup(void)
+{
+       int cpu;
+       struct page *iopm_pages;
+       void *iopm_va;
+       int r;
+       unsigned int order = get_order(IOPM_SIZE);
+
+       /*
+        * NX is required for shadow paging and for NPT if the NX huge pages
+        * mitigation is enabled.
+        */
+       if (!boot_cpu_has(X86_FEATURE_NX)) {
+               pr_err_ratelimited("NX (Execute Disable) not supported\n");
+               return -EOPNOTSUPP;
+       }
+       kvm_enable_efer_bits(EFER_NX);
+
+       iopm_pages = alloc_pages(GFP_KERNEL, order);
+
+       if (!iopm_pages)
+               return -ENOMEM;
+
+       iopm_va = page_address(iopm_pages);
+       memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
+       iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+
+       init_msrpm_offsets();
+
+       supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+
+       if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+               kvm_enable_efer_bits(EFER_FFXSR);
+
+       if (tsc_scaling) {
+               if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+                       tsc_scaling = false;
+               } else {
+                       pr_info("TSC scaling supported\n");
+                       kvm_has_tsc_control = true;
+                       kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
+                       kvm_tsc_scaling_ratio_frac_bits = 32;
+               }
+       }
+
+       tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
+
+       /* Check for pause filtering support */
+       if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+               pause_filter_count = 0;
+               pause_filter_thresh = 0;
+       } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+               pause_filter_thresh = 0;
+       }
+
+       if (nested) {
+               printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+               kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+       }
+
+       /*
+        * KVM's MMU doesn't support using 2-level paging for itself, and thus
+        * NPT isn't supported if the host is using 2-level paging since host
+        * CR4 is unchanged on VMRUN.
+        */
+       if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
+               npt_enabled = false;
+
+       if (!boot_cpu_has(X86_FEATURE_NPT))
+               npt_enabled = false;
+
+       /* Force VM NPT level equal to the host's paging level */
+       kvm_configure_mmu(npt_enabled, get_npt_level(),
+                         get_npt_level(), PG_LEVEL_1G);
+       pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+
+       /* Note, SEV setup consumes npt_enabled. */
+       sev_hardware_setup();
+
+       svm_hv_hardware_setup();
+
+       svm_adjust_mmio_mask();
+
+       for_each_possible_cpu(cpu) {
+               r = svm_cpu_init(cpu);
+               if (r)
+                       goto err;
+       }
+
+       if (nrips) {
+               if (!boot_cpu_has(X86_FEATURE_NRIPS))
+                       nrips = false;
+       }
+
+       enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
+
+       if (enable_apicv) {
+               pr_info("AVIC enabled\n");
+
+               amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+       } else {
+               svm_x86_ops.vcpu_blocking = NULL;
+               svm_x86_ops.vcpu_unblocking = NULL;
+       }
+
+       if (vls) {
+               if (!npt_enabled ||
+                   !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
+                   !IS_ENABLED(CONFIG_X86_64)) {
+                       vls = false;
+               } else {
+                       pr_info("Virtual VMLOAD VMSAVE supported\n");
+               }
+       }
+
+       if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
+               svm_gp_erratum_intercept = false;
+
+       if (vgif) {
+               if (!boot_cpu_has(X86_FEATURE_VGIF))
+                       vgif = false;
+               else
+                       pr_info("Virtual GIF supported\n");
+       }
+
+       if (lbrv) {
+               if (!boot_cpu_has(X86_FEATURE_LBRV))
+                       lbrv = false;
+               else
+                       pr_info("LBR virtualization supported\n");
+       }
+
+       if (!enable_pmu)
+               pr_info("PMU virtualization is disabled\n");
+
+       svm_set_cpu_caps();
+
+       /*
+        * It seems that on AMD processors PTE's accessed bit is
+        * being set by the CPU hardware before the NPF vmexit.
+        * This is not expected behaviour and our tests fail because
+        * of it.
+        * A workaround here is to disable support for
+        * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
+        * In this case userspace can know if there is support using
+        * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
+        * it
+        * If future AMD CPU models change the behaviour described above,
+        * this variable can be changed accordingly
+        */
+       allow_smaller_maxphyaddr = !npt_enabled;
+
+       return 0;
+
+err:
+       svm_hardware_teardown();
+       return r;
+}
+
+
 static struct kvm_x86_init_ops svm_init_ops __initdata = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
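
A note on the hunk that clears svm_x86_ops.vcpu_blocking/.vcpu_unblocking when
AVIC is disabled: both hooks are declared with KVM_X86_OP_NULL (see the
kvm-x86-ops.h diff above), i.e. they are optional and common code skips them
when NULL. A plain-function-pointer sketch of that pattern, for illustration
only (the kernel itself goes through static_call_cond(), which patches the
call site rather than testing the pointer at runtime):

    #include <stdbool.h>
    #include <stddef.h>

    struct kvm_vcpu;

    struct blocking_ops {
            void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
            void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
    };

    /* Setup: turn the hooks off once if the feature is unavailable. */
    static void setup_blocking_ops(struct blocking_ops *ops, bool avic_enabled)
    {
            if (!avic_enabled) {
                    ops->vcpu_blocking = NULL;
                    ops->vcpu_unblocking = NULL;
            }
    }

    /* Call site: an optional hook is simply skipped when it is NULL. */
    static void vcpu_blocking(const struct blocking_ops *ops, struct kvm_vcpu *vcpu)
    {
            if (ops->vcpu_blocking)
                    ops->vcpu_blocking(vcpu);
    }
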
arch/x86/kvm/svm/svm.h
index 9f153c5..47ef8f4 100644
@@ -32,7 +32,6 @@
 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 extern bool npt_enabled;
 extern bool intercept_smi;
-extern bool pmu;
 
 /*
  * Clean bits in VMCB.
@@ -226,7 +225,6 @@ struct vcpu_svm {
        u32 dfr_reg;
        struct page *avic_backing_page;
        u64 *avic_physical_id_cache;
-       bool avic_is_running;
 
        /*
         * Per-vcpu list of struct amd_svm_iommu_ir:
@@ -574,17 +572,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
 
 #define VMCB_AVIC_APIC_BAR_MASK                0xFFFFFFFFFF000ULL
 
-static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       u64 *entry = svm->avic_physical_id_cache;
-
-       if (!entry)
-               return false;
-
-       return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
-}
-
 int avic_ga_log_notifier(u32 ga_tag);
 void avic_vm_destroy(struct kvm *kvm);
 int avic_vm_init(struct kvm *kvm);
@@ -605,8 +592,8 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
 bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
 int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                       uint32_t guest_irq, bool set);
-void svm_vcpu_blocking(struct kvm_vcpu *vcpu);
-void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
 
 /* sev.c */
 
arch/x86/kvm/vmx/capabilities.h
index c8029b7..959b59d 100644
@@ -5,6 +5,7 @@
 #include <asm/vmx.h>
 
 #include "lapic.h"
+#include "x86.h"
 
 extern bool __read_mostly enable_vpid;
 extern bool __read_mostly flexpriority_enabled;
@@ -389,6 +390,9 @@ static inline u64 vmx_get_perf_capabilities(void)
 {
        u64 perf_cap = 0;
 
+       if (!enable_pmu)
+               return perf_cap;
+
        if (boot_cpu_has(X86_FEATURE_PDCM))
                rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
 
arch/x86/kvm/vmx/pmu_intel.c
index 5e0ac57..466d18f 100644
@@ -21,7 +21,6 @@
 #define MSR_PMC_FULL_WIDTH_BIT      (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
 
 static struct kvm_event_hw_type_mapping intel_arch_events[] = {
-       /* Index must match CPUID 0x0A.EBX bit vector */
        [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
        [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
        [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES  },
@@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
        [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
        [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
        [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+       /* The above index must match CPUID 0x0A.EBX bit vector */
        [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
 };
 
@@ -75,11 +75,17 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
        u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
        int i;
 
-       for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
-               if (intel_arch_events[i].eventsel == event_select &&
-                   intel_arch_events[i].unit_mask == unit_mask &&
-                   (pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i)))
-                       break;
+       for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) {
+               if (intel_arch_events[i].eventsel != event_select ||
+                   intel_arch_events[i].unit_mask != unit_mask)
+                       continue;
+
+               /* disable event that reported as not present by cpuid */
+               if ((i < 7) && !(pmu->available_event_types & (1 << i)))
+                       return PERF_COUNT_HW_MAX + 1;
+
+               break;
+       }
 
        if (i == ARRAY_SIZE(intel_arch_events))
                return PERF_COUNT_HW_MAX;
@@ -481,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
        pmu->reserved_bits = 0xffffffff00200000ull;
 
        entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
-       if (!entry)
+       if (!entry || !enable_pmu)
                return;
        eax.full = entry->eax;
        edx.full = entry->edx;
arch/x86/kvm/vmx/posted_intr.c
index 88c53c5..aa1fe90 100644
@@ -19,7 +19,7 @@
  * wake the target vCPUs.  vCPUs are removed from the list and the notification
  * vector is reset when the vCPU is scheduled in.
  */
-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
 /*
  * Protect the per-CPU list with a per-CPU spinlock to handle task migration.
  * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
@@ -27,7 +27,7 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
  * CPU.  IRQs must be disabled when taking this lock, otherwise deadlock will
  * occur if a wakeup IRQ arrives and attempts to acquire the lock.
  */
-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
 
 static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
 {
@@ -51,7 +51,9 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 {
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct pi_desc old, new;
+       unsigned long flags;
        unsigned int dest;
 
        /*
@@ -62,23 +64,34 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
        if (!enable_apicv || !lapic_in_kernel(vcpu))
                return;
 
-       /* Nothing to do if PI.SN and PI.NDST both have the desired value. */
-       if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+       /*
+        * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
+        * full update can be skipped as neither the vector nor the destination
+        * needs to be changed.
+        */
+       if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
+               /*
+                * Clear SN if it was set due to being preempted.  Again, do
+                * this even if there is no assigned device for simplicity.
+                */
+               if (pi_test_and_clear_sn(pi_desc))
+                       goto after_clear_sn;
                return;
+       }
+
+       local_irq_save(flags);
 
        /*
-        * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
-        * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
-        * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
-        * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
-        * correctly.
+        * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
+        * list of the _previous_ pCPU, which will not be the same as the
+        * current pCPU if the task was migrated.
         */
-       if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
-               pi_clear_sn(pi_desc);
-               goto after_clear_sn;
+       if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
+               raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+               list_del(&vmx->pi_wakeup_list);
+               raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
        }
 
-       /* The full case.  Set the new destination and clear SN. */
        dest = cpu_physical_id(cpu);
        if (!x2apic_mode)
                dest = (dest << 8) & 0xFF00;
@@ -86,10 +99,22 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
        do {
                old.control = new.control = READ_ONCE(pi_desc->control);
 
+               /*
+                * Clear SN (as above) and refresh the destination APIC ID to
+                * handle task migration (@cpu != vcpu->cpu).
+                */
                new.ndst = dest;
                new.sn = 0;
+
+               /*
+                * Restore the notification vector; in the blocking case, the
+                * descriptor was modified on "put" to use the wakeup vector.
+                */
+               new.nv = POSTED_INTR_VECTOR;
        } while (pi_try_set_control(pi_desc, old.control, new.control));
 
+       local_irq_restore(flags);
+
 after_clear_sn:
 
        /*
@@ -111,83 +136,25 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm)
                irq_remapping_cap(IRQ_POSTING_CAP);
 }
 
-void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
-{
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-       if (!vmx_can_use_vtd_pi(vcpu->kvm))
-               return;
-
-       /* Set SN when the vCPU is preempted */
-       if (vcpu->preempted)
-               pi_set_sn(pi_desc);
-}
-
-static void __pi_post_block(struct kvm_vcpu *vcpu)
-{
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-       struct pi_desc old, new;
-       unsigned int dest;
-
-       /*
-        * Remove the vCPU from the wakeup list of the _previous_ pCPU, which
-        * will not be the same as the current pCPU if the task was migrated.
-        */
-       spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-       list_del(&vcpu->blocked_vcpu_list);
-       spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-
-       dest = cpu_physical_id(vcpu->cpu);
-       if (!x2apic_mode)
-               dest = (dest << 8) & 0xFF00;
-
-       WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR,
-            "Wakeup handler not enabled while the vCPU was blocking");
-
-       do {
-               old.control = new.control = READ_ONCE(pi_desc->control);
-
-               new.ndst = dest;
-
-               /* set 'NV' to 'notification vector' */
-               new.nv = POSTED_INTR_VECTOR;
-       } while (pi_try_set_control(pi_desc, old.control, new.control));
-
-       vcpu->pre_pcpu = -1;
-}
-
 /*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- *   we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- *   interrupt is posted for this vCPU, we cannot block it, in
- *   this case, return 1, otherwise, return 0.
- *
+ * Put the vCPU on this pCPU's list of vCPUs that need to be awakened and set
+ * WAKEUP as the notification vector in the PI descriptor.
  */
-int pi_pre_block(struct kvm_vcpu *vcpu)
+static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
 {
-       struct pi_desc old, new;
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct pi_desc old, new;
        unsigned long flags;
 
-       if (!vmx_can_use_vtd_pi(vcpu->kvm) ||
-           vmx_interrupt_blocked(vcpu))
-               return 0;
-
        local_irq_save(flags);
 
-       vcpu->pre_pcpu = vcpu->cpu;
-       spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
-       list_add_tail(&vcpu->blocked_vcpu_list,
-                     &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu));
-       spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
+       raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+       list_add_tail(&vmx->pi_wakeup_list,
+                     &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
+       raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
 
-       WARN(pi_desc->sn == 1,
-            "Posted Interrupt Suppress Notification set before blocking");
+       WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
 
        do {
                old.control = new.control = READ_ONCE(pi_desc->control);
@@ -196,24 +163,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
                new.nv = POSTED_INTR_WAKEUP_VECTOR;
        } while (pi_try_set_control(pi_desc, old.control, new.control));
 
-       /* We should not block the vCPU if an interrupt is posted for it.  */
-       if (pi_test_on(pi_desc))
-               __pi_post_block(vcpu);
+       /*
+        * Send a wakeup IPI to this CPU if an interrupt may have been posted
+        * before the notification vector was updated, in which case the IRQ
+        * will arrive on the non-wakeup vector.  An IPI is needed as calling
+        * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
+        * enabled until it is safe to call try_to_wake_up() on the task being
+        * scheduled out).
+        */
+       if (pi_test_on(&new))
+               apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
 
        local_irq_restore(flags);
-       return (vcpu->pre_pcpu == -1);
 }
 
-void pi_post_block(struct kvm_vcpu *vcpu)
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
 {
-       unsigned long flags;
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 
-       if (vcpu->pre_pcpu == -1)
+       if (!vmx_can_use_vtd_pi(vcpu->kvm))
                return;
 
-       local_irq_save(flags);
-       __pi_post_block(vcpu);
-       local_irq_restore(flags);
+       if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
+               pi_enable_wakeup_handler(vcpu);
+
+       /*
+        * Set SN when the vCPU is preempted.  Note, the vCPU can both be seen
+        * as blocking and preempted, e.g. if it's preempted between setting
+        * its wait state and manually scheduling out.
+        */
+       if (vcpu->preempted)
+               pi_set_sn(pi_desc);
 }
 
 /*
@@ -221,24 +201,23 @@ void pi_post_block(struct kvm_vcpu *vcpu)
  */
 void pi_wakeup_handler(void)
 {
-       struct kvm_vcpu *vcpu;
        int cpu = smp_processor_id();
+       struct vcpu_vmx *vmx;
 
-       spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-       list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
-                       blocked_vcpu_list) {
-               struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+       raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
+       list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu),
+                           pi_wakeup_list) {
 
-               if (pi_test_on(pi_desc))
-                       kvm_vcpu_kick(vcpu);
+               if (pi_test_on(&vmx->pi_desc))
+                       kvm_vcpu_wake_up(&vmx->vcpu);
        }
-       spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+       raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
 }
 
 void __init pi_init_cpu(int cpu)
 {
-       INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
-       spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+       INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu));
+       raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
 }
 
 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
@@ -254,7 +233,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
  * Bail out of the block loop if the VM has an assigned
  * device, but the blocking vCPU didn't reconfigure the
  * PI.NV to the wakeup vector, i.e. the assigned device
- * came along after the initial check in pi_pre_block().
+ * came along after the initial check in vmx_vcpu_pi_put().
  */
 void vmx_pi_start_assignment(struct kvm *kvm)
 {
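
Taken together, the vmx_vcpu_pi_load() and pi_enable_wakeup_handler() hunks above rely on one lock-free idiom: snapshot the descriptor's 64-bit control word, edit a local copy (new destination, SN cleared, notification vector updated), then publish it with a compare-and-swap and retry if another writer raced. A rough user-space sketch of that idiom follows; the field layout and helper name are assumptions for illustration, not the kernel's definitions.

#include <stdint.h>

/* Illustrative only: loosely mirrors the PI descriptor's control word. */
struct pi_control {
	union {
		struct {
			uint16_t on  : 1;	/* outstanding notification */
			uint16_t sn  : 1;	/* suppress notification */
			uint16_t rsvd : 14;
			uint8_t  nv;		/* notification vector */
			uint8_t  rsvd2;
			uint32_t ndst;		/* notification destination */
		};
		uint64_t control;
	};
};

void pi_refresh(struct pi_control *pi, uint32_t dest, uint8_t vector)
{
	struct pi_control old, new;

	do {
		old.control = new.control = __atomic_load_n(&pi->control,
							    __ATOMIC_ACQUIRE);
		new.ndst = dest;	/* refresh destination after migration */
		new.sn = 0;		/* stop suppressing notifications */
		new.nv = vector;	/* e.g. restore the normal vector on load */
	} while (!__atomic_compare_exchange_n(&pi->control, &old.control,
					      new.control, false,
					      __ATOMIC_RELEASE, __ATOMIC_RELAXED));
}
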
index 36ae035..eb14e76 100644 (file)
@@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
                        (unsigned long *)&pi_desc->control);
 }
 
+static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
+{
+       return test_and_clear_bit(POSTED_INTR_SN,
+                       (unsigned long *)&pi_desc->control);
+}
+
 static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
 {
        return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
@@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)
 
 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
 void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
-int pi_pre_block(struct kvm_vcpu *vcpu);
-void pi_post_block(struct kvm_vcpu *vcpu);
 void pi_wakeup_handler(void);
 void __init pi_init_cpu(int cpu);
 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
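
The pi_test_and_clear_sn() helper added here is the kernel's test_and_clear_bit() aimed at the SN bit of the control word. In plain GCC/Clang atomics the same operation looks roughly like this (the bit position is an assumption):

#include <stdbool.h>
#include <stdint.h>

#define PI_SN_BIT	1	/* assumed position of SN within the control word */

bool test_and_clear_sn(uint64_t *control)
{
	/* Atomically clear the bit and report whether it was set. */
	uint64_t old = __atomic_fetch_and(control, ~(1ULL << PI_SN_BIT),
					  __ATOMIC_ACQ_REL);

	return old & (1ULL << PI_SN_BIT);
}
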
index 1b2e9d8..4ac6760 100644 (file)
@@ -3931,12 +3931,10 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
        pt_update_intercept_for_msr(vcpu);
 }
 
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
-                                                    bool nested)
+static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+                                                    int pi_vec)
 {
 #ifdef CONFIG_SMP
-       int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
-
        if (vcpu->mode == IN_GUEST_MODE) {
                /*
                 * The vector of interrupt to be delivered to vcpu had
@@ -3964,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
                 */
 
                apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
-               return true;
+               return;
        }
 #endif
-       return false;
+       /*
+        * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
+        * otherwise do nothing as KVM will grab the highest priority pending
+        * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+        */
+       kvm_vcpu_wake_up(vcpu);
 }
 
 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -3997,8 +4000,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
                smp_mb__after_atomic();
 
                /* the PIR and ON have been set by L1. */
-               if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
-                       kvm_vcpu_kick(vcpu);
+               kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
                return 0;
        }
        return -1;
@@ -4035,9 +4037,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
         * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
         * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
         */
-       if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
-               kvm_vcpu_kick(vcpu);
-
+       kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
        return 0;
 }
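
The reworked delivery path boils down to post-then-notify: set the vector's bit in the PIR, set ON, then either send the posted-interrupt IPI if the target vCPU is in guest mode or simply wake it and let ->sync_pir_to_irr() pick the IRQ up on the next entry. A toy, compilable model of that sequence is sketched below; every name, type, and the vector number are invented for illustration.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define POSTED_INTR_VECTOR	0xf2	/* assumed vector number */

enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE };

struct toy_vcpu {
	_Atomic uint64_t pir[4];	/* 256-bit posted-interrupt request bitmap */
	atomic_bool on;			/* outstanding-notification flag */
	_Atomic int mode;
	int cpu;
};

/* Stand-ins for the real notification mechanisms. */
static void send_posted_ipi(int cpu, int vector) { (void)cpu; (void)vector; }
static void wake_vcpu(struct toy_vcpu *v) { (void)v; }

void deliver_posted_interrupt(struct toy_vcpu *v, int vector)
{
	/* Post the vector in the PIR. */
	atomic_fetch_or(&v->pir[vector / 64], 1ULL << (vector % 64));

	/* Set ON; if it was already set, a notification is already in flight. */
	if (atomic_exchange(&v->on, true))
		return;

	/* Notify: IPI the pCPU if the vCPU is running in the guest, otherwise
	 * wake it so the entry path syncs PIR into the IRR. */
	if (atomic_load(&v->mode) == IN_GUEST_MODE)
		send_posted_ipi(v->cpu, POSTED_INTR_VECTOR);
	else
		wake_vcpu(v);
}
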
 
@@ -5426,6 +5426,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       return vmx->emulation_required && !vmx->rmode.vm86_active &&
+              vcpu->arch.exception.pending;
+}
+
 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5445,8 +5453,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                if (!kvm_emulate_instruction(vcpu, 0))
                        return 0;
 
-               if (vmx->emulation_required && !vmx->rmode.vm86_active &&
-                   vcpu->arch.exception.pending) {
+               if (vmx_emulation_required_with_pending_exception(vcpu)) {
                        kvm_prepare_emulation_failure_exit(vcpu);
                        return 0;
                }
@@ -5468,6 +5475,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+       if (vmx_emulation_required_with_pending_exception(vcpu)) {
+               kvm_prepare_emulation_failure_exit(vcpu);
+               return 0;
+       }
+
+       return 1;
+}
+
 static void grow_ple_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6928,6 +6945,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
        vmx = to_vmx(vcpu);
 
+       INIT_LIST_HEAD(&vmx->pi_wakeup_list);
+
        err = -ENOMEM;
 
        vmx->vpid = allocate_vpid();
@@ -7549,25 +7568,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
                secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
 }
 
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
-{
-       if (pi_pre_block(vcpu))
-               return 1;
-
-       if (kvm_lapic_hv_timer_in_use(vcpu))
-               kvm_lapic_switch_to_sw_timer(vcpu);
-
-       return 0;
-}
-
-static void vmx_post_block(struct kvm_vcpu *vcpu)
-{
-       if (kvm_x86_ops.set_hv_timer)
-               kvm_lapic_switch_to_hv_timer(vcpu);
-
-       pi_post_block(vcpu);
-}
-
 static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 {
        if (vcpu->arch.mcg_cap & MCG_LMCE_P)
@@ -7710,6 +7710,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .tlb_flush_gva = vmx_flush_tlb_gva,
        .tlb_flush_guest = vmx_flush_tlb_guest,
 
+       .vcpu_pre_run = vmx_vcpu_pre_run,
        .run = vmx_vcpu_run,
        .handle_exit = vmx_handle_exit,
        .skip_emulated_instruction = vmx_skip_emulated_instruction,
@@ -7768,9 +7769,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .cpu_dirty_log_size = PML_ENTITY_NUM,
        .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
 
-       .pre_block = vmx_pre_block,
-       .post_block = vmx_post_block,
-
        .pmu_ops = &intel_pmu_ops,
        .nested_ops = &vmx_nested_ops,
 
index f8fc744..7f2c82e 100644 (file)
@@ -317,6 +317,9 @@ struct vcpu_vmx {
        /* Posted interrupt descriptor */
        struct pi_desc pi_desc;
 
+       /* Used if this vCPU is waiting for PI notification wakeup. */
+       struct list_head pi_wakeup_list;
+
        /* Support for a guest hypervisor (nested VMX) */
        struct nested_vmx nested;
 
index 76b4803..9e43d75 100644 (file)
@@ -187,6 +187,11 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
 int __read_mostly pi_inject_timer = -1;
 module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
 
+/* Enable/disable PMU virtualization */
+bool __read_mostly enable_pmu = true;
+EXPORT_SYMBOL_GPL(enable_pmu);
+module_param(enable_pmu, bool, 0444);
+
 /*
  * Restoring the host value for MSRs that are only consumed when running in
  * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -5230,17 +5235,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                struct kvm_cpuid __user *cpuid_arg = argp;
                struct kvm_cpuid cpuid;
 
-               /*
-                * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
-                * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
-                * tracked in kvm_mmu_page_role.  As a result, KVM may miss guest page
-                * faults due to reusing SPs/SPTEs.  In practice no sane VMM mucks with
-                * the core vCPU model on the fly, so fail.
-                */
-               r = -EINVAL;
-               if (vcpu->arch.last_vmentry_cpu != -1)
-                       goto out;
-
                r = -EFAULT;
                if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
                        goto out;
@@ -5251,14 +5245,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                struct kvm_cpuid2 __user *cpuid_arg = argp;
                struct kvm_cpuid2 cpuid;
 
-               /*
-                * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in
-                * KVM_SET_CPUID case above.
-                */
-               r = -EINVAL;
-               if (vcpu->arch.last_vmentry_cpu != -1)
-                       goto out;
-
                r = -EFAULT;
                if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
                        goto out;
@@ -9945,10 +9931,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        smp_mb__after_srcu_read_unlock();
 
        /*
-        * This handles the case where a posted interrupt was
-        * notified with kvm_vcpu_kick.  Assigned devices can
-        * use the POSTED_INTR_VECTOR even if APICv is disabled,
-        * so do it even if APICv is disabled on this vCPU.
+        * Process pending posted interrupts to handle the case where the
+        * notification IRQ arrived in the host, or was never sent (because the
+        * target vCPU wasn't running).  Do this regardless of the vCPU's APICv
+        * status, as KVM doesn't update assigned devices when APICv is inhibited,
+        * i.e. they can post interrupts even if APICv is temporarily disabled.
         */
        if (kvm_lapic_enabled(vcpu))
                static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
@@ -10113,8 +10100,20 @@ out:
 
 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 {
-       if (!kvm_arch_vcpu_runnable(vcpu) &&
-           (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
+       bool hv_timer;
+
+       if (!kvm_arch_vcpu_runnable(vcpu)) {
+               /*
+                * Switch to the software timer before halt-polling/blocking as
+                * the guest's timer may be a break event for the vCPU, and the
+                * hypervisor timer runs only when the CPU is in guest mode.
+                * Switch before halt-polling so that KVM recognizes an expired
+                * timer before blocking.
+                */
+               hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
+               if (hv_timer)
+                       kvm_lapic_switch_to_sw_timer(vcpu);
+
                srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
                if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
                        kvm_vcpu_halt(vcpu);
@@ -10122,8 +10121,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
                        kvm_vcpu_block(vcpu);
                vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 
-               if (kvm_x86_ops.post_block)
-                       static_call(kvm_x86_post_block)(vcpu);
+               if (hv_timer)
+                       kvm_lapic_switch_to_hv_timer(vcpu);
 
                if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
                        return 1;
@@ -10316,6 +10315,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                        r = -EINTR;
                        goto out;
                }
+               /*
+                * It should be impossible for the hypervisor timer to be in
+                * use before KVM has ever run the vCPU.
+                */
+               WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
                kvm_vcpu_block(vcpu);
                if (kvm_apic_accept_events(vcpu) < 0) {
                        r = 0;
@@ -10360,10 +10364,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
        } else
                WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
-       if (kvm_run->immediate_exit)
+       if (kvm_run->immediate_exit) {
                r = -EINTR;
-       else
-               r = vcpu_run(vcpu);
+               goto out;
+       }
+
+       r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
+       if (r <= 0)
+               goto out;
+
+       r = vcpu_run(vcpu);
 
 out:
        kvm_put_guest_fpu(vcpu);
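
The new vcpu_pre_run hook follows a simple contract, visible in the hunk above: a positive return value lets KVM continue into vcpu_run(), while zero or a negative value returns to userspace immediately. A stripped-down sketch of that dispatch, with an invented ops table, might look like:

struct vcpu;	/* opaque for the sketch */

struct x86_ops {
	/* > 0: continue into the run loop; <= 0: bail back to userspace. */
	int (*vcpu_pre_run)(struct vcpu *vcpu);
	int (*run)(struct vcpu *vcpu);
};

int arch_vcpu_ioctl_run(const struct x86_ops *ops, struct vcpu *vcpu)
{
	int r;

	/* e.g. VMX rejects invalid guest state with a pending exception. */
	r = ops->vcpu_pre_run(vcpu);
	if (r <= 0)
		return r;

	return ops->run(vcpu);
}
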
index bec8ed0..635b75f 100644 (file)
@@ -336,6 +336,7 @@ extern u64 host_xcr0;
 extern u64 supported_xcr0;
 extern u64 host_xss;
 extern u64 supported_xss;
+extern bool enable_pmu;
 
 static inline bool kvm_mpx_supported(void)
 {
index d89d564..06912d6 100644 (file)
@@ -309,9 +309,6 @@ struct kvm_vcpu {
        u64 requests;
        unsigned long guest_debug;
 
-       int pre_pcpu;
-       struct list_head blocked_vcpu_list;
-
        struct mutex mutex;
        struct kvm_run *run;
 
index f066637..9563d29 100644 (file)
@@ -1131,7 +1131,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
 #define KVM_CAP_ARM_MTE 205
 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
-#define KVM_CAP_XSAVE2 207
+#define KVM_CAP_VM_GPA_BITS 207
+#define KVM_CAP_XSAVE2 208
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1163,11 +1164,20 @@ struct kvm_irq_routing_hv_sint {
        __u32 sint;
 };
 
+struct kvm_irq_routing_xen_evtchn {
+       __u32 port;
+       __u32 vcpu;
+       __u32 priority;
+};
+
+#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1))
+
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
 #define KVM_IRQ_ROUTING_S390_ADAPTER 3
 #define KVM_IRQ_ROUTING_HV_SINT 4
+#define KVM_IRQ_ROUTING_XEN_EVTCHN 5
 
 struct kvm_irq_routing_entry {
        __u32 gsi;
@@ -1179,6 +1189,7 @@ struct kvm_irq_routing_entry {
                struct kvm_irq_routing_msi msi;
                struct kvm_irq_routing_s390_adapter adapter;
                struct kvm_irq_routing_hv_sint hv_sint;
+               struct kvm_irq_routing_xen_evtchn xen_evtchn;
                __u32 pad[8];
        } u;
 };
@@ -1209,6 +1220,7 @@ struct kvm_x86_mce {
 #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL     (1 << 1)
 #define KVM_XEN_HVM_CONFIG_SHARED_INFO         (1 << 2)
 #define KVM_XEN_HVM_CONFIG_RUNSTATE            (1 << 3)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL       (1 << 4)
 
 struct kvm_xen_hvm_config {
        __u32 flags;
@@ -1552,8 +1564,6 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_XSAVE */
 #define KVM_GET_XSAVE            _IOR(KVMIO,  0xa4, struct kvm_xsave)
 #define KVM_SET_XSAVE            _IOW(KVMIO,  0xa5, struct kvm_xsave)
-/* Available with KVM_CAP_XSAVE2 */
-#define KVM_GET_XSAVE2           _IOR(KVMIO,  0xcf, struct kvm_xsave)
 /* Available with KVM_CAP_XCRS */
 #define KVM_GET_XCRS             _IOR(KVMIO,  0xa6, struct kvm_xcrs)
 #define KVM_SET_XCRS             _IOW(KVMIO,  0xa7, struct kvm_xcrs)
@@ -1613,6 +1623,9 @@ struct kvm_enc_region {
 #define KVM_S390_NORMAL_RESET  _IO(KVMIO,   0xc3)
 #define KVM_S390_CLEAR_RESET   _IO(KVMIO,   0xc4)
 
+/* Available with KVM_CAP_XSAVE2 */
+#define KVM_GET_XSAVE2           _IOR(KVMIO,  0xcf, struct kvm_xsave)
+
 struct kvm_s390_pv_sec_parm {
        __u64 origin;
        __u64 length;
index 8c12996..dce7de7 100644 (file)
@@ -8,11 +8,12 @@
 /s390x/memop
 /s390x/resets
 /s390x/sync_regs_test
+/x86_64/amx_test
+/x86_64/cpuid_test
 /x86_64/cr4_cpuid_sync_test
 /x86_64/debug_regs
 /x86_64/evmcs_test
 /x86_64/emulator_error_test
-/x86_64/get_cpuid_test
 /x86_64/get_msr_index_features
 /x86_64/kvm_clock_test
 /x86_64/kvm_pv_test
@@ -22,6 +23,7 @@
 /x86_64/mmio_warning_test
 /x86_64/mmu_role_test
 /x86_64/platform_info_test
+/x86_64/pmu_event_filter_test
 /x86_64/set_boot_cpu_id
 /x86_64/set_sregs_test
 /x86_64/sev_migrate_tests
@@ -36,6 +38,7 @@
 /x86_64/vmx_apic_access_test
 /x86_64/vmx_close_while_nested_test
 /x86_64/vmx_dirty_log_test
+/x86_64/vmx_exception_with_invalid_guest_state
 /x86_64/vmx_invalid_nested_guest_state
 /x86_64/vmx_preemption_timer_test
 /x86_64/vmx_set_nested_state_test
index ee8cf21..81ebf99 100644 (file)
@@ -43,11 +43,11 @@ LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handler
 LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c
 LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c
 
-TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test
+TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
+TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
 TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
 TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
 TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
-TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
@@ -56,6 +56,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
 TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
 TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test
 TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
+TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
 TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
 TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
 TEST_GEN_PROGS_x86_64 += x86_64/smm_test
@@ -69,6 +70,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
index e94ba0f..423d8a6 100644 (file)
@@ -364,6 +364,24 @@ static inline unsigned long get_xmm(int n)
 }
 
 bool is_intel_cpu(void);
+bool is_amd_cpu(void);
+
+static inline unsigned int x86_family(unsigned int eax)
+{
+       unsigned int x86;
+
+       x86 = (eax >> 8) & 0xf;
+
+       if (x86 == 0xf)
+               x86 += (eax >> 20) & 0xff;
+
+       return x86;
+}
+
+static inline unsigned int x86_model(unsigned int eax)
+{
+       return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f);
+}
 
 struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
@@ -375,6 +393,8 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index);
 struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
 
 struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
+int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
+                    struct kvm_cpuid2 *cpuid);
 void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
                    struct kvm_cpuid2 *cpuid);
 
@@ -419,6 +439,11 @@ void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
                             uint64_t pte);
 
 /*
+ * get_cpuid() - find matching CPUID entry and return pointer to it.
+ */
+struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
+                                  uint32_t index);
+/*
  * set_cpuid() - overwrites a matching cpuid entry with the provided value.
  *              matches based on ent->function && ent->index. returns true
  *              if a match was found and successfully overwritten.
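
The x86_family()/x86_model() helpers added to this header decode CPUID.01H:EAX the usual way: the extended family is added only when the base family is 0xf, and the model combines the extended and base model nibbles. A standalone check with an assumed sample EAX value (which happens to fall in the Zen 2 range used by the new PMU filter test); the helper bodies are repeated so the snippet stands alone:

#include <assert.h>

static unsigned int family(unsigned int eax)
{
	unsigned int x86 = (eax >> 8) & 0xf;

	if (x86 == 0xf)
		x86 += (eax >> 20) & 0xff;
	return x86;
}

static unsigned int model(unsigned int eax)
{
	return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f);
}

int main(void)
{
	unsigned int eax = 0x00830F10;	/* assumed sample CPUID.01H:EAX */

	assert(family(eax) == 0x17);	/* base family 0xf + extended 0x8 */
	assert(model(eax) == 0x31);	/* extended model 0x3, base model 0x1 */
	return 0;
}
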
index 4a645dc..8c53f96 100644 (file)
@@ -393,10 +393,12 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
        struct kvm_vm *vm;
        int i;
 
+#ifdef __x86_64__
        /*
         * Permission needs to be requested before KVM_SET_CPUID2.
         */
        vm_xsave_req_perm();
+#endif
 
        /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */
        if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
@@ -497,9 +499,11 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
 void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
                            uint64_t first_page, uint32_t num_pages)
 {
-       struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot,
-                                           .first_page = first_page,
-                                           .num_pages = num_pages };
+       struct kvm_clear_dirty_log args = {
+               .dirty_bitmap = log, .slot = slot,
+               .first_page = first_page,
+               .num_pages = num_pages
+       };
        int ret;
 
        ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);
index babb0f2..5f9d7e9 100644 (file)
@@ -886,6 +886,17 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
        return entry;
 }
 
+
+int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
+                    struct kvm_cpuid2 *cpuid)
+{
+       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+       TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+       return ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
+}
+
 /*
  * VM VCPU CPUID Set
  *
@@ -903,12 +914,9 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
 void vcpu_set_cpuid(struct kvm_vm *vm,
                uint32_t vcpuid, struct kvm_cpuid2 *cpuid)
 {
-       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
        int rc;
 
-       TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
-
-       rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
+       rc = __vcpu_set_cpuid(vm, vcpuid, cpuid);
        TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i",
                    rc, errno);
 
@@ -1136,25 +1144,25 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
        list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
        list->nmsrs = nmsrs;
        r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
-                r);
+       TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
+                   r);
 
        state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0]));
        r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
-                r);
+       TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
+                   r);
 
        r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
-                r);
+       TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
+                   r);
 
        r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
-                r);
+       TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
+                   r);
 
        r = vcpu_save_xsave_state(vm, vcpu, state);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
-                r);
+       TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
+                   r);
 
        if (kvm_check_cap(KVM_CAP_XCRS)) {
                r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs);
@@ -1163,17 +1171,17 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
        }
 
        r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
-                r);
+       TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
+                   r);
 
        if (nested_size) {
                state->nested.size = sizeof(state->nested_);
                r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested);
                TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i",
-                       r);
+                           r);
                TEST_ASSERT(state->nested.size <= nested_size,
-                       "Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
-                       state->nested.size, nested_size);
+                           "Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
+                           state->nested.size, nested_size);
        } else
                state->nested.size = 0;
 
@@ -1181,12 +1189,12 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
        for (i = 0; i < nmsrs; i++)
                state->msrs.entries[i].index = list->indices[i];
        r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs);
-        TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
-                r, r == nmsrs ? -1 : list->indices[r]);
+       TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
+                   r, r == nmsrs ? -1 : list->indices[r]);
 
        r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
-                r);
+       TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
+                   r);
 
        free(list);
        return state;
@@ -1199,7 +1207,7 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
 
        r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
-                r);
+                   r);
 
        r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
        TEST_ASSERT(r == state->msrs.nmsrs,
@@ -1214,28 +1222,28 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
 
        r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave);
        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
-                r);
+                   r);
 
        r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
-                r);
+       TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
+                   r);
 
        r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
-                r);
+       TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
+                   r);
 
        r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
-                r);
+       TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
+                   r);
 
        r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
-                r);
+       TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
+                   r);
 
        if (state->nested.size) {
                r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested);
                TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i",
-                       r);
+                           r);
        }
 }
 
@@ -1245,10 +1253,10 @@ void kvm_x86_state_cleanup(struct kvm_x86_state *state)
        free(state);
 }
 
-bool is_intel_cpu(void)
+static bool cpu_vendor_string_is(const char *vendor)
 {
+       const uint32_t *chunk = (const uint32_t *)vendor;
        int eax, ebx, ecx, edx;
-       const uint32_t *chunk;
        const int leaf = 0;
 
        __asm__ __volatile__(
@@ -1257,10 +1265,22 @@ bool is_intel_cpu(void)
                  "=c"(ecx), "=d"(edx)
                : /* input */ "0"(leaf), "2"(0));
 
-       chunk = (const uint32_t *)("GenuineIntel");
        return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
 }
 
+bool is_intel_cpu(void)
+{
+       return cpu_vendor_string_is("GenuineIntel");
+}
+
+/*
+ * Exclude early K5 samples with a vendor string of "AMDisbetter!"
+ */
+bool is_amd_cpu(void)
+{
+       return cpu_vendor_string_is("AuthenticAMD");
+}
+
 uint32_t kvm_get_cpuid_max_basic(void)
 {
        return kvm_get_supported_cpuid_entry(0)->eax;
@@ -1384,6 +1404,23 @@ void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
        }
 }
 
+struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
+                                  uint32_t index)
+{
+       int i;
+
+       for (i = 0; i < cpuid->nent; i++) {
+               struct kvm_cpuid_entry2 *cur = &cpuid->entries[i];
+
+               if (cur->function == function && cur->index == index)
+                       return cur;
+       }
+
+       TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index);
+
+       return NULL;
+}
+
 bool set_cpuid(struct kvm_cpuid2 *cpuid,
               struct kvm_cpuid_entry2 *ent)
 {
@@ -1479,22 +1516,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui
        return cpuid;
 }
 
-#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
-#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
-#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
-
-static inline unsigned x86_family(unsigned int eax)
-{
-        unsigned int x86;
-
-        x86 = (eax >> 8) & 0xf;
-
-        if (x86 == 0xf)
-                x86 += (eax >> 20) & 0xff;
-
-        return x86;
-}
-
 unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
 {
        const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
@@ -1504,11 +1525,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
        max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
 
        /* Avoid reserved HyperTransport region on AMD processors.  */
-       eax = ecx = 0;
-       cpuid(&eax, &ebx, &ecx, &edx);
-       if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx ||
-           ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx ||
-           edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
+       if (!is_amd_cpu())
                return max_gfn;
 
        /* On parts with <40 physical address bits, the area is fully hidden */
@@ -1518,6 +1535,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
        /* Before family 17h, the HyperTransport area is just below 1T.  */
        ht_gfn = (1 << 28) - num_ht_pages;
        eax = 1;
+       ecx = 0;
        cpuid(&eax, &ebx, &ecx, &edx);
        if (x86_family(eax) < 0x17)
                goto done;
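
cpu_vendor_string_is() works because CPUID leaf 0 returns the 12-byte vendor string packed into EBX, EDX and ECX, in that order, on a little-endian x86 host; the three magic constants deleted from vm_compute_max_gfn() are exactly those chunks of "AuthenticAMD". A quick standalone check of the packing:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint32_t chunk[3];

	/* View "AuthenticAMD" as three little-endian 32-bit words. */
	memcpy(chunk, "AuthenticAMD", sizeof(chunk));
	assert(chunk[0] == 0x68747541);	/* "Auth" -> EBX */
	assert(chunk[1] == 0x69746e65);	/* "enti" -> EDX */
	assert(chunk[2] == 0x444d4163);	/* "cAMD" -> ECX */
	return 0;
}
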
@@ -154,6 +154,34 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct
        return guest_cpuids;
 }
 
+static void set_cpuid_after_run(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid)
+{
+       struct kvm_cpuid_entry2 *ent;
+       int rc;
+       u32 eax, ebx, x;
+
+       /* Setting unmodified CPUID is allowed */
+       rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
+       TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc);
+
+       /* Changing CPU features is forbidden */
+       ent = get_cpuid(cpuid, 0x7, 0);
+       ebx = ent->ebx;
+       ent->ebx--;
+       rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
+       TEST_ASSERT(rc, "Changing CPU features should fail");
+       ent->ebx = ebx;
+
+       /* Changing MAXPHYADDR is forbidden */
+       ent = get_cpuid(cpuid, 0x80000008, 0);
+       eax = ent->eax;
+       x = eax & 0xff;
+       ent->eax = (eax & ~0xffu) | (x - 1);
+       rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
+       TEST_ASSERT(rc, "Changing MAXPHYADDR should fail");
+       ent->eax = eax;
+}
+
 int main(void)
 {
        struct kvm_cpuid2 *supp_cpuid, *cpuid2;
@@ -175,5 +203,7 @@ int main(void)
        for (stage = 0; stage < 3; stage++)
                run_vcpu(vm, VCPU_ID, stage);
 
+       set_cpuid_after_run(vm, cpuid2);
+
        kvm_vm_free(vm);
 }
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
new file mode 100644 (file)
index 0000000..c715adc
--- /dev/null
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for x86 KVM_SET_PMU_EVENT_FILTER.
+ *
+ * Copyright (C) 2022, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Verifies the expected behavior of allow lists and deny lists for
+ * virtual PMU events.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+/*
+ * In lieu of copying perf_event.h into tools...
+ */
+#define ARCH_PERFMON_EVENTSEL_OS                       (1ULL << 17)
+#define ARCH_PERFMON_EVENTSEL_ENABLE                   (1ULL << 22)
+
+union cpuid10_eax {
+       struct {
+               unsigned int version_id:8;
+               unsigned int num_counters:8;
+               unsigned int bit_width:8;
+               unsigned int mask_length:8;
+       } split;
+       unsigned int full;
+};
+
+union cpuid10_ebx {
+       struct {
+               unsigned int no_unhalted_core_cycles:1;
+               unsigned int no_instructions_retired:1;
+               unsigned int no_unhalted_reference_cycles:1;
+               unsigned int no_llc_reference:1;
+               unsigned int no_llc_misses:1;
+               unsigned int no_branch_instruction_retired:1;
+               unsigned int no_branch_misses_retired:1;
+       } split;
+       unsigned int full;
+};
+
+/* End of stuff taken from perf_event.h. */
+
+/* Oddly, this isn't in perf_event.h. */
+#define ARCH_PERFMON_BRANCHES_RETIRED          5
+
+#define VCPU_ID 0
+#define NUM_BRANCHES 42
+
+/*
+ * This is how the event selector and unit mask are stored in an AMD
+ * core performance event-select register. Intel's format is similar,
+ * but the event selector is only 8 bits.
+ */
+#define EVENT(select, umask) ((select & 0xf00UL) << 24 | (select & 0xff) | \
+                             (umask & 0xff) << 8)
+
+/*
+ * "Branch instructions retired", from the Intel SDM, volume 3,
+ * "Pre-defined Architectural Performance Events."
+ */
+
+#define INTEL_BR_RETIRED EVENT(0xc4, 0)
+
+/*
+ * "Retired branch instructions", from Processor Programming Reference
+ * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors,
+ * Preliminary Processor Programming Reference (PPR) for AMD Family
+ * 17h Model 31h, Revision B0 Processors, and Preliminary Processor
+ * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision
+ * B1 Processors Volume 1 of 2.
+ */
+
+#define AMD_ZEN_BR_RETIRED EVENT(0xc2, 0)
+
+/*
+ * This event list comprises Intel's eight architectural events plus
+ * AMD's "retired branch instructions" for Zen[123] (and possibly
+ * other AMD CPUs).
+ */
+static const uint64_t event_list[] = {
+       EVENT(0x3c, 0),
+       EVENT(0xc0, 0),
+       EVENT(0x3c, 1),
+       EVENT(0x2e, 0x4f),
+       EVENT(0x2e, 0x41),
+       EVENT(0xc4, 0),
+       EVENT(0xc5, 0),
+       EVENT(0xa4, 1),
+       AMD_ZEN_BR_RETIRED,
+};
+
+/*
+ * If we encounter a #GP during the guest PMU sanity check, then the guest
+ * PMU is not functional. Inform the hypervisor via GUEST_SYNC(0).
+ */
+static void guest_gp_handler(struct ex_regs *regs)
+{
+       GUEST_SYNC(0);
+}
+
+/*
+ * Check that we can write a new value to the given MSR and read it back.
+ * The caller should provide a non-empty set of bits that are safe to flip.
+ *
+ * Return on success. GUEST_SYNC(0) on error.
+ */
+static void check_msr(uint32_t msr, uint64_t bits_to_flip)
+{
+       uint64_t v = rdmsr(msr) ^ bits_to_flip;
+
+       wrmsr(msr, v);
+       if (rdmsr(msr) != v)
+               GUEST_SYNC(0);
+
+       v ^= bits_to_flip;
+       wrmsr(msr, v);
+       if (rdmsr(msr) != v)
+               GUEST_SYNC(0);
+}
+
+static void intel_guest_code(void)
+{
+       check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
+       check_msr(MSR_P6_EVNTSEL0, 0xffff);
+       check_msr(MSR_IA32_PMC0, 0xffff);
+       GUEST_SYNC(1);
+
+       for (;;) {
+               uint64_t br0, br1;
+
+               wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+               wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
+                     ARCH_PERFMON_EVENTSEL_OS | INTEL_BR_RETIRED);
+               wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
+               br0 = rdmsr(MSR_IA32_PMC0);
+               __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+               br1 = rdmsr(MSR_IA32_PMC0);
+               GUEST_SYNC(br1 - br0);
+       }
+}
+
+/*
+ * To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23],
+ * this code uses the always-available, legacy K7 PMU MSRs, which alias to
+ * the first four of the six extended core PMU MSRs.
+ */
+static void amd_guest_code(void)
+{
+       check_msr(MSR_K7_EVNTSEL0, 0xffff);
+       check_msr(MSR_K7_PERFCTR0, 0xffff);
+       GUEST_SYNC(1);
+
+       for (;;) {
+               uint64_t br0, br1;
+
+               wrmsr(MSR_K7_EVNTSEL0, 0);
+               wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
+                     ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BR_RETIRED);
+               br0 = rdmsr(MSR_K7_PERFCTR0);
+               __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+               br1 = rdmsr(MSR_K7_PERFCTR0);
+               GUEST_SYNC(br1 - br0);
+       }
+}
+
+/*
+ * Run the VM to the next GUEST_SYNC(value), and return the value passed
+ * to the sync. Any other exit from the guest is fatal.
+ */
+static uint64_t run_vm_to_sync(struct kvm_vm *vm)
+{
+       struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+       struct ucall uc;
+
+       vcpu_run(vm, VCPU_ID);
+       TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+                   "Exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+                   run->exit_reason,
+                   exit_reason_str(run->exit_reason));
+       get_ucall(vm, VCPU_ID, &uc);
+       TEST_ASSERT(uc.cmd == UCALL_SYNC,
+                   "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
+       return uc.args[1];
+}
+
+/*
+ * In a nested environment or if the vPMU is disabled, the guest PMU
+ * might not work as architected (accessing the PMU MSRs may raise
+ * #GP, or writes could simply be discarded). In those situations,
+ * there is no point in running these tests. The guest code will perform
+ * a sanity check and then GUEST_SYNC(success). In the case of failure,
+ * the behavior of the guest on resumption is undefined.
+ */
+static bool sanity_check_pmu(struct kvm_vm *vm)
+{
+       bool success;
+
+       vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
+       success = run_vm_to_sync(vm);
+       vm_install_exception_handler(vm, GP_VECTOR, NULL);
+
+       return success;
+}
+
+static struct kvm_pmu_event_filter *make_pmu_event_filter(uint32_t nevents)
+{
+       struct kvm_pmu_event_filter *f;
+       int size = sizeof(*f) + nevents * sizeof(f->events[0]);
+
+       f = malloc(size);
+       TEST_ASSERT(f, "Out of memory");
+       memset(f, 0, size);
+       f->nevents = nevents;
+       return f;
+}
+
+static struct kvm_pmu_event_filter *event_filter(uint32_t action)
+{
+       struct kvm_pmu_event_filter *f;
+       int i;
+
+       f = make_pmu_event_filter(ARRAY_SIZE(event_list));
+       f->action = action;
+       for (i = 0; i < ARRAY_SIZE(event_list); i++)
+               f->events[i] = event_list[i];
+
+       return f;
+}
+
+/*
+ * Remove the first occurrence of 'event' (if any) from the filter's
+ * event list.
+ */
+static struct kvm_pmu_event_filter *remove_event(struct kvm_pmu_event_filter *f,
+                                                uint64_t event)
+{
+       bool found = false;
+       int i;
+
+       for (i = 0; i < f->nevents; i++) {
+               if (found)
+                       f->events[i - 1] = f->events[i];
+               else
+                       found = f->events[i] == event;
+       }
+       if (found)
+               f->nevents--;
+       return f;
+}
+
+static void test_without_filter(struct kvm_vm *vm)
+{
+       uint64_t count = run_vm_to_sync(vm);
+
+       if (count != NUM_BRANCHES)
+               pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
+                       __func__, count, NUM_BRANCHES);
+       TEST_ASSERT(count, "Allowed PMU event is not counting");
+}
+
+static uint64_t test_with_filter(struct kvm_vm *vm,
+                                struct kvm_pmu_event_filter *f)
+{
+       vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, (void *)f);
+       return run_vm_to_sync(vm);
+}
+
+static void test_member_deny_list(struct kvm_vm *vm)
+{
+       struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY);
+       uint64_t count = test_with_filter(vm, f);
+
+       free(f);
+       if (count)
+               pr_info("%s: Branch instructions retired = %lu (expected 0)\n",
+                       __func__, count);
+       TEST_ASSERT(!count, "Disallowed PMU Event is counting");
+}
+
+static void test_member_allow_list(struct kvm_vm *vm)
+{
+       struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW);
+       uint64_t count = test_with_filter(vm, f);
+
+       free(f);
+       if (count != NUM_BRANCHES)
+               pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
+                       __func__, count, NUM_BRANCHES);
+       TEST_ASSERT(count, "Allowed PMU event is not counting");
+}
+
+static void test_not_member_deny_list(struct kvm_vm *vm)
+{
+       struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY);
+       uint64_t count;
+
+       remove_event(f, INTEL_BR_RETIRED);
+       remove_event(f, AMD_ZEN_BR_RETIRED);
+       count = test_with_filter(vm, f);
+       free(f);
+       if (count != NUM_BRANCHES)
+               pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
+                       __func__, count, NUM_BRANCHES);
+       TEST_ASSERT(count, "Allowed PMU event is not counting");
+}
+
+static void test_not_member_allow_list(struct kvm_vm *vm)
+{
+       struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW);
+       uint64_t count;
+
+       remove_event(f, INTEL_BR_RETIRED);
+       remove_event(f, AMD_ZEN_BR_RETIRED);
+       count = test_with_filter(vm, f);
+       free(f);
+       if (count)
+               pr_info("%s: Branch instructions retired = %lu (expected 0)\n",
+                       __func__, count);
+       TEST_ASSERT(!count, "Disallowed PMU Event is counting");
+}
+
+/*
+ * Check for a non-zero PMU version, at least one general-purpose
+ * counter per logical processor, an EBX bit vector of length greater
+ * than 5, and EBX[5] clear.
+ */
+static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry)
+{
+       union cpuid10_eax eax = { .full = entry->eax };
+       union cpuid10_ebx ebx = { .full = entry->ebx };
+
+       return eax.split.version_id && eax.split.num_counters > 0 &&
+               eax.split.mask_length > ARCH_PERFMON_BRANCHES_RETIRED &&
+               !ebx.split.no_branch_instruction_retired;
+}
+
+/*
+ * Note that CPUID leaf 0xa is Intel-specific. This leaf should be
+ * clear on AMD hardware.
+ */
+static bool use_intel_pmu(void)
+{
+       struct kvm_cpuid_entry2 *entry;
+
+       entry = kvm_get_supported_cpuid_index(0xa, 0);
+       return is_intel_cpu() && entry && check_intel_pmu_leaf(entry);
+}
+
+static bool is_zen1(uint32_t eax)
+{
+       return x86_family(eax) == 0x17 && x86_model(eax) <= 0x0f;
+}
+
+static bool is_zen2(uint32_t eax)
+{
+       return x86_family(eax) == 0x17 &&
+               x86_model(eax) >= 0x30 && x86_model(eax) <= 0x3f;
+}
+
+static bool is_zen3(uint32_t eax)
+{
+       return x86_family(eax) == 0x19 && x86_model(eax) <= 0x0f;
+}
+
+/*
+ * Determining AMD support for a PMU event requires consulting the AMD
+ * PPR for the CPU or reference material derived therefrom. The AMD
+ * test code herein has been verified to work on Zen1, Zen2, and Zen3.
+ *
+ * Feel free to add more AMD CPUs that are documented to support event
+ * select 0xc2 umask 0 as "retired branch instructions."
+ */
+static bool use_amd_pmu(void)
+{
+       struct kvm_cpuid_entry2 *entry;
+
+       entry = kvm_get_supported_cpuid_index(1, 0);
+       return is_amd_cpu() && entry &&
+               (is_zen1(entry->eax) ||
+                is_zen2(entry->eax) ||
+                is_zen3(entry->eax));
+}
+
+int main(int argc, char *argv[])
+{
+       void (*guest_code)(void) = NULL;
+       struct kvm_vm *vm;
+       int r;
+
+       /* Tell stdout not to buffer its content */
+       setbuf(stdout, NULL);
+
+       r = kvm_check_cap(KVM_CAP_PMU_EVENT_FILTER);
+       if (!r) {
+               print_skip("KVM_CAP_PMU_EVENT_FILTER not supported");
+               exit(KSFT_SKIP);
+       }
+
+       if (use_intel_pmu())
+               guest_code = intel_guest_code;
+       else if (use_amd_pmu())
+               guest_code = amd_guest_code;
+
+       if (!guest_code) {
+               print_skip("Don't know how to test this guest PMU");
+               exit(KSFT_SKIP);
+       }
+
+       vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+       vm_init_descriptor_tables(vm);
+       vcpu_init_descriptor_tables(vm, VCPU_ID);
+
+       if (!sanity_check_pmu(vm)) {
+               print_skip("Guest PMU is not functional");
+               exit(KSFT_SKIP);
+       }
+
+       test_without_filter(vm);
+       test_member_deny_list(vm);
+       test_member_allow_list(vm);
+       test_not_member_deny_list(vm);
+       test_not_member_allow_list(vm);
+
+       kvm_vm_free(vm);
+
+       return 0;
+}
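
For reference, the EVENT() macro in the new test packs the unit mask into bits 15:8 and splits the event selector so that its bits 11:8 land in bits 35:32 of the encoded value. A small standalone check of that layout; the 12-bit selector in the last assertion is a made-up value, not one of the test's events:

#include <assert.h>

/* Same packing as the test's EVENT() macro. */
#define EVENT(select, umask) ((select & 0xf00UL) << 24 | (select & 0xff) | \
			      (umask & 0xff) << 8)

int main(void)
{
	/* 8-bit selector, no unit mask: lands in bits 7:0. */
	assert(EVENT(0xc2, 0) == 0xc2);
	/* Unit mask goes to bits 15:8. */
	assert(EVENT(0x2e, 0x4f) == 0x4f2e);
	/* Selector bits 11:8 land in bits 35:32 of the encoded event. */
	assert(EVENT(0x1c2, 0) == 0x1000000c2UL);
	return 0;
}
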
index 5a6a662..a426078 100644 (file)
@@ -77,8 +77,8 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
        switch (get_ucall(vm, vcpuid, &uc)) {
        case UCALL_SYNC:
                TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
-                            uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
-                            stage + 1, (ulong)uc.args[1]);
+                           uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
+                           stage + 1, (ulong)uc.args[1]);
                return;
        case UCALL_DONE:
                return;
index 2835a17..edac883 100644 (file)
@@ -30,8 +30,8 @@ static struct kvm_vm *vm;
 static void l2_guest_code(void)
 {
        /* Exit to L0 */
-        asm volatile("inb %%dx, %%al"
-                     : : [port] "d" (PORT_L0_EXIT) : "rax");
+       asm volatile("inb %%dx, %%al"
+                    : : [port] "d" (PORT_L0_EXIT) : "rax");
 }
 
 static void l1_guest_code(struct vmx_pages *vmx_pages)
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c
new file mode 100644 (file)
index 0000000..27a850f
--- /dev/null
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#include <signal.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+
+#include "kselftest.h"
+
+#define VCPU_ID        0
+
+static struct kvm_vm *vm;
+
+static void guest_ud_handler(struct ex_regs *regs)
+{
+       /* Loop on the ud2 until guest state is made invalid. */
+}
+
+static void guest_code(void)
+{
+       asm volatile("ud2");
+}
+
+static void __run_vcpu_with_invalid_state(void)
+{
+       struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+
+       vcpu_run(vm, VCPU_ID);
+
+       TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
+                   "Expected KVM_EXIT_INTERNAL_ERROR, got %d (%s)\n",
+                   run->exit_reason, exit_reason_str(run->exit_reason));
+       TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION,
+                   "Expected emulation failure, got %d\n",
+                   run->emulation_failure.suberror);
+}
+
+static void run_vcpu_with_invalid_state(void)
+{
+       /*
+        * Always run twice to verify KVM handles the case where _KVM_ queues
+        * an exception with invalid state and then exits to userspace, i.e.
+        * that KVM doesn't explode if userspace ignores the initial error.
+        */
+       __run_vcpu_with_invalid_state();
+       __run_vcpu_with_invalid_state();
+}
+
+static void set_timer(void)
+{
+       struct itimerval timer;
+
+       timer.it_value.tv_sec  = 0;
+       timer.it_value.tv_usec = 200;
+       timer.it_interval = timer.it_value;
+       ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0);
+}
+
+static void set_or_clear_invalid_guest_state(bool set)
+{
+       static struct kvm_sregs sregs;
+
+       if (!sregs.cr0)
+               vcpu_sregs_get(vm, VCPU_ID, &sregs);
+       sregs.tr.unusable = !!set;
+       vcpu_sregs_set(vm, VCPU_ID, &sregs);
+}
+
+static void set_invalid_guest_state(void)
+{
+       set_or_clear_invalid_guest_state(true);
+}
+
+static void clear_invalid_guest_state(void)
+{
+       set_or_clear_invalid_guest_state(false);
+}
+
+static void sigalrm_handler(int sig)
+{
+       struct kvm_vcpu_events events;
+
+       TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig);
+
+       vcpu_events_get(vm, VCPU_ID, &events);
+
+       /*
+        * If an exception is pending, attempt KVM_RUN with invalid guest state,
+        * otherwise rearm the timer and keep doing so until the timer fires
+        * between KVM queueing an exception and re-entering the guest.
+        */
+       if (events.exception.pending) {
+               set_invalid_guest_state();
+               run_vcpu_with_invalid_state();
+       } else {
+               set_timer();
+       }
+}
+
+int main(int argc, char *argv[])
+{
+       if (!is_intel_cpu() || vm_is_unrestricted_guest(NULL)) {
+               print_skip("Must be run with kvm_intel.unrestricted_guest=0");
+               exit(KSFT_SKIP);
+       }
+
+       vm = vm_create_default(VCPU_ID, 0, (void *)guest_code);
+
+       vm_init_descriptor_tables(vm);
+       vcpu_init_descriptor_tables(vm, VCPU_ID);
+
+       vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
+
+       /*
+        * Stuff invalid guest state by making TR unusable.  The next KVM_RUN
+        * should exit with an emulation error, as KVM cannot deliver the
+        * guest's pending #UD while emulating invalid guest state.
+        */
+       set_invalid_guest_state();
+       run_vcpu_with_invalid_state();
+
+       /*
+        * Verify KVM also handles the case where userspace gains control while
+        * an exception is pending and stuffs invalid state.  Run with valid
+        * guest state and a timer firing every 200us, and attempt to enter the
+        * guest with invalid state when the handler interrupts KVM with an
+        * exception pending.
+        */
+       clear_invalid_guest_state();
+       TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR,
+                   "Failed to register SIGALRM handler, errno = %d (%s)",
+                   errno, strerror(errno));
+
+       set_timer();
+       run_vcpu_with_invalid_state();
+}
index 478e0ae..865e171 100644 (file)
@@ -46,20 +46,20 @@ static struct kvm_vm *vm;
 #define MIN_STEAL_TIME         50000
 
 struct pvclock_vcpu_time_info {
-        u32   version;
-        u32   pad0;
-        u64   tsc_timestamp;
-        u64   system_time;
-        u32   tsc_to_system_mul;
-        s8    tsc_shift;
-        u8    flags;
-        u8    pad[2];
+       u32   version;
+       u32   pad0;
+       u64   tsc_timestamp;
+       u64   system_time;
+       u32   tsc_to_system_mul;
+       s8    tsc_shift;
+       u8    flags;
+       u8    pad[2];
 } __attribute__((__packed__)); /* 32 bytes */
 
 struct pvclock_wall_clock {
-        u32   version;
-        u32   sec;
-        u32   nsec;
+       u32   version;
+       u32   sec;
+       u32   nsec;
 } __attribute__((__packed__));
 
 struct vcpu_runstate_info {
@@ -74,11 +74,11 @@ struct arch_vcpu_info {
 };
 
 struct vcpu_info {
-        uint8_t evtchn_upcall_pending;
-        uint8_t evtchn_upcall_mask;
-        unsigned long evtchn_pending_sel;
-        struct arch_vcpu_info arch;
-        struct pvclock_vcpu_time_info time;
+       uint8_t evtchn_upcall_pending;
+       uint8_t evtchn_upcall_mask;
+       unsigned long evtchn_pending_sel;
+       struct arch_vcpu_info arch;
+       struct pvclock_vcpu_time_info time;
 }; /* 64 bytes (x86) */
 
 struct shared_info {
@@ -493,7 +493,7 @@ int main(int argc, char *argv[])
 
        vm_ts.tv_sec = wc->sec;
        vm_ts.tv_nsec = wc->nsec;
-        TEST_ASSERT(wc->version && !(wc->version & 1),
+       TEST_ASSERT(wc->version && !(wc->version & 1),
                    "Bad wallclock version %x", wc->version);
        TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
        TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");
index 504158f..9a20f22 100644 (file)
@@ -427,9 +427,6 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 #endif
        kvm_async_pf_vcpu_init(vcpu);
 
-       vcpu->pre_pcpu = -1;
-       INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
-
        kvm_vcpu_set_in_spin_loop(vcpu, false);
        kvm_vcpu_set_dy_eligible(vcpu, false);
        vcpu->preempted = false;
@@ -3163,8 +3160,10 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
 {
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
 
+#ifdef CONFIG_HAVE_KVM_DIRTY_RING
        if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
                return;
+#endif
 
        if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
                unsigned long rel_gfn = gfn - memslot->base_gfn;