Merge tag 'kvm-x86-svm-6.4' of https://github.com/kvm-x86/linux into HEAD
author    Paolo Bonzini <pbonzini@redhat.com>    Wed, 26 Apr 2023 19:56:27 +0000 (15:56 -0400)
committer Paolo Bonzini <pbonzini@redhat.com>    Wed, 26 Apr 2023 19:56:27 +0000 (15:56 -0400)
KVM SVM changes for 6.4:

 - Add support for virtual NMIs

 - Fixes for edge cases related to virtual interrupts

arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/kvm-x86-ops.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/svm.h
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/x86.c
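
The heart of the series is a handshake over three new int_ctl bits, defined in the arch/x86/include/asm/svm.h hunk below: KVM sets V_NMI_PENDING to queue an NMI, hardware clears it and sets V_NMI_BLOCKING while the guest's NMI handler runs, and drops the blocking bit once the handler completes, so KVM no longer needs to intercept IRET. The stand-alone C sketch below models that flow; the helper names and the simplified "hardware" step are purely illustrative, not kernel code or exact APM wording.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define V_NMI_PENDING_MASK   (1u << 11)
#define V_NMI_BLOCKING_MASK  (1u << 12)
#define V_NMI_ENABLE_MASK    (1u << 26)

/* Hypervisor side: queue an NMI for hardware delivery. */
static bool set_vnmi_pending(uint32_t *int_ctl)
{
        if (!(*int_ctl & V_NMI_ENABLE_MASK))
                return false;   /* no vNMI: fall back to EVENTINJ + IRET intercept */
        if (*int_ctl & V_NMI_PENDING_MASK)
                return false;   /* hardware holds at most one pending vNMI */
        *int_ctl |= V_NMI_PENDING_MASK;
        return true;
}

/* Simplified "hardware" side: deliver the vNMI once NMIs are unblocked. */
static void hw_take_vnmi(uint32_t *int_ctl)
{
        if ((*int_ctl & V_NMI_PENDING_MASK) && !(*int_ctl & V_NMI_BLOCKING_MASK)) {
                *int_ctl &= ~V_NMI_PENDING_MASK;
                *int_ctl |= V_NMI_BLOCKING_MASK;  /* cleared again when the guest IRETs */
        }
}

int main(void)
{
        uint32_t int_ctl = V_NMI_ENABLE_MASK;

        printf("first NMI queued:  %d\n", set_vnmi_pending(&int_ctl));  /* 1 */
        hw_take_vnmi(&int_ctl);                  /* guest now runs its NMI handler */
        printf("second NMI queued: %d\n", set_vnmi_pending(&int_ctl)); /* 1 */
        printf("third NMI queued:  %d\n", set_vnmi_pending(&int_ctl)); /* 0: collapsed */
        return 0;
}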

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 42abcd3..97327a1 100644
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW         ( 8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI               ( 8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY       ( 8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT                        ( 8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID               ( 8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_FLEXPRIORITY       ( 8*32+ 1) /* Intel FlexPriority */
+#define X86_FEATURE_EPT                        ( 8*32+ 2) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID               ( 8*32+ 3) /* Intel Virtual Processor ID */
 
 #define X86_FEATURE_VMMCALL            ( 8*32+15) /* Prefer VMMCALL to VMCALL */
 #define X86_FEATURE_XENPV              ( 8*32+16) /* "" Xen paravirtual guest */
 #define X86_FEATURE_VGIF               (15*32+16) /* Virtual GIF */
 #define X86_FEATURE_X2AVIC             (15*32+18) /* Virtual x2apic */
 #define X86_FEATURE_V_SPEC_CTRL                (15*32+20) /* Virtual SPEC_CTRL */
+#define X86_FEATURE_VNMI               (15*32+25) /* Virtual NMI */
 #define X86_FEATURE_SVME_ADDR_CHK      (15*32+28) /* "" SVME addr check */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 430ca22..13bc212 100644
@@ -68,6 +68,8 @@ KVM_X86_OP(get_interrupt_shadow)
 KVM_X86_OP(patch_hypercall)
 KVM_X86_OP(inject_irq)
 KVM_X86_OP(inject_nmi)
+KVM_X86_OP_OPTIONAL_RET0(is_vnmi_pending)
+KVM_X86_OP_OPTIONAL_RET0(set_vnmi_pending)
 KVM_X86_OP(inject_exception)
 KVM_X86_OP(cancel_injection)
 KVM_X86_OP(interrupt_allowed)
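
The two new hooks are registered as KVM_X86_OP_OPTIONAL_RET0, i.e. a vendor module may leave them NULL and callers then see a 0/false return, which is why set_vnmi_pending() "Returns %true on success" per the kvm_host.h comment below. A minimal user-space model of that pattern follows; the struct and function names are hypothetical stand-ins for the static-call machinery, not KVM's actual plumbing.

#include <stdbool.h>
#include <stdio.h>

struct x86_ops {
        bool (*is_vnmi_pending)(void);
        bool (*set_vnmi_pending)(void);
};

static bool ret0(void)
{
        return false;   /* stand-in for the static_call_ret0 default */
}

static bool call_set_vnmi_pending(const struct x86_ops *ops)
{
        return (ops->set_vnmi_pending ? ops->set_vnmi_pending : ret0)();
}

static bool fake_svm_set_vnmi_pending(void)
{
        return true;    /* pretend vNMI hardware accepted the NMI */
}

int main(void)
{
        struct x86_ops no_vnmi = { 0 };
        struct x86_ops with_vnmi = { .set_vnmi_pending = fake_svm_set_vnmi_pending };

        printf("no vNMI: %d, with vNMI: %d\n",
               call_set_vnmi_pending(&no_vnmi),
               call_set_vnmi_pending(&with_vnmi));
        return 0;
}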
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d197c54..fb9d1f2 100644
@@ -874,7 +874,8 @@ struct kvm_vcpu_arch {
        u64 tsc_scaling_ratio; /* current scaling ratio */
 
        atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
-       unsigned nmi_pending; /* NMI queued after currently running handler */
+       /* Number of NMIs pending injection, not including hardware vNMIs. */
+       unsigned int nmi_pending;
        bool nmi_injected;    /* Trying to inject an NMI this entry */
        bool smi_pending;    /* SMI queued after currently running handler */
        u8 handling_intr_from_guest;
@@ -1619,6 +1620,13 @@ struct kvm_x86_ops {
        int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
        bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
        void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
+       /* Whether or not a virtual NMI is pending in hardware. */
+       bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu);
+       /*
+        * Attempt to pend a virtual NMI in hardware.  Returns %true on success
+        * to allow using static_call_ret0 as the fallback.
+        */
+       bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu);
        void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
        void (*enable_irq_window)(struct kvm_vcpu *vcpu);
        void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
@@ -1999,6 +2007,7 @@ int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
 void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
+int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
 
 void kvm_update_dr7(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 770dcf7..e7c7379 100644
@@ -183,6 +183,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define V_GIF_SHIFT 9
 #define V_GIF_MASK (1 << V_GIF_SHIFT)
 
+#define V_NMI_PENDING_SHIFT 11
+#define V_NMI_PENDING_MASK (1 << V_NMI_PENDING_SHIFT)
+
+#define V_NMI_BLOCKING_SHIFT 12
+#define V_NMI_BLOCKING_MASK (1 << V_NMI_BLOCKING_SHIFT)
+
 #define V_INTR_PRIO_SHIFT 16
 #define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
 
@@ -197,6 +203,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define V_GIF_ENABLE_SHIFT 25
 #define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT)
 
+#define V_NMI_ENABLE_SHIFT 26
+#define V_NMI_ENABLE_MASK (1 << V_NMI_ENABLE_SHIFT)
+
 #define AVIC_ENABLE_SHIFT 31
 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
 
@@ -278,7 +287,6 @@ static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_
 static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
 
 #define AVIC_HPA_MASK  ~((0xFFFULL << 52) | 0xFFF)
-#define VMCB_AVIC_APIC_BAR_MASK                0xFFFFFFFFFF000ULL
 
 
 struct vmcb_seg {
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 05d3894..96936dd 100644
@@ -139,13 +139,18 @@ void recalc_intercepts(struct vcpu_svm *svm)
 
        if (g->int_ctl & V_INTR_MASKING_MASK) {
                /*
-                * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8
-                * does not affect any interrupt we may want to inject;
-                * therefore, writes to CR8 are irrelevant to L0, as are
-                * interrupt window vmexits.
+                * If L2 is active and V_INTR_MASKING is enabled in vmcb12,
+                * disable intercept of CR8 writes as L2's CR8 does not affect
+                * any interrupt KVM may want to inject.
+                *
+                * Similarly, disable intercept of virtual interrupts (used to
+                * detect interrupt windows) if the saved RFLAGS.IF is '0', as
+                * the effective RFLAGS.IF for L1 interrupts will never be set
+                * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs).
                 */
                vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
-               vmcb_clr_intercept(c, INTERCEPT_VINTR);
+               if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF))
+                       vmcb_clr_intercept(c, INTERCEPT_VINTR);
        }
 
        /*
@@ -276,6 +281,11 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
        if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl)))
                return false;
 
+       if (CC((control->int_ctl & V_NMI_ENABLE_MASK) &&
+              !vmcb12_is_intercept(control, INTERCEPT_NMI))) {
+               return false;
+       }
+
        return true;
 }
 
@@ -416,22 +426,24 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
 
        /* Only a few fields of int_ctl are written by the processor.  */
        mask = V_IRQ_MASK | V_TPR_MASK;
-       if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
-           svm_is_intercept(svm, INTERCEPT_VINTR)) {
-               /*
-                * In order to request an interrupt window, L0 is usurping
-                * svm->vmcb->control.int_ctl and possibly setting V_IRQ
-                * even if it was clear in L1's VMCB.  Restoring it would be
-                * wrong.  However, in this case V_IRQ will remain true until
-                * interrupt_window_interception calls svm_clear_vintr and
-                * restores int_ctl.  We can just leave it aside.
-                */
+       /*
+        * Don't sync vmcb02 V_IRQ back to vmcb12 if KVM (L0) is intercepting
+        * virtual interrupts in order to request an interrupt window, as KVM
+        * has usurped vmcb02's int_ctl.  If an interrupt window opens before
+        * the next VM-Exit, svm_clear_vintr() will restore vmcb12's int_ctl.
+        * If no window opens, V_IRQ will be correctly preserved in vmcb12's
+        * int_ctl (because it was never recognized while L2 was running).
+        */
+       if (svm_is_intercept(svm, INTERCEPT_VINTR) &&
+           !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts))
                mask &= ~V_IRQ_MASK;
-       }
 
        if (nested_vgif_enabled(svm))
                mask |= V_GIF_MASK;
 
+       if (nested_vnmi_enabled(svm))
+               mask |= V_NMI_BLOCKING_MASK | V_NMI_PENDING_MASK;
+
        svm->nested.ctl.int_ctl        &= ~mask;
        svm->nested.ctl.int_ctl        |= svm->vmcb->control.int_ctl & mask;
 }
@@ -651,6 +663,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
        else
                int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
 
+       if (vnmi) {
+               if (vmcb01->control.int_ctl & V_NMI_PENDING_MASK) {
+                       svm->vcpu.arch.nmi_pending++;
+                       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+               }
+               if (nested_vnmi_enabled(svm))
+                       int_ctl_vmcb12_bits |= (V_NMI_PENDING_MASK |
+                                               V_NMI_ENABLE_MASK |
+                                               V_NMI_BLOCKING_MASK);
+       }
+
        /* Copied from vmcb01.  msrpm_base can be overwritten later.  */
        vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
        vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
@@ -1021,6 +1044,28 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 
        svm_switch_vmcb(svm, &svm->vmcb01);
 
+       /*
+        * Rules for synchronizing int_ctl bits from vmcb02 to vmcb01:
+        *
+        * V_IRQ, V_IRQ_VECTOR, V_INTR_PRIO_MASK, V_IGN_TPR:  If L1 doesn't
+        * intercept interrupts, then KVM will use vmcb02's V_IRQ (and related
+        * flags) to detect interrupt windows for L1 IRQs (even if L1 uses
+        * virtual interrupt masking).  Raise KVM_REQ_EVENT to ensure that
+        * KVM re-requests an interrupt window if necessary, which implicitly
+        * copies these bits from vmcb02 to vmcb01.
+        *
+        * V_TPR: If L1 doesn't use virtual interrupt masking, then L1's vTPR
+        * is stored in vmcb02, but its value doesn't need to be copied from/to
+        * vmcb01 because it is copied from/to the virtual APIC's TPR register
+        * on each VM entry/exit.
+        *
+        * V_GIF: If nested vGIF is not used, KVM uses vmcb02's V_GIF for L1's
+        * V_GIF.  However, GIF is architecturally cleared on each VM exit, thus
+        * there is no need to copy V_GIF from vmcb02 to vmcb01.
+        */
+       if (!nested_exit_on_intr(svm))
+               kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
        if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
                svm_copy_lbrs(vmcb12, vmcb02);
                svm_update_lbrv(vcpu);
@@ -1029,6 +1074,20 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
                svm_update_lbrv(vcpu);
        }
 
+       if (vnmi) {
+               if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
+                       vmcb01->control.int_ctl |= V_NMI_BLOCKING_MASK;
+               else
+                       vmcb01->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+
+               if (vcpu->arch.nmi_pending) {
+                       vcpu->arch.nmi_pending--;
+                       vmcb01->control.int_ctl |= V_NMI_PENDING_MASK;
+               } else {
+                       vmcb01->control.int_ctl &= ~V_NMI_PENDING_MASK;
+               }
+       }
+
        /*
         * On vmexit the  GIF is set to false and
         * no event can be injected in L1.
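
To see why the nmi_pending++ at nested VMRUN and the nmi_pending-- at nested VM-exit pair up, the stripped-down model below walks an L1-bound NMI through the round trip. It is a simplification of the two hunks above (the nested_exit_on_nmi()/KVM_REQ_EVENT details are ignored, and the struct is hypothetical), so treat it as a sketch rather than kernel logic.

#include <assert.h>
#include <stdint.h>

#define V_NMI_PENDING_MASK   (1u << 11)
#define V_NMI_BLOCKING_MASK  (1u << 12)

struct model {
        uint32_t vmcb01_int_ctl;
        uint32_t vmcb02_int_ctl;
        unsigned int nmi_pending;       /* models vcpu->arch.nmi_pending */
};

/* Nested VMRUN: an L1 vNMI already pended in vmcb01 is tracked in software. */
static void nested_vmrun(struct model *m)
{
        if (m->vmcb01_int_ctl & V_NMI_PENDING_MASK)
                m->nmi_pending++;       /* vmcb01's bit is recomputed at VM-exit */
}

/* Nested #VMEXIT: hand the state back to vmcb01 for hardware delivery. */
static void nested_vmexit(struct model *m)
{
        if (m->vmcb02_int_ctl & V_NMI_BLOCKING_MASK)
                m->vmcb01_int_ctl |= V_NMI_BLOCKING_MASK;
        else
                m->vmcb01_int_ctl &= ~V_NMI_BLOCKING_MASK;

        if (m->nmi_pending) {
                m->nmi_pending--;
                m->vmcb01_int_ctl |= V_NMI_PENDING_MASK;
        } else {
                m->vmcb01_int_ctl &= ~V_NMI_PENDING_MASK;
        }
}

int main(void)
{
        struct model m = { .vmcb01_int_ctl = V_NMI_PENDING_MASK };

        nested_vmrun(&m);
        assert(m.nmi_pending == 1);

        nested_vmexit(&m);
        assert(m.nmi_pending == 0);
        assert(m.vmcb01_int_ctl & V_NMI_PENDING_MASK);
        return 0;
}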
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a64ede4..eb308c9 100644
@@ -231,6 +231,8 @@ module_param(dump_invalid_vmcb, bool, 0644);
 bool intercept_smi = true;
 module_param(intercept_smi, bool, 0444);
 
+bool vnmi = true;
+module_param(vnmi, bool, 0444);
 
 static bool svm_gp_erratum_intercept = true;
 
@@ -1312,6 +1314,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
        if (kvm_vcpu_apicv_active(vcpu))
                avic_init_vmcb(svm, vmcb);
 
+       if (vnmi)
+               svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
+
        if (vgif) {
                svm_clr_intercept(svm, INTERCEPT_STGI);
                svm_clr_intercept(svm, INTERCEPT_CLGI);
@@ -1585,6 +1590,16 @@ static void svm_set_vintr(struct vcpu_svm *svm)
        svm_set_intercept(svm, INTERCEPT_VINTR);
 
        /*
+        * Recalculating intercepts may have cleared the VINTR intercept.  If
+        * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
+        * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
+        * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
+        * interrupts will never be unblocked while L2 is running.
+        */
+       if (!svm_is_intercept(svm, INTERCEPT_VINTR))
+               return;
+
+       /*
         * This is just a dummy VINTR to actually cause a vmexit to happen.
         * Actual injection of virtual interrupts happens through EVENTINJ.
         */
@@ -2481,16 +2496,29 @@ static int task_switch_interception(struct kvm_vcpu *vcpu)
                               has_error_code, error_code);
 }
 
+static void svm_clr_iret_intercept(struct vcpu_svm *svm)
+{
+       if (!sev_es_guest(svm->vcpu.kvm))
+               svm_clr_intercept(svm, INTERCEPT_IRET);
+}
+
+static void svm_set_iret_intercept(struct vcpu_svm *svm)
+{
+       if (!sev_es_guest(svm->vcpu.kvm))
+               svm_set_intercept(svm, INTERCEPT_IRET);
+}
+
 static int iret_interception(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
        ++vcpu->stat.nmi_window_exits;
        svm->awaiting_iret_completion = true;
-       if (!sev_es_guest(vcpu->kvm)) {
-               svm_clr_intercept(svm, INTERCEPT_IRET);
+
+       svm_clr_iret_intercept(svm);
+       if (!sev_es_guest(vcpu->kvm))
                svm->nmi_iret_rip = kvm_rip_read(vcpu);
-       }
+
        kvm_make_request(KVM_REQ_EVENT, vcpu);
        return 1;
 }
@@ -3467,11 +3495,43 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
                return;
 
        svm->nmi_masked = true;
-       if (!sev_es_guest(vcpu->kvm))
-               svm_set_intercept(svm, INTERCEPT_IRET);
+       svm_set_iret_intercept(svm);
        ++vcpu->stat.nmi_injections;
 }
 
+static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (!is_vnmi_enabled(svm))
+               return false;
+
+       return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
+}
+
+static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (!is_vnmi_enabled(svm))
+               return false;
+
+       if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
+               return false;
+
+       svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
+       vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+
+       /*
+        * Because the pending NMI is serviced by hardware, KVM can't know when
+        * the NMI is "injected", but for all intents and purposes, passing the
+        * NMI off to hardware counts as injection.
+        */
+       ++vcpu->stat.nmi_injections;
+
+       return true;
+}
+
 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -3567,6 +3627,35 @@ static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
                svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (is_vnmi_enabled(svm))
+               return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
+       else
+               return svm->nmi_masked;
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (is_vnmi_enabled(svm)) {
+               if (masked)
+                       svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
+               else
+                       svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+
+       } else {
+               svm->nmi_masked = masked;
+               if (masked)
+                       svm_set_iret_intercept(svm);
+               else
+                       svm_clr_iret_intercept(svm);
+       }
+}
+
 bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -3578,8 +3667,10 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
        if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
                return false;
 
-       return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
-              svm->nmi_masked;
+       if (svm_get_nmi_mask(vcpu))
+               return true;
+
+       return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
 }
 
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
@@ -3597,26 +3688,6 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
        return 1;
 }
 
-static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
-{
-       return to_svm(vcpu)->nmi_masked;
-}
-
-static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (masked) {
-               svm->nmi_masked = true;
-               if (!sev_es_guest(vcpu->kvm))
-                       svm_set_intercept(svm, INTERCEPT_IRET);
-       } else {
-               svm->nmi_masked = false;
-               if (!sev_es_guest(vcpu->kvm))
-                       svm_clr_intercept(svm, INTERCEPT_IRET);
-       }
-}
-
 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -3697,7 +3768,16 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (svm->nmi_masked && !svm->awaiting_iret_completion)
+       /*
+        * KVM should never request an NMI window when vNMI is enabled, as KVM
+        * allows at most one to-be-injected NMI and one pending NMI, i.e. if
+        * two NMIs arrive simultaneously, KVM will inject one and set
+        * V_NMI_PENDING for the other.  WARN, but continue with the standard
+        * single-step approach to try and salvage the pending NMI.
+        */
+       WARN_ON_ONCE(is_vnmi_enabled(svm));
+
+       if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
                return; /* IRET will cause a vm exit */
 
        if (!gif_set(svm)) {
@@ -4135,6 +4215,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 
        svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
 
+       svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI);
+
        svm_recalc_instruction_intercepts(vcpu, svm);
 
        if (boot_cpu_has(X86_FEATURE_IBPB))
@@ -4752,6 +4834,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .patch_hypercall = svm_patch_hypercall,
        .inject_irq = svm_inject_irq,
        .inject_nmi = svm_inject_nmi,
+       .is_vnmi_pending = svm_is_vnmi_pending,
+       .set_vnmi_pending = svm_set_vnmi_pending,
        .inject_exception = svm_inject_exception,
        .cancel_injection = svm_cancel_injection,
        .interrupt_allowed = svm_interrupt_allowed,
@@ -4894,6 +4978,9 @@ static __init void svm_set_cpu_caps(void)
                if (vgif)
                        kvm_cpu_cap_set(X86_FEATURE_VGIF);
 
+               if (vnmi)
+                       kvm_cpu_cap_set(X86_FEATURE_VNMI);
+
                /* Nested VM can receive #VMEXIT instead of triggering #GP */
                kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
        }
@@ -5045,6 +5132,16 @@ static __init int svm_hardware_setup(void)
                        pr_info("Virtual GIF supported\n");
        }
 
+       vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
+       if (vnmi)
+               pr_info("Virtual NMI enabled\n");
+
+       if (!vnmi) {
+               svm_x86_ops.is_vnmi_pending = NULL;
+               svm_x86_ops.set_vnmi_pending = NULL;
+       }
+
        if (lbrv) {
                if (!boot_cpu_has(X86_FEATURE_LBRV))
                        lbrv = false;
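
The net effect of the svm.c changes is that NMI blocking now has two representations: with vNMI it lives in int_ctl's V_NMI_BLOCKING bit and is maintained by hardware, while without it KVM keeps the old nmi_masked flag and single-steps past the handler via the IRET intercept. The small user-space contrast below mirrors the two paths in svm_set_nmi_mask()/svm_get_nmi_mask(); the struct and helper names are illustrative and the SEV-ES exception is ignored.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define V_NMI_BLOCKING_MASK     (1u << 12)

struct vcpu_model {
        bool vnmi_enabled;
        uint32_t int_ctl;               /* vmcb01 int_ctl when vNMI is in use */
        bool nmi_masked;                /* legacy software flag */
        bool iret_intercepted;          /* legacy "wait for guest IRET" */
};

static void set_nmi_mask(struct vcpu_model *v, bool masked)
{
        if (v->vnmi_enabled) {
                if (masked)
                        v->int_ctl |= V_NMI_BLOCKING_MASK;
                else
                        v->int_ctl &= ~V_NMI_BLOCKING_MASK;
        } else {
                v->nmi_masked = masked;
                v->iret_intercepted = masked;   /* SEV-ES special case omitted */
        }
}

static bool get_nmi_mask(const struct vcpu_model *v)
{
        return v->vnmi_enabled ? !!(v->int_ctl & V_NMI_BLOCKING_MASK)
                               : v->nmi_masked;
}

int main(void)
{
        struct vcpu_model legacy = { 0 };
        struct vcpu_model vnmi = { .vnmi_enabled = true };

        set_nmi_mask(&legacy, true);
        set_nmi_mask(&vnmi, true);
        printf("legacy: masked=%d iret_intercept=%d; vNMI: masked=%d\n",
               get_nmi_mask(&legacy), legacy.iret_intercepted, get_nmi_mask(&vnmi));
        return 0;
}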
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 8398099..f44751d 100644
@@ -36,6 +36,7 @@ extern bool npt_enabled;
 extern int vgif;
 extern bool intercept_smi;
 extern bool x2avic_enabled;
+extern bool vnmi;
 
 /*
  * Clean bits in VMCB.
@@ -265,6 +266,7 @@ struct vcpu_svm {
        bool pause_filter_enabled         : 1;
        bool pause_threshold_enabled      : 1;
        bool vgif_enabled                 : 1;
+       bool vnmi_enabled                 : 1;
 
        u32 ldr_reg;
        u32 dfr_reg;
@@ -539,6 +541,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
        return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
 }
 
+static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
+{
+       return svm->vnmi_enabled &&
+              (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK);
+}
+
 static inline bool is_x2apic_msrpm_offset(u32 offset)
 {
        /* 4 msrs per u8, and 4 u8 in u32 */
@@ -548,6 +556,27 @@ static inline bool is_x2apic_msrpm_offset(u32 offset)
               (msr < (APIC_BASE_MSR + 0x100));
 }
 
+static inline struct vmcb *get_vnmi_vmcb_l1(struct vcpu_svm *svm)
+{
+       if (!vnmi)
+               return NULL;
+
+       if (is_guest_mode(&svm->vcpu))
+               return NULL;
+       else
+               return svm->vmcb01.ptr;
+}
+
+static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
+{
+       struct vmcb *vmcb = get_vnmi_vmcb_l1(svm);
+
+       if (vmcb)
+               return !!(vmcb->control.int_ctl & V_NMI_ENABLE_MASK);
+       else
+               return false;
+}
+
 /* svm.c */
 #define MSR_INVALID                            0xffffffffU
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6a41bdb..523c39a 100644
@@ -5185,7 +5185,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
        events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
 
        events->nmi.injected = vcpu->arch.nmi_injected;
-       events->nmi.pending = vcpu->arch.nmi_pending != 0;
+       events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
        events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
 
        /* events->sipi_vector is never valid when reporting to user space */
@@ -5272,8 +5272,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                                                events->interrupt.shadow);
 
        vcpu->arch.nmi_injected = events->nmi.injected;
-       if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
-               vcpu->arch.nmi_pending = events->nmi.pending;
+       if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
+               vcpu->arch.nmi_pending = 0;
+               atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
+               kvm_make_request(KVM_REQ_NMI, vcpu);
+       }
        static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
 
        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
@@ -10210,19 +10213,46 @@ out:
 
 static void process_nmi(struct kvm_vcpu *vcpu)
 {
-       unsigned limit = 2;
+       unsigned int limit;
 
        /*
-        * x86 is limited to one NMI running, and one NMI pending after it.
-        * If an NMI is already in progress, limit further NMIs to just one.
-        * Otherwise, allow two (and we'll inject the first one immediately).
+        * x86 is limited to one NMI pending, but because KVM can't react to
+        * incoming NMIs as quickly as bare metal, e.g. if the vCPU is
+        * scheduled out, KVM needs to play nice with two queued NMIs showing
+        * up at the same time.  To handle this scenario, allow two NMIs to be
+        * (temporarily) pending so long as NMIs are not blocked and KVM is not
+        * waiting for a previous NMI injection to complete (which effectively
+        * blocks NMIs).  KVM will immediately inject one of the two NMIs, and
+        * will request an NMI window to handle the second NMI.
         */
        if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
                limit = 1;
+       else
+               limit = 2;
+
+       /*
+        * Adjust the limit to account for pending virtual NMIs, which aren't
+        * tracked in vcpu->arch.nmi_pending.
+        */
+       if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
+               limit--;
 
        vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
        vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
-       kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+       if (vcpu->arch.nmi_pending &&
+           (static_call(kvm_x86_set_vnmi_pending)(vcpu)))
+               vcpu->arch.nmi_pending--;
+
+       if (vcpu->arch.nmi_pending)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+/* Return total number of NMIs pending injection to the VM */
+int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.nmi_pending +
+              static_call(kvm_x86_is_vnmi_pending)(vcpu);
 }
 
 void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
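
The process_nmi() accounting above is easiest to follow with the vendor hooks reduced to plain booleans. The sketch below (hypothetical struct and field names, not kernel code) reproduces the limit math: at most one NMI in service plus one pending, a vNMI already pended in hardware counting against that limit, and kvm_get_nr_pending_nmis() summing both sources.

#include <stdbool.h>
#include <stdio.h>

struct nmi_state {
        unsigned int nmi_pending;       /* software-tracked, excludes hw vNMI */
        unsigned int nmi_queued;        /* stand-in for atomic nmi_queued */
        bool nmi_injected;
        bool nmi_masked;                /* get_nmi_mask() */
        bool vnmi_pending;              /* is_vnmi_pending() */
        bool vnmi_usable;               /* set_vnmi_pending() can succeed */
};

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

static void process_nmi(struct nmi_state *s)
{
        unsigned int limit = (s->nmi_masked || s->nmi_injected) ? 1 : 2;

        if (s->vnmi_pending)
                limit--;                /* hardware already holds one NMI */

        s->nmi_pending += s->nmi_queued;
        s->nmi_queued = 0;
        s->nmi_pending = min_u(s->nmi_pending, limit);

        if (s->nmi_pending && s->vnmi_usable && !s->vnmi_pending) {
                s->vnmi_pending = true; /* handed off to hardware */
                s->nmi_pending--;
        }
}

static unsigned int nr_pending_nmis(const struct nmi_state *s)
{
        return s->nmi_pending + s->vnmi_pending;
}

int main(void)
{
        struct nmi_state s = { .nmi_queued = 3, .vnmi_usable = true };

        process_nmi(&s);
        printf("pending=%u (sw=%u, vNMI=%d)\n",
               nr_pending_nmis(&s), s.nmi_pending, s.vnmi_pending);
        return 0;
}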