KVM: nVMX: Unconditionally clear mtf_pending on nested VM-Exit
authorSean Christopherson <seanjc@google.com>
Tue, 30 Aug 2022 23:15:58 +0000 (23:15 +0000)
committerPaolo Bonzini <pbonzini@redhat.com>
Mon, 26 Sep 2022 16:03:07 +0000 (12:03 -0400)
Clear mtf_pending on nested VM-Exit instead of handling the clear on a
case-by-case basis in vmx_check_nested_events().  The pending MTF should
never survive nested VM-Exit, as it is a property of KVM's run of the
current L2, i.e. should never affect the next L2 run by L1.  In practice,
this is likely a nop as getting to L1 with nested_run_pending is
impossible, and KVM doesn't correctly handle morphing a pending exception
that occurs on a prior injected exception (need for re-injected exception
being the other case where MTF isn't cleared).  However, KVM will
hopefully soon correctly deal with a pending exception on top of an
injected exception.

Add a TODO to document that KVM has an inversion priority bug between
SMIs and MTF (and trap-like #DBS), and that KVM also doesn't properly
save/restore MTF across SMI/RSM.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Link: https://lore.kernel.org/r/20220830231614.3580124-12-seanjc@google.com
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
arch/x86/kvm/vmx/nested.c

index 04585f5..cb1b3d1 100644 (file)
@@ -3951,16 +3951,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
        unsigned long exit_qual;
        bool block_nested_events =
            vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
-       bool mtf_pending = vmx->nested.mtf_pending;
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       /*
-        * Clear the MTF state. If a higher priority VM-exit is delivered first,
-        * this state is discarded.
-        */
-       if (!block_nested_events)
-               vmx->nested.mtf_pending = false;
-
        if (lapic_in_kernel(vcpu) &&
                test_bit(KVM_APIC_INIT, &apic->pending_events)) {
                if (block_nested_events)
@@ -3969,6 +3961,9 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
                clear_bit(KVM_APIC_INIT, &apic->pending_events);
                if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
                        nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
+
+               /* MTF is discarded if the vCPU is in WFS. */
+               vmx->nested.mtf_pending = false;
                return 0;
        }
 
@@ -3991,6 +3986,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
         * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
         * could theoretically come in from userspace), and ICEBP (INT1).
         *
+        * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
+        * for TSS T flag #DBs).  KVM also doesn't save/restore pending MTF
+        * across SMI/RSM as it should; that needs to be addressed in order to
+        * prioritize SMI over MTF and trap-like #DBs.
+        *
         * Note that only a pending nested run can block a pending exception.
         * Otherwise an injected NMI/interrupt should either be
         * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
@@ -4006,7 +4006,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       if (mtf_pending) {
+       if (vmx->nested.mtf_pending) {
                if (block_nested_events)
                        return -EBUSY;
                nested_vmx_update_pending_dbg(vcpu);
@@ -4603,6 +4603,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
+       /* Pending MTF traps are discarded on VM-Exit. */
+       vmx->nested.mtf_pending = false;
+
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);