KVM: x86/mmu: Use EMULTYPE flag to track write #PFs to shadow pages
author: Sean Christopherson <seanjc@google.com>
Thu, 2 Feb 2023 18:28:15 +0000 (18:28 +0000)
committer: Paolo Bonzini <pbonzini@redhat.com>
Tue, 14 Mar 2023 14:28:56 +0000 (10:28 -0400)
Use a new EMULTYPE flag, EMULTYPE_WRITE_PF_TO_SP, to track page faults
on self-changing writes to shadowed page tables instead of propagating
that information to the emulator via a semi-persistent vCPU flag.  Using
a flag in "struct kvm_vcpu_arch" is confusing, especially as implemented,
as it's not at all obvious that clearing the flag only when emulation
actually occurs is correct.

E.g. if KVM sets the flag and then retries the fault without ever getting
to the emulator, the flag will be left set for future calls into the
emulator.  But because the flag is consumed if and only if both
EMULTYPE_PF and EMULTYPE_ALLOW_RETRY_PF are set, and because
EMULTYPE_ALLOW_RETRY_PF is deliberately not set for direct MMUs, emulated
MMIO, or while L2 is active, KVM avoids false positives on a stale flag
since FNAME(page_fault) is guaranteed to be run and refresh the flag
before it's ultimately consumed by the tail end of reexecute_instruction().

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20230202182817.407394-2-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmu_internal.h
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/x86.c

index 808c292..a45de11 100644 (file)
@@ -947,23 +947,6 @@ struct kvm_vcpu_arch {
 
        u64 msr_kvm_poll_control;
 
-       /*
-        * Indicates the guest is trying to write a gfn that contains one or
-        * more of the PTEs used to translate the write itself, i.e. the access
-        * is changing its own translation in the guest page tables.  KVM exits
-        * to userspace if emulation of the faulting instruction fails and this
-        * flag is set, as KVM cannot make forward progress.
-        *
-        * If emulation fails for a write to guest page tables, KVM unprotects
-        * (zaps) the shadow page for the target gfn and resumes the guest to
-        * retry the non-emulatable instruction (on hardware).  Unprotecting the
-        * gfn doesn't allow forward progress for a self-changing access because
-        * doing so also zaps the translation for the gfn, i.e. retrying the
-        * instruction will hit a !PRESENT fault, which results in a new shadow
-        * page and sends KVM back to square one.
-        */
-       bool write_fault_to_shadow_pgtable;
-
        /* set at EPT violation at this point */
        unsigned long exit_qualification;
 
@@ -1907,6 +1890,25 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
  * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility
  *                              state and inject single-step #DBs after skipping
  *                              an instruction (after completing userspace I/O).
+ *
+ * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that
+ *                          is attempting to write a gfn that contains one or
+ *                          more of the PTEs used to translate the write itself,
+ *                          and the owning page table is being shadowed by KVM.
+ *                          If emulation of the faulting instruction fails and
+ *                          this flag is set, KVM will exit to userspace instead
+ *                          of retrying emulation as KVM cannot make forward
+ *                          progress.
+ *
+ *                          If emulation fails for a write to guest page tables,
+ *                          KVM unprotects (zaps) the shadow page for the target
+ *                          gfn and resumes the guest to retry the non-emulatable
+ *                          instruction (on hardware).  Unprotecting the gfn
+ *                          doesn't allow forward progress for a self-changing
+ *                          access because doing so also zaps the translation for
+ *                          the gfn, i.e. retrying the instruction will hit a
+ *                          !PRESENT fault, which results in a new shadow page
+ *                          and sends KVM back to square one.
  */
 #define EMULTYPE_NO_DECODE         (1 << 0)
 #define EMULTYPE_TRAP_UD           (1 << 1)
@@ -1916,6 +1918,7 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
 #define EMULTYPE_VMWARE_GP         (1 << 5)
 #define EMULTYPE_PF                (1 << 6)
 #define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
+#define EMULTYPE_WRITE_PF_TO_SP            (1 << 8)
 
 int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
 int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
index c8ebe54..144c5a0 100644 (file)
@@ -4203,7 +4203,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
              work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
                return;
 
-       kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
+       kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
 }
 
 static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -5664,7 +5664,8 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
 
        if (r == RET_PF_INVALID) {
                r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
-                                         lower_32_bits(error_code), false);
+                                         lower_32_bits(error_code), false,
+                                         &emulation_type);
                if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
                        return -EIO;
        }
index cc58631..2cbb155 100644 (file)
@@ -240,6 +240,13 @@ struct kvm_page_fault {
        kvm_pfn_t pfn;
        hva_t hva;
        bool map_writable;
+
+       /*
+        * Indicates the guest is trying to write a gfn that contains one or
+        * more of the PTEs used to translate the write itself, i.e. the access
+        * is changing its own translation in the guest page tables.
+        */
+       bool write_fault_to_shadow_pgtable;
 };
 
 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
@@ -273,7 +280,7 @@ enum {
 };
 
 static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-                                       u32 err, bool prefetch)
+                                       u32 err, bool prefetch, int *emulation_type)
 {
        struct kvm_page_fault fault = {
                .addr = cr2_or_gpa,
@@ -312,6 +319,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        else
                r = vcpu->arch.mmu->page_fault(vcpu, &fault);
 
+       if (fault.write_fault_to_shadow_pgtable && emulation_type)
+               *emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
+
        /*
         * Similar to above, prefetch faults aren't truly spurious, and the
         * async #PF path doesn't do emulation.  Do count faults that are fixed
index 57f0b75..5d29582 100644 (file)
@@ -825,10 +825,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
        if (r)
                return r;
 
-       vcpu->arch.write_fault_to_shadow_pgtable = false;
-
        is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
-             &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable);
+             &walker, fault->user, &fault->write_fault_to_shadow_pgtable);
 
        if (is_self_change_mapping)
                fault->max_level = PG_LEVEL_4K;
index 7713420..ff7f398 100644 (file)
@@ -8463,7 +8463,6 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
 }
 
 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-                                 bool write_fault_to_shadow_pgtable,
                                  int emulation_type)
 {
        gpa_t gpa = cr2_or_gpa;
@@ -8534,7 +8533,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
         * be fixed by unprotecting shadow page and it should
         * be reported to userspace.
         */
-       return !write_fault_to_shadow_pgtable;
+       return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
 }
 
 static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -8782,20 +8781,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        int r;
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        bool writeback = true;
-       bool write_fault_to_spt;
 
        if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
                return 1;
 
        vcpu->arch.l1tf_flush_l1d = true;
 
-       /*
-        * Clear write_fault_to_shadow_pgtable here to ensure it is
-        * never reused.
-        */
-       write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
-       vcpu->arch.write_fault_to_shadow_pgtable = false;
-
        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
                kvm_clear_exception_queue(vcpu);
 
@@ -8816,7 +8807,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                                return 1;
                        }
                        if (reexecute_instruction(vcpu, cr2_or_gpa,
-                                                 write_fault_to_spt,
                                                  emulation_type))
                                return 1;
 
@@ -8895,8 +8885,7 @@ restart:
                return 1;
 
        if (r == EMULATION_FAILED) {
-               if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
-                                       emulation_type))
+               if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
                        return 1;
 
                return handle_emulation_failure(vcpu, emulation_type);