KVM: nVMX: restore host state in nested_vmx_vmexit for VMFail

author Sean Christopherson <sean.j.christopherson@intel.com>

Wed, 22 Aug 2018 21:57:07 +0000 (14:57 -0700)

committer Paolo Bonzini <pbonzini@redhat.com>

Tue, 16 Oct 2018 22:29:39 +0000 (00:29 +0200)
author Sean Christopherson <sean.j.christopherson@intel.com>
Wed, 22 Aug 2018 21:57:07 +0000 (14:57 -0700)
committer Paolo Bonzini <pbonzini@redhat.com>
Tue, 16 Oct 2018 22:29:39 +0000 (00:29 +0200)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c

index 14d4463..8efce8e 100644 (file)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -13156,24 +13156,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
         kvm_clear_interrupt_queue(vcpu);
  }
  
-static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu,
-                       struct vmcs12 *vmcs12)
-{
-       u32 entry_failure_code;
-
-       nested_ept_uninit_mmu_context(vcpu);
-
-       /*
-        * Only PDPTE load can fail as the value of cr3 was checked on entry and
-        * couldn't have changed.
-        */
-       if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
-               nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
-
-       if (!enable_ept)
-               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-}
-
  /*
   * A part of what we need to when the nested L2 guest exits and we want to
   * run its L1 parent, is to reset L1's guest state to the host state specified
@@ -13187,6 +13169,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                                    struct vmcs12 *vmcs12)
  {
         struct kvm_segment seg;
+       u32 entry_failure_code;
  
         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
                 vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -13213,7 +13196,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
         vmx_set_cr4(vcpu, vmcs12->host_cr4);
  
-       load_vmcs12_mmu_host_state(vcpu, vmcs12);
+       nested_ept_uninit_mmu_context(vcpu);
+
+       /*
+        * Only PDPTE load can fail as the value of cr3 was checked on entry and
+        * couldn't have changed.
+        */
+       if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+               nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
+
+       if (!enable_ept)
+               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
  
         /*
          * If vmcs01 doesn't use VPID, CPU flushes TLB on every
@@ -13311,6 +13304,140 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
  }
  
+static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
+{
+       struct shared_msr_entry *efer_msr;
+       unsigned int i;
+
+       if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
+               return vmcs_read64(GUEST_IA32_EFER);
+
+       if (cpu_has_load_ia32_efer)
+               return host_efer;
+
+       for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
+               if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
+                       return vmx->msr_autoload.guest.val[i].value;
+       }
+
+       efer_msr = find_msr_entry(vmx, MSR_EFER);
+       if (efer_msr)
+               return efer_msr->data;
+
+       return host_efer;
+}
+
+static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct vmx_msr_entry g, h;
+       struct msr_data msr;
+       gpa_t gpa;
+       u32 i, j;
+
+       vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
+
+       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+               /*
+                * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
+                * as vmcs01.GUEST_DR7 contains a userspace defined value
+                * and vcpu->arch.dr7 is not squirreled away before the
+                * nested VMENTER (not worth adding a variable in nested_vmx).
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+                       kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+               else
+                       WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+       }
+
+       /*
+        * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+        * handle a variety of side effects to KVM's software model.
+        */
+       vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
+
+       vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+       vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
+
+       vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+       vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
+
+       nested_ept_uninit_mmu_context(vcpu);
+       vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+       __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+       /*
+        * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
+        * from vmcs01 (if necessary).  The PDPTRs are not loaded on
+        * VMFail, like everything else we just need to ensure our
+        * software model is up-to-date.
+        */
+       ept_save_pdptrs(vcpu);
+
+       kvm_mmu_reset_context(vcpu);
+
+       if (cpu_has_vmx_msr_bitmap())
+               vmx_update_msr_bitmap(vcpu);
+
+       /*
+        * This nasty bit of open coding is a compromise between blindly
+        * loading L1's MSRs using the exit load lists (incorrect emulation
+        * of VMFail), leaving the nested VM's MSRs in the software model
+        * (incorrect behavior) and snapshotting the modified MSRs (too
+        * expensive since the lists are unbound by hardware).  For each
+        * MSR that was (prematurely) loaded from the nested VMEntry load
+        * list, reload it from the exit load list if it exists and differs
+        * from the guest value.  The intent is to stuff host state as
+        * silently as possible, not to fully process the exit load list.
+        */
+       msr.host_initiated = false;
+       for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
+               gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
+               if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
+                       pr_debug_ratelimited(
+                               "%s read MSR index failed (%u, 0x%08llx)\n",
+                               __func__, i, gpa);
+                       goto vmabort;
+               }
+
+               for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
+                       gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
+                       if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
+                               pr_debug_ratelimited(
+                                       "%s read MSR failed (%u, 0x%08llx)\n",
+                                       __func__, j, gpa);
+                               goto vmabort;
+                       }
+                       if (h.index != g.index)
+                               continue;
+                       if (h.value == g.value)
+                               break;
+
+                       if (nested_vmx_load_msr_check(vcpu, &h)) {
+                               pr_debug_ratelimited(
+                                       "%s check failed (%u, 0x%x, 0x%x)\n",
+                                       __func__, j, h.index, h.reserved);
+                               goto vmabort;
+                       }
+
+                       msr.index = h.index;
+                       msr.data = h.value;
+                       if (kvm_set_msr(vcpu, &msr)) {
+                               pr_debug_ratelimited(
+                                       "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
+                                       __func__, j, h.index, h.value);
+                               goto vmabort;
+                       }
+               }
+       }
+
+       return;
+
+vmabort:
+       nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+}
+
  /*
   * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
   * and modify vmcs12 to make it see what it would expect to see there if
@@ -13455,7 +13582,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
          */
         nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
  
-       load_vmcs12_mmu_host_state(vcpu, vmcs12);
+       /*
+        * Restore L1's host state to KVM's software model.  We're here
+        * because a consistency check was caught by hardware, which
+        * means some amount of guest state has been propagated to KVM's
+        * model and needs to be unwound to the host's state.
+        */
+       nested_vmx_restore_host_state(vcpu);
  
         /*
          * The emulated instruction was already skipped in
author	Sean Christopherson <sean.j.christopherson@intel.com>
	Wed, 22 Aug 2018 21:57:07 +0000 (14:57 -0700)
committer	Paolo Bonzini <pbonzini@redhat.com>
	Tue, 16 Oct 2018 22:29:39 +0000 (00:29 +0200)