Merge tag 'kvmarm-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm...
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c (platform/kernel/linux-starfive.git)
index 40b1e61..a7dd678 100644
@@ -435,7 +435,6 @@ static const struct kvm_vmx_segment_field {
        VMX_SEGMENT_FIELD(LDTR),
 };
 
-u64 host_efer;
 static unsigned long host_idt_base;
 
 /*
@@ -656,53 +655,16 @@ static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr,
        return ret;
 }
 
-void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
-{
-       vmcs_clear(loaded_vmcs->vmcs);
-       if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
-               vmcs_clear(loaded_vmcs->shadow_vmcs);
-       loaded_vmcs->cpu = -1;
-       loaded_vmcs->launched = 0;
-}
-
 #ifdef CONFIG_KEXEC_CORE
-/*
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
- */
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-
-static inline void crash_enable_local_vmclear(int cpu)
-{
-       cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline void crash_disable_local_vmclear(int cpu)
-{
-       cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline int crash_local_vmclear_enabled(int cpu)
-{
-       return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
 static void crash_vmclear_local_loaded_vmcss(void)
 {
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v;
 
-       if (!crash_local_vmclear_enabled(cpu))
-               return;
-
        list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
                            loaded_vmcss_on_cpu_link)
                vmcs_clear(v->vmcs);
 }
-#else
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
 #endif /* CONFIG_KEXEC_CORE */
 
 static void __loaded_vmcs_clear(void *arg)
@@ -714,19 +676,24 @@ static void __loaded_vmcs_clear(void *arg)
                return; /* vcpu migration can race with cpu offline */
        if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;
-       crash_disable_local_vmclear(cpu);
+
+       vmcs_clear(loaded_vmcs->vmcs);
+       if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+               vmcs_clear(loaded_vmcs->shadow_vmcs);
+
        list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
 
        /*
-        * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
-        * is before setting loaded_vmcs->vcpu to -1 which is done in
-        * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
-        * then adds the vmcs into percpu list before it is deleted.
+        * Ensure all writes to loaded_vmcs, including deleting it from its
+        * current percpu list, complete before setting loaded_vmcs->vcpu to
+        * -1, otherwise a different cpu can see vcpu == -1 first and add
+        * loaded_vmcs to its percpu list before it's deleted from this cpu's
+        * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
         */
        smp_wmb();
 
-       loaded_vmcs_init(loaded_vmcs);
-       crash_enable_local_vmclear(cpu);
+       loaded_vmcs->cpu = -1;
+       loaded_vmcs->launched = 0;
 }
 
 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -810,7 +777,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
        if (to_vmx(vcpu)->rmode.vm86_active)
                eb = ~0;
        if (enable_ept)
-               eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+               eb &= ~(1u << PF_VECTOR);
 
        /* When we are running a nested L2 guest and L1 specified for it a
         * certain exception bitmap, we must trap the same exceptions and pass
@@ -1061,7 +1028,7 @@ static unsigned long segment_base(u16 selector)
 
 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
 {
-       return (pt_mode == PT_MODE_HOST_GUEST) &&
+       return vmx_pt_mode_is_host_guest() &&
               !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
 }
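The open-coded pt_mode comparisons are replaced by helpers throughout this diff. As a reference sketch (assumed to live on the vmx.h side of this series), the helpers are expected to be thin wrappers around the existing pt_mode module parameter:

        static inline bool vmx_pt_mode_is_system(void)
        {
                return pt_mode == PT_MODE_SYSTEM;
        }

        static inline bool vmx_pt_mode_is_host_guest(void)
        {
                return pt_mode == PT_MODE_HOST_GUEST;
        }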
 
@@ -1095,7 +1062,7 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
 
 static void pt_guest_enter(struct vcpu_vmx *vmx)
 {
-       if (pt_mode == PT_MODE_SYSTEM)
+       if (vmx_pt_mode_is_system())
                return;
 
        /*
@@ -1112,7 +1079,7 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
 
 static void pt_guest_exit(struct vcpu_vmx *vmx)
 {
-       if (pt_mode == PT_MODE_SYSTEM)
+       if (vmx_pt_mode_is_system())
                return;
 
        if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
@@ -1345,18 +1312,17 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
        if (!already_loaded) {
                loaded_vmcs_clear(vmx->loaded_vmcs);
                local_irq_disable();
-               crash_disable_local_vmclear(cpu);
 
                /*
-                * Read loaded_vmcs->cpu should be before fetching
-                * loaded_vmcs->loaded_vmcss_on_cpu_link.
-                * See the comments in __loaded_vmcs_clear().
+                * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+                * this cpu's percpu list, otherwise it may not yet be deleted
+                * from its previous cpu's percpu list.  Pairs with the
+                * smp_wmb() in __loaded_vmcs_clear().
                 */
                smp_rmb();
 
                list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
-               crash_enable_local_vmclear(cpu);
                local_irq_enable();
        }
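The smp_wmb() in __loaded_vmcs_clear() and the smp_rmb() above form a standard publish/consume pairing: the clearing CPU must complete the list_del() before advertising loaded_vmcs->cpu == -1, and the loading CPU must observe cpu == -1 (and therefore the completed deletion) before doing list_add() on its own percpu list. A minimal userspace analogue with C11 fences, as a sketch only (names and the atomics mapping are illustrative, not kernel code):

        #include <stdatomic.h>
        #include <stdbool.h>

        struct vmcs_node { bool on_old_list; };

        static struct vmcs_node node = { .on_old_list = true };
        static atomic_int owner_cpu = 1;                     /* -1: not on any cpu's list */

        static void clear_on_old_cpu(void)                   /* ~ __loaded_vmcs_clear()   */
        {
                node.on_old_list = false;                    /* ~ list_del()              */
                atomic_thread_fence(memory_order_release);   /* ~ smp_wmb()               */
                atomic_store_explicit(&owner_cpu, -1, memory_order_relaxed);
        }

        static bool load_on_new_cpu(void)                    /* ~ vmx_vcpu_load_vmcs()    */
        {
                if (atomic_load_explicit(&owner_cpu, memory_order_relaxed) != -1)
                        return false;                        /* would VMCLEAR via IPI     */
                atomic_thread_fence(memory_order_acquire);   /* ~ smp_rmb()               */
                return !node.on_old_list;                    /* safe to list_add() here   */
        }

If the load observes -1, the acquire fence guarantees it also observes the completed deletion, which is exactly the property the rewritten comments rely on.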
 
@@ -1689,16 +1655,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
        vmx_clear_hlt(vcpu);
 }
 
-static bool vmx_rdtscp_supported(void)
-{
-       return cpu_has_vmx_rdtscp();
-}
-
-static bool vmx_invpcid_supported(void)
-{
-       return cpu_has_vmx_invpcid();
-}
-
 /*
  * Swap MSR entry in host/guest MSR entry array.
  */
@@ -1906,24 +1862,24 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                                                        &msr_info->data);
                break;
        case MSR_IA32_RTIT_CTL:
-               if (pt_mode != PT_MODE_HOST_GUEST)
+               if (!vmx_pt_mode_is_host_guest())
                        return 1;
                msr_info->data = vmx->pt_desc.guest.ctl;
                break;
        case MSR_IA32_RTIT_STATUS:
-               if (pt_mode != PT_MODE_HOST_GUEST)
+               if (!vmx_pt_mode_is_host_guest())
                        return 1;
                msr_info->data = vmx->pt_desc.guest.status;
                break;
        case MSR_IA32_RTIT_CR3_MATCH:
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        !intel_pt_validate_cap(vmx->pt_desc.caps,
                                                PT_CAP_cr3_filtering))
                        return 1;
                msr_info->data = vmx->pt_desc.guest.cr3_match;
                break;
        case MSR_IA32_RTIT_OUTPUT_BASE:
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_topa_output) &&
                         !intel_pt_validate_cap(vmx->pt_desc.caps,
@@ -1932,7 +1888,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = vmx->pt_desc.guest.output_base;
                break;
        case MSR_IA32_RTIT_OUTPUT_MASK:
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_topa_output) &&
                         !intel_pt_validate_cap(vmx->pt_desc.caps,
@@ -1942,7 +1898,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
                index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_num_address_ranges)))
                        return 1;
@@ -2148,7 +2104,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                return vmx_set_vmx_msr(vcpu, msr_index, data);
        case MSR_IA32_RTIT_CTL:
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        vmx_rtit_ctl_check(vcpu, data) ||
                        vmx->nested.vmxon)
                        return 1;
@@ -2264,18 +2220,33 @@ static __init int vmx_disabled_by_bios(void)
               !boot_cpu_has(X86_FEATURE_VMX);
 }
 
-static void kvm_cpu_vmxon(u64 addr)
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
 {
+       u64 msr;
+
        cr4_set_bits(X86_CR4_VMXE);
        intel_pt_handle_vmx(1);
 
-       asm volatile ("vmxon %0" : : "m"(addr));
+       asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+                         _ASM_EXTABLE(1b, %l[fault])
+                         : : [vmxon_pointer] "m"(vmxon_pointer)
+                         : : fault);
+       return 0;
+
+fault:
+       WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+                 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+       intel_pt_handle_vmx(0);
+       cr4_clear_bits(X86_CR4_VMXE);
+
+       return -EFAULT;
 }
 
 static int hardware_enable(void)
 {
        int cpu = raw_smp_processor_id();
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+       int r;
 
        if (cr4_read_shadow() & X86_CR4_VMXE)
                return -EBUSY;
@@ -2292,18 +2263,10 @@ static int hardware_enable(void)
        INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
        spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 
-       /*
-        * Now we can enable the vmclear operation in kdump
-        * since the loaded_vmcss_on_cpu list on this cpu
-        * has been initialized.
-        *
-        * Though the cpu is not in VMX operation now, there
-        * is no problem to enable the vmclear operation
-        * for the loaded_vmcss_on_cpu list is empty!
-        */
-       crash_enable_local_vmclear(cpu);
+       r = kvm_cpu_vmxon(phys_addr);
+       if (r)
+               return r;
 
-       kvm_cpu_vmxon(phys_addr);
        if (enable_ept)
                ept_sync_global();
 
@@ -2603,9 +2566,12 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
        if (!loaded_vmcs->vmcs)
                return -ENOMEM;
 
+       vmcs_clear(loaded_vmcs->vmcs);
+
        loaded_vmcs->shadow_vmcs = NULL;
        loaded_vmcs->hv_timer_soft_disabled = false;
-       loaded_vmcs_init(loaded_vmcs);
+       loaded_vmcs->cpu = -1;
+       loaded_vmcs->launched = 0;
 
        if (cpu_has_vmx_msr_bitmap()) {
                loaded_vmcs->msr_bitmap = (unsigned long *)
@@ -2987,9 +2953,8 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static int get_ept_level(struct kvm_vcpu *vcpu)
 {
-       /* Nested EPT currently only supports 4-level walks. */
        if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
-               return 4;
+               return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
        if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
                return 5;
        return 4;
@@ -3009,7 +2974,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
        return eptp;
 }
 
-void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
@@ -4026,7 +3991,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 
        u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
 
-       if (pt_mode == PT_MODE_SYSTEM)
+       if (vmx_pt_mode_is_system())
                exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
        if (!cpu_need_virtualize_apic_accesses(vcpu))
                exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
@@ -4081,7 +4046,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
                }
        }
 
-       if (vmx_rdtscp_supported()) {
+       if (cpu_has_vmx_rdtscp()) {
                bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
                if (!rdtscp_enabled)
                        exec_control &= ~SECONDARY_EXEC_RDTSCP;
@@ -4096,7 +4061,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
                }
        }
 
-       if (vmx_invpcid_supported()) {
+       if (cpu_has_vmx_invpcid()) {
                /* Exposing INVPCID only when PCID is exposed */
                bool invpcid_enabled =
                        guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
@@ -4267,7 +4232,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
        if (cpu_has_vmx_encls_vmexit())
                vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
 
-       if (pt_mode == PT_MODE_HOST_GUEST) {
+       if (vmx_pt_mode_is_host_guest()) {
                memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
                /* Bit[6~0] are forced to 1, writes are ignored. */
                vmx->pt_desc.guest.output_mask = 0x7F;
@@ -4495,8 +4460,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-       return (!to_vmx(vcpu)->nested.nested_run_pending &&
-               vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+       if (to_vmx(vcpu)->nested.nested_run_pending)
+               return false;
+
+       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+               return true;
+
+       return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
@@ -4552,7 +4522,6 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
        case GP_VECTOR:
        case MF_VECTOR:
                return true;
-       break;
        }
        return false;
 }
@@ -5329,7 +5298,6 @@ static void vmx_enable_tdp(void)
                VMX_EPT_RWX_MASK, 0ull);
 
        ept_set_mmio_spte_mask();
-       kvm_enable_tdp();
 }
 
 /*
@@ -5862,8 +5830,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
 
-       if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
-               return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+       if (is_guest_mode(vcpu)) {
+               /*
+                * The host physical addresses of some pages of guest memory
+                * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+                * Page). The CPU may write to these pages via their host
+                * physical address while L2 is running, bypassing any
+                * address-translation-based dirty tracking (e.g. EPT write
+                * protection).
+                *
+                * Mark them dirty on every exit from L2 to prevent them from
+                * getting out of sync with dirty tracking.
+                */
+               nested_mark_vmcs12_pages_dirty(vcpu);
+
+               if (nested_vmx_exit_reflected(vcpu, exit_reason))
+                       return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+       }
 
        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
                dump_vmcs();
@@ -6223,15 +6206,13 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
        vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
        /* if exit due to PF check for async PF */
-       if (is_page_fault(vmx->exit_intr_info))
+       if (is_page_fault(vmx->exit_intr_info)) {
                vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
-
        /* Handle machine checks before interrupts are enabled */
-       if (is_machine_check(vmx->exit_intr_info))
+       } else if (is_machine_check(vmx->exit_intr_info)) {
                kvm_machine_check();
-
        /* We need to handle NMIs before interrupts are enabled */
-       if (is_nmi(vmx->exit_intr_info)) {
+       } else if (is_nmi(vmx->exit_intr_info)) {
                kvm_before_interrupt(&vmx->vcpu);
                asm("int $2");
                kvm_after_interrupt(&vmx->vcpu);
@@ -6317,11 +6298,6 @@ static bool vmx_has_emulated_msr(int index)
        }
 }
 
-static bool vmx_pt_supported(void)
-{
-       return pt_mode == PT_MODE_HOST_GUEST;
-}
-
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -6567,7 +6543,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        pt_guest_enter(vmx);
 
-       atomic_switch_perf_msrs(vmx);
+       if (vcpu_to_pmu(vcpu)->version)
+               atomic_switch_perf_msrs(vmx);
        atomic_switch_umwait_control_msr(vmx);
 
        if (enable_preemption_timer)
@@ -6684,20 +6661,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
        vmx_complete_interrupts(vmx);
 }
 
-static struct kvm *vmx_vm_alloc(void)
-{
-       struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
-                                           GFP_KERNEL_ACCOUNT | __GFP_ZERO,
-                                           PAGE_KERNEL);
-       return &kvm_vmx->kvm;
-}
-
-static void vmx_vm_free(struct kvm *kvm)
-{
-       kfree(kvm->arch.hyperv.hv_pa_pg);
-       vfree(to_kvm_vmx(kvm));
-}
-
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6900,17 +6863,24 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
        u8 cache;
        u64 ipat = 0;
 
-       /* For VT-d and EPT combination
-        * 1. MMIO: always map as UC
-        * 2. EPT with VT-d:
-        *   a. VT-d without snooping control feature: can't guarantee the
-        *      result, try to trust guest.
-        *   b. VT-d with snooping control feature: snooping control feature of
-        *      VT-d engine can guarantee the cache correctness. Just set it
-        *      to WB to keep consistent with host. So the same as item 3.
-        * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
-        *    consistent with host MTRR
+       /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
+        * memory aliases with conflicting memory types and sometimes MCEs.
+        * We have to be careful as to what are honored and when.
+        *
+        * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
+        * UC.  The effective memory type is UC or WC depending on guest PAT.
+        * This was historically the source of MCEs and we want to be
+        * conservative.
+        *
+        * When there is no need to deal with noncoherent DMA (e.g., no VT-d
+        * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
+        * EPT memory type is set to WB.  The effective memory type is forced
+        * WB.
+        *
+        * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
+        * EPT memory type is used to emulate guest CD/MTRR.
         */
+
        if (is_mmio) {
                cache = MTRR_TYPE_UNCACHABLE;
                goto exit;
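Condensed into code form, the policy described by the rewritten comment is roughly the following (an illustrative sketch, not the actual function; the CR0.CD handling is folded into guest_type here):

        static u8 ept_memtype(bool is_mmio, bool noncoherent_dma, u8 guest_type, u64 *ipat)
        {
                *ipat = 0;                           /* guest PAT still applies          */
                if (is_mmio)
                        return MTRR_TYPE_UNCACHABLE; /* effective UC or WC per guest PAT */
                if (!noncoherent_dma) {
                        *ipat = VMX_EPT_IPAT_BIT;    /* ignore guest PAT, force WB       */
                        return MTRR_TYPE_WRBACK;
                }
                return guest_type;                   /* honor guest CD/MTRR              */
        }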
@@ -6937,15 +6907,6 @@ exit:
        return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
 }
 
-static int vmx_get_lpage_level(void)
-{
-       if (enable_ept && !cpu_has_vmx_ept_1g_page())
-               return PT_DIRECTORY_LEVEL;
-       else
-               /* For shadow and EPT supported 1GB page */
-               return PT_PDPE_LEVEL;
-}
-
 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
 {
        /*
@@ -7136,10 +7097,37 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
        }
 }
 
-static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+static __init void vmx_set_cpu_caps(void)
 {
-       if (func == 1 && nested)
-               entry->ecx |= feature_bit(VMX);
+       kvm_set_cpu_caps();
+
+       /* CPUID 0x1 */
+       if (nested)
+               kvm_cpu_cap_set(X86_FEATURE_VMX);
+
+       /* CPUID 0x7 */
+       if (kvm_mpx_supported())
+               kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
+       if (cpu_has_vmx_invpcid())
+               kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
+       if (vmx_pt_mode_is_host_guest())
+               kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+
+       /* PKU is not yet implemented for shadow paging. */
+       if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE))
+               kvm_cpu_cap_check_and_set(X86_FEATURE_PKU);
+
+       if (vmx_umip_emulated())
+               kvm_cpu_cap_set(X86_FEATURE_UMIP);
+
+       /* CPUID 0xD.1 */
+       supported_xss = 0;
+       if (!vmx_xsaves_supported())
+               kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
+
+       /* CPUID 0x80000001 */
+       if (!cpu_has_vmx_rdtscp())
+               kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
 }
 
 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
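vmx_set_cpu_caps() builds on the common kvm_cpu_cap_*() helpers introduced alongside this change: kvm_cpu_cap_set() advertises a bit unconditionally (VMX, UMIP emulation), while kvm_cpu_cap_check_and_set() only advertises a feature the host CPU actually reports, roughly (a sketch of the assumed helper on the cpuid.h side):

        static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature)
        {
                if (boot_cpu_has(x86_feature))
                        kvm_cpu_cap_set(x86_feature);
        }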
@@ -7183,10 +7171,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
 
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
-                              enum x86_intercept_stage stage)
+                              enum x86_intercept_stage stage,
+                              struct x86_exception *exception)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 
        switch (info->intercept) {
        /*
@@ -7195,8 +7183,8 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
         */
        case x86_intercept_rdtscp:
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
-                       ctxt->exception.vector = UD_VECTOR;
-                       ctxt->exception.error_code_valid = false;
+                       exception->vector = UD_VECTOR;
+                       exception->error_code_valid = false;
                        return X86EMUL_PROPAGATE_FAULT;
                }
                break;
@@ -7307,7 +7295,8 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_slot_enable_log_dirty(struct kvm *kvm,
                                     struct kvm_memory_slot *slot)
 {
-       kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
+       if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
+               kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
        kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
 }
 
@@ -7661,9 +7650,7 @@ static __init int hardware_setup(void)
 {
        unsigned long host_bndcfgs;
        struct desc_ptr dt;
-       int r, i;
-
-       rdmsrl_safe(MSR_EFER, &host_efer);
+       int r, i, ept_lpage_level;
 
        store_idt(&dt);
        host_idt_base = dt.address;
@@ -7682,6 +7669,10 @@ static __init int hardware_setup(void)
                WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
        }
 
+       if (!cpu_has_vmx_mpx())
+               supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+                                   XFEATURE_MASK_BNDCSR);
+
        if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
            !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
                enable_vpid = 0;
@@ -7715,9 +7706,6 @@ static __init int hardware_setup(void)
        if (!cpu_has_vmx_tpr_shadow())
                kvm_x86_ops->update_cr8_intercept = NULL;
 
-       if (enable_ept && !cpu_has_vmx_ept_2m_page())
-               kvm_disable_largepages();
-
 #if IS_ENABLED(CONFIG_HYPERV)
        if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
            && enable_ept) {
@@ -7750,8 +7738,16 @@ static __init int hardware_setup(void)
 
        if (enable_ept)
                vmx_enable_tdp();
+
+       if (!enable_ept)
+               ept_lpage_level = 0;
+       else if (cpu_has_vmx_ept_1g_page())
+               ept_lpage_level = PT_PDPE_LEVEL;
+       else if (cpu_has_vmx_ept_2m_page())
+               ept_lpage_level = PT_DIRECTORY_LEVEL;
        else
-               kvm_disable_tdp();
+               ept_lpage_level = PT_PAGE_TABLE_LEVEL;
+       kvm_configure_mmu(enable_ept, ept_lpage_level);
 
        /*
         * Only enable PML when hardware supports PML feature, and both EPT
@@ -7815,6 +7811,8 @@ static __init int hardware_setup(void)
                        return r;
        }
 
+       vmx_set_cpu_caps();
+
        r = alloc_kvm_area();
        if (r)
                nested_vmx_hardware_unsetup();
@@ -7848,9 +7846,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .cpu_has_accelerated_tpr = report_flexpriority,
        .has_emulated_msr = vmx_has_emulated_msr,
 
+       .vm_size = sizeof(struct kvm_vmx),
        .vm_init = vmx_vm_init,
-       .vm_alloc = vmx_vm_alloc,
-       .vm_free = vmx_vm_free,
 
        .vcpu_create = vmx_create_vcpu,
        .vcpu_free = vmx_free_vcpu,
@@ -7872,7 +7869,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
        .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
        .set_cr0 = vmx_set_cr0,
-       .set_cr3 = vmx_set_cr3,
        .set_cr4 = vmx_set_cr4,
        .set_efer = vmx_set_efer,
        .get_idt = vmx_get_idt,
@@ -7928,29 +7924,17 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
        .get_exit_info = vmx_get_exit_info,
 
-       .get_lpage_level = vmx_get_lpage_level,
-
        .cpuid_update = vmx_cpuid_update,
 
-       .rdtscp_supported = vmx_rdtscp_supported,
-       .invpcid_supported = vmx_invpcid_supported,
-
-       .set_supported_cpuid = vmx_set_supported_cpuid,
-
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
        .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
        .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
 
-       .set_tdp_cr3 = vmx_set_cr3,
+       .load_mmu_pgd = vmx_load_mmu_pgd,
 
        .check_intercept = vmx_check_intercept,
        .handle_exit_irqoff = vmx_handle_exit_irqoff,
-       .mpx_supported = vmx_mpx_supported,
-       .xsaves_supported = vmx_xsaves_supported,
-       .umip_emulated = vmx_umip_emulated,
-       .pt_supported = vmx_pt_supported,
-       .pku_supported = vmx_pku_supported,
 
        .request_immediate_exit = vmx_request_immediate_exit,