Merge tag 'kvmarm-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm...
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c (platform/kernel/linux-starfive.git)
index 40b1e61..a7dd678 100644
@@ -435,7 +435,6 @@ static const struct kvm_vmx_segment_field {
        VMX_SEGMENT_FIELD(LDTR),
 };
 
-u64 host_efer;
 static unsigned long host_idt_base;
 
 /*
@@ -656,53 +655,16 @@ static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr,
        return ret;
 }
 
-void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
-{
-       vmcs_clear(loaded_vmcs->vmcs);
-       if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
-               vmcs_clear(loaded_vmcs->shadow_vmcs);
-       loaded_vmcs->cpu = -1;
-       loaded_vmcs->launched = 0;
-}
-
 #ifdef CONFIG_KEXEC_CORE
-/*
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
- */
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-
-static inline void crash_enable_local_vmclear(int cpu)
-{
-       cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline void crash_disable_local_vmclear(int cpu)
-{
-       cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline int crash_local_vmclear_enabled(int cpu)
-{
-       return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
 static void crash_vmclear_local_loaded_vmcss(void)
 {
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v;
 
-       if (!crash_local_vmclear_enabled(cpu))
-               return;
-
        list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
                            loaded_vmcss_on_cpu_link)
                vmcs_clear(v->vmcs);
 }
-#else
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
 #endif /* CONFIG_KEXEC_CORE */
 
 static void __loaded_vmcs_clear(void *arg)
@@ -714,19 +676,24 @@ static void __loaded_vmcs_clear(void *arg)
                return; /* vcpu migration can race with cpu offline */
        if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;
-       crash_disable_local_vmclear(cpu);
+
+       vmcs_clear(loaded_vmcs->vmcs);
+       if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+               vmcs_clear(loaded_vmcs->shadow_vmcs);
+
        list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
 
        /*
-        * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
-        * is before setting loaded_vmcs->vcpu to -1 which is done in
-        * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
-        * then adds the vmcs into percpu list before it is deleted.
+        * Ensure all writes to loaded_vmcs, including deleting it from its
+        * current percpu list, complete before setting loaded_vmcs->vcpu to
+        * -1, otherwise a different cpu can see vcpu == -1 first and add
+        * loaded_vmcs to its percpu list before it's deleted from this cpu's
+        * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
         */
        smp_wmb();
 
-       loaded_vmcs_init(loaded_vmcs);
-       crash_enable_local_vmclear(cpu);
+       loaded_vmcs->cpu = -1;
+       loaded_vmcs->launched = 0;
 }
 
 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -810,7 +777,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
        if (to_vmx(vcpu)->rmode.vm86_active)
                eb = ~0;
        if (enable_ept)
-               eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+               eb &= ~(1u << PF_VECTOR);
 
        /* When we are running a nested L2 guest and L1 specified for it a
         * certain exception bitmap, we must trap the same exceptions and pass
@@ -1061,7 +1028,7 @@ static unsigned long segment_base(u16 selector)
 
 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
 {
-       return (pt_mode == PT_MODE_HOST_GUEST) &&
+       return vmx_pt_mode_is_host_guest() &&
               !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
 }
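The open-coded pt_mode comparisons are replaced by helpers throughout this diff. As a reference sketch (assumed to live on the vmx.h side of this series), the helpers are expected to be thin wrappers around the existing pt_mode module parameter:

        static inline bool vmx_pt_mode_is_system(void)
        {
                return pt_mode == PT_MODE_SYSTEM;
        }

        static inline bool vmx_pt_mode_is_host_guest(void)
        {
                return pt_mode == PT_MODE_HOST_GUEST;
        }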
 
@@ -1095,7 +1062,7 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
 
 static void pt_guest_enter(struct vcpu_vmx *vmx)
 {
-       if (pt_mode == PT_MODE_SYSTEM)
+       if (vmx_pt_mode_is_system())
                return;
 
        /*
@@ -1112,7 +1079,7 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
 
 static void pt_guest_exit(struct vcpu_vmx *vmx)
 {
-       if (pt_mode == PT_MODE_SYSTEM)
+       if (vmx_pt_mode_is_system())
                return;
 
        if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
@@ -1345,18 +1312,17 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
        if (!already_loaded) {
                loaded_vmcs_clear(vmx->loaded_vmcs);
                local_irq_disable();
-               crash_disable_local_vmclear(cpu);
 
                /*
-                * Read loaded_vmcs->cpu should be before fetching
-                * loaded_vmcs->loaded_vmcss_on_cpu_link.
-                * See the comments in __loaded_vmcs_clear().
+                * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+                * this cpu's percpu list, otherwise it may not yet be deleted
+                * from its previous cpu's percpu list.  Pairs with the
+                * smp_wmb() in __loaded_vmcs_clear().
                 */
                smp_rmb();
 
                list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
-               crash_enable_local_vmclear(cpu);
                local_irq_enable();
        }
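The smp_wmb() in __loaded_vmcs_clear() and the smp_rmb() above form a standard publish/consume pairing: the clearing CPU must complete the list_del() before advertising loaded_vmcs->cpu == -1, and the loading CPU must observe cpu == -1 (and therefore the completed deletion) before doing list_add() on its own percpu list. A minimal userspace analogue with C11 fences, as a sketch only (names and the atomics mapping are illustrative, not kernel code):

        #include <stdatomic.h>
        #include <stdbool.h>

        struct vmcs_node { bool on_old_list; };

        static struct vmcs_node node = { .on_old_list = true };
        static atomic_int owner_cpu = 1;                     /* -1: not on any cpu's list */

        static void clear_on_old_cpu(void)                   /* ~ __loaded_vmcs_clear()   */
        {
                node.on_old_list = false;                    /* ~ list_del()              */
                atomic_thread_fence(memory_order_release);   /* ~ smp_wmb()               */
                atomic_store_explicit(&owner_cpu, -1, memory_order_relaxed);
        }

        static bool load_on_new_cpu(void)                    /* ~ vmx_vcpu_load_vmcs()    */
        {
                if (atomic_load_explicit(&owner_cpu, memory_order_relaxed) != -1)
                        return false;                        /* would VMCLEAR via IPI     */
                atomic_thread_fence(memory_order_acquire);   /* ~ smp_rmb()               */
                return !node.on_old_list;                    /* safe to list_add() here   */
        }

If the load observes -1, the acquire fence guarantees it also observes the completed deletion, which is exactly the property the rewritten comments rely on.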
 
@@ -1689,16 +1655,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
        vmx_clear_hlt(vcpu);
 }
 
-static bool vmx_rdtscp_supported(void)
-{
-       return cpu_has_vmx_rdtscp();
-}
-
-static bool vmx_invpcid_supported(void)
-{
-       return cpu_has_vmx_invpcid();
-}
-
 /*
  * Swap MSR entry in host/guest MSR entry array.
  */
@@ -1906,24 +1862,24 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                                                        &msr_info->data);
                break;
        case MSR_IA32_RTIT_CTL:
-               if (pt_mode != PT_MODE_HOST_GUEST)
+               if (!vmx_pt_mode_is_host_guest())
                        return 1;
                msr_info->data = vmx->pt_desc.guest.ctl;
                break;
        case MSR_IA32_RTIT_STATUS:
-               if (pt_mode != PT_MODE_HOST_GUEST)
+               if (!vmx_pt_mode_is_host_guest())
                        return 1;
                msr_info->data = vmx->pt_desc.guest.status;
                break;
        case MSR_IA32_RTIT_CR3_MATCH:
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        !intel_pt_validate_cap(vmx->pt_desc.caps,
                                                PT_CAP_cr3_filtering))
                        return 1;
                msr_info->data = vmx->pt_desc.guest.cr3_match;
                break;
        case MSR_IA32_RTIT_OUTPUT_BASE:
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_topa_output) &&
                         !intel_pt_validate_cap(vmx->pt_desc.caps,
@@ -1932,7 +1888,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = vmx->pt_desc.guest.output_base;
                break;
        case MSR_IA32_RTIT_OUTPUT_MASK:
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_topa_output) &&
                         !intel_pt_validate_cap(vmx->pt_desc.caps,
@@ -1942,7 +1898,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
                index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_num_address_ranges)))
                        return 1;
@@ -2148,7 +2104,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                return vmx_set_vmx_msr(vcpu, msr_index, data);
        case MSR_IA32_RTIT_CTL:
-               if ((pt_mode != PT_MODE_HOST_GUEST) ||
+               if (!vmx_pt_mode_is_host_guest() ||
                        vmx_rtit_ctl_check(vcpu, data) ||
                        vmx->nested.vmxon)
                        return 1;
@@ -2264,18 +2220,33 @@ static __init int vmx_disabled_by_bios(void)
               !boot_cpu_has(X86_FEATURE_VMX);
 }
 
-static void kvm_cpu_vmxon(u64 addr)
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
 {
+       u64 msr;
+
        cr4_set_bits(X86_CR4_VMXE);
        intel_pt_handle_vmx(1);
 
-       asm volatile ("vmxon %0" : : "m"(addr));
+       asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+                         _ASM_EXTABLE(1b, %l[fault])
+                         : : [vmxon_pointer] "m"(vmxon_pointer)
+                         : : fault);
+       return 0;
+
+fault:
+       WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+                 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+       intel_pt_handle_vmx(0);
+       cr4_clear_bits(X86_CR4_VMXE);
+
+       return -EFAULT;
 }
 
 static int hardware_enable(void)
 {
        int cpu = raw_smp_processor_id();
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+       int r;
 
        if (cr4_read_shadow() & X86_CR4_VMXE)
                return -EBUSY;
@@ -2292,18 +2263,10 @@ static int hardware_enable(void)
        INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
        spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 
-       /*
-        * Now we can enable the vmclear operation in kdump
-        * since the loaded_vmcss_on_cpu list on this cpu
-        * has been initialized.
-        *
-        * Though the cpu is not in VMX operation now, there
-        * is no problem to enable the vmclear operation
-        * for the loaded_vmcss_on_cpu list is empty!
-        */
-       crash_enable_local_vmclear(cpu);
+       r = kvm_cpu_vmxon(phys_addr);
+       if (r)
+               return r;
 
-       kvm_cpu_vmxon(phys_addr);
        if (enable_ept)
                ept_sync_global();
 
@@ -2603,9 +2566,12 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
        if (!loaded_vmcs->vmcs)
                return -ENOMEM;
 
+       vmcs_clear(loaded_vmcs->vmcs);
+
        loaded_vmcs->shadow_vmcs = NULL;
        loaded_vmcs->hv_timer_soft_disabled = false;
-       loaded_vmcs_init(loaded_vmcs);
+       loaded_vmcs->cpu = -1;
+       loaded_vmcs->launched = 0;
 
        if (cpu_has_vmx_msr_bitmap()) {
                loaded_vmcs->msr_bitmap = (unsigned long *)
@@ -2987,9 +2953,8 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static int get_ept_level(struct kvm_vcpu *vcpu)
 {
-       /* Nested EPT currently only supports 4-level walks. */
        if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
-               return 4;
+               return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
        if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
                return 5;
        return 4;
@@ -3009,7 +2974,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
        return eptp;
 }
 
-void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
@@ -4026,7 +3991,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 
        u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
 
-       if (pt_mode == PT_MODE_SYSTEM)
+       if (vmx_pt_mode_is_system())
                exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
        if (!cpu_need_virtualize_apic_accesses(vcpu))
                exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
@@ -4081,7 +4046,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
                }
        }
 
-       if (vmx_rdtscp_supported()) {
+       if (cpu_has_vmx_rdtscp()) {
                bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
                if (!rdtscp_enabled)
                        exec_control &= ~SECONDARY_EXEC_RDTSCP;
@@ -4096,7 +4061,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
                }
        }
 
-       if (vmx_invpcid_supported()) {
+       if (cpu_has_vmx_invpcid()) {
                /* Exposing INVPCID only when PCID is exposed */
                bool invpcid_enabled =
                        guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
@@ -4267,7 +4232,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
        if (cpu_has_vmx_encls_vmexit())
                vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
 
-       if (pt_mode == PT_MODE_HOST_GUEST) {
+       if (vmx_pt_mode_is_host_guest()) {
                memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
                /* Bit[6~0] are forced to 1, writes are ignored. */
                vmx->pt_desc.guest.output_mask = 0x7F;
@@ -4495,8 +4460,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-       return (!to_vmx(vcpu)->nested.nested_run_pending &&
-               vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+       if (to_vmx(vcpu)->nested.nested_run_pending)
+               return false;
+
+       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+               return true;
+
+       return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
@@ -4552,7 +4522,6 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
        case GP_VECTOR:
        case MF_VECTOR:
                return true;
-       break;
        }
        return false;
 }
@@ -5329,7 +5298,6 @@ static void vmx_enable_tdp(void)
                VMX_EPT_RWX_MASK, 0ull);
 
        ept_set_mmio_spte_mask();
-       kvm_enable_tdp();
 }
 
 /*
@@ -5862,8 +5830,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
 
-       if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
-               return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+       if (is_guest_mode(vcpu)) {
+               /*
+                * The host physical addresses of some pages of guest memory
+                * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+                * Page). The CPU may write to these pages via their host
+                * physical address while L2 is running, bypassing any
+                * address-translation-based dirty tracking (e.g. EPT write
+                * protection).
+                *
+                * Mark them dirty on every exit from L2 to prevent them from
+                * getting out of sync with dirty tracking.
+                */
+               nested_mark_vmcs12_pages_dirty(vcpu);
+
+               if (nested_vmx_exit_reflected(vcpu, exit_reason))
+                       return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+       }
 
        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
                dump_vmcs();
@@ -6223,15 +6206,13 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
        vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
        /* if exit due to PF check for async PF */
-       if (is_page_fault(vmx->exit_intr_info))
+       if (is_page_fault(vmx->exit_intr_info)) {
                vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
-
        /* Handle machine checks before interrupts are enabled */
-       if (is_machine_check(vmx->exit_intr_info))
+       } else if (is_machine_check(vmx->exit_intr_info)) {
                kvm_machine_check();
-
        /* We need to handle NMIs before interrupts are enabled */
-       if (is_nmi(vmx->exit_intr_info)) {
+       } else if (is_nmi(vmx->exit_intr_info)) {
                kvm_before_interrupt(&vmx->vcpu);
                asm("int $2");
                kvm_after_interrupt(&vmx->vcpu);
@@ -6317,11 +6298,6 @@ static bool vmx_has_emulated_msr(int index)
        }
 }
 
-static bool vmx_pt_supported(void)
-{
-       return pt_mode == PT_MODE_HOST_GUEST;
-}
-
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -6567,7 +6543,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        pt_guest_enter(vmx);
 
-       atomic_switch_perf_msrs(vmx);
+       if (vcpu_to_pmu(vcpu)->version)
+               atomic_switch_perf_msrs(vmx);
        atomic_switch_umwait_control_msr(vmx);
 
        if (enable_preemption_timer)
@@ -6684,20 +6661,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
        vmx_complete_interrupts(vmx);
 }
 
-static struct kvm *vmx_vm_alloc(void)
-{
-       struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
-                                           GFP_KERNEL_ACCOUNT | __GFP_ZERO,
-                                           PAGE_KERNEL);
-       return &kvm_vmx->kvm;
-}
-
-static void vmx_vm_free(struct kvm *kvm)
-{
-       kfree(kvm->arch.hyperv.hv_pa_pg);
-       vfree(to_kvm_vmx(kvm));
-}
-
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6900,17 +6863,24 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
        u8 cache;
        u64 ipat = 0;
 
-       /* For VT-d and EPT combination
-        * 1. MMIO: always map as UC
-        * 2. EPT with VT-d:
-        *   a. VT-d without snooping control feature: can't guarantee the
-        *      result, try to trust guest.
-        *   b. VT-d with snooping control feature: snooping control feature of
-        *      VT-d engine can guarantee the cache correctness. Just set it
-        *      to WB to keep consistent with host. So the same as item 3.
-        * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
-        *    consistent with host MTRR
+       /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
+        * memory aliases with conflicting memory types and sometimes MCEs.
+        * We have to be careful as to what are honored and when.
+        *
+        * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
+        * UC.  The effective memory type is UC or WC depending on guest PAT.
+        * This was historically the source of MCEs and we want to be
+        * conservative.
+        *
+        * When there is no need to deal with noncoherent DMA (e.g., no VT-d
+        * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
+        * EPT memory type is set to WB.  The effective memory type is forced
+        * WB.
+        *
+        * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
+        * EPT memory type is used to emulate guest CD/MTRR.
         */
+
        if (is_mmio) {
                cache = MTRR_TYPE_UNCACHABLE;
                goto exit;
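Condensed into code form, the policy described by the rewritten comment is roughly the following (an illustrative sketch, not the actual function; the CR0.CD handling is folded into guest_type here):

        static u8 ept_memtype(bool is_mmio, bool noncoherent_dma, u8 guest_type, u64 *ipat)
        {
                *ipat = 0;                           /* guest PAT still applies          */
                if (is_mmio)
                        return MTRR_TYPE_UNCACHABLE; /* effective UC or WC per guest PAT */
                if (!noncoherent_dma) {
                        *ipat = VMX_EPT_IPAT_BIT;    /* ignore guest PAT, force WB       */
                        return MTRR_TYPE_WRBACK;
                }
                return guest_type;                   /* honor guest CD/MTRR              */
        }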
@@ -6937,15 +6907,6 @@ exit:
        return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
 }
 
-static int vmx_get_lpage_level(void)
-{
-       if (enable_ept && !cpu_has_vmx_ept_1g_page())
-               return PT_DIRECTORY_LEVEL;
-       else
-               /* For shadow and EPT supported 1GB page */
-               return PT_PDPE_LEVEL;
-}
-
 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
 {
        /*
@@ -7136,10 +7097,37 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
        }
 }
 
-static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+static __init void vmx_set_cpu_caps(void)
 {
-       if (func == 1 && nested)
-               entry->ecx |= feature_bit(VMX);
+       kvm_set_cpu_caps();
+
+       /* CPUID 0x1 */
+       if (nested)
+               kvm_cpu_cap_set(X86_FEATURE_VMX);
+
+       /* CPUID 0x7 */
+       if (kvm_mpx_supported())
+               kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
+       if (cpu_has_vmx_invpcid())
+               kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
+       if (vmx_pt_mode_is_host_guest())
+               kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+
+       /* PKU is not yet implemented for shadow paging. */
+       if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE))
+               kvm_cpu_cap_check_and_set(X86_FEATURE_PKU);
+
+       if (vmx_umip_emulated())
+               kvm_cpu_cap_set(X86_FEATURE_UMIP);
+
+       /* CPUID 0xD.1 */
+       supported_xss = 0;
+       if (!vmx_xsaves_supported())
+               kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
+
+       /* CPUID 0x80000001 */
+       if (!cpu_has_vmx_rdtscp())
+               kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
 }
 
 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
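vmx_set_cpu_caps() builds on the common kvm_cpu_cap_*() helpers introduced alongside this change: kvm_cpu_cap_set() advertises a bit unconditionally (VMX, UMIP emulation), while kvm_cpu_cap_check_and_set() only advertises a feature the host CPU actually reports, roughly (a sketch of the assumed helper on the cpuid.h side):

        static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature)
        {
                if (boot_cpu_has(x86_feature))
                        kvm_cpu_cap_set(x86_feature);
        }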
@@ -7183,10 +7171,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
 
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
-                              enum x86_intercept_stage stage)
+                              enum x86_intercept_stage stage,
+                              struct x86_exception *exception)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 
        switch (info->intercept) {
        /*
@@ -7195,8 +7183,8 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
         */
        case x86_intercept_rdtscp:
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
-                       ctxt->exception.vector = UD_VECTOR;
-                       ctxt->exception.error_code_valid = false;
+                       exception->vector = UD_VECTOR;
+                       exception->error_code_valid = false;
                        return X86EMUL_PROPAGATE_FAULT;
                }
                break;
@@ -7307,7 +7295,8 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_slot_enable_log_dirty(struct kvm *kvm,
                                     struct kvm_memory_slot *slot)
 {
-       kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
+       if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
+               kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
        kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
 }
 
@@ -7661,9 +7650,7 @@ static __init int hardware_setup(void)
 {
        unsigned long host_bndcfgs;
        struct desc_ptr dt;
-       int r, i;
-
-       rdmsrl_safe(MSR_EFER, &host_efer);
+       int r, i, ept_lpage_level;
 
        store_idt(&dt);
        host_idt_base = dt.address;
@@ -7682,6 +7669,10 @@ static __init int hardware_setup(void)
                WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
        }
 
+       if (!cpu_has_vmx_mpx())
+               supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+                                   XFEATURE_MASK_BNDCSR);
+
        if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
            !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
                enable_vpid = 0;
@@ -7715,9 +7706,6 @@ static __init int hardware_setup(void)
        if (!cpu_has_vmx_tpr_shadow())
                kvm_x86_ops->update_cr8_intercept = NULL;
 
-       if (enable_ept && !cpu_has_vmx_ept_2m_page())
-               kvm_disable_largepages();
-
 #if IS_ENABLED(CONFIG_HYPERV)
        if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
            && enable_ept) {
@@ -7750,8 +7738,16 @@ static __init int hardware_setup(void)
 
        if (enable_ept)
                vmx_enable_tdp();
+
+       if (!enable_ept)
+               ept_lpage_level = 0;
+       else if (cpu_has_vmx_ept_1g_page())
+               ept_lpage_level = PT_PDPE_LEVEL;
+       else if (cpu_has_vmx_ept_2m_page())
+               ept_lpage_level = PT_DIRECTORY_LEVEL;
        else
-               kvm_disable_tdp();
+               ept_lpage_level = PT_PAGE_TABLE_LEVEL;
+       kvm_configure_mmu(enable_ept, ept_lpage_level);
 
        /*
         * Only enable PML when hardware supports PML feature, and both EPT
@@ -7815,6 +7811,8 @@ static __init int hardware_setup(void)
                        return r;
        }
 
+       vmx_set_cpu_caps();
+
        r = alloc_kvm_area();
        if (r)
                nested_vmx_hardware_unsetup();
@@ -7848,9 +7846,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .cpu_has_accelerated_tpr = report_flexpriority,
        .has_emulated_msr = vmx_has_emulated_msr,
 
+       .vm_size = sizeof(struct kvm_vmx),
        .vm_init = vmx_vm_init,
-       .vm_alloc = vmx_vm_alloc,
-       .vm_free = vmx_vm_free,
 
        .vcpu_create = vmx_create_vcpu,
        .vcpu_free = vmx_free_vcpu,
@@ -7872,7 +7869,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
        .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
        .set_cr0 = vmx_set_cr0,
-       .set_cr3 = vmx_set_cr3,
        .set_cr4 = vmx_set_cr4,
        .set_efer = vmx_set_efer,
        .get_idt = vmx_get_idt,
@@ -7928,29 +7924,17 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
        .get_exit_info = vmx_get_exit_info,
 
-       .get_lpage_level = vmx_get_lpage_level,
-
        .cpuid_update = vmx_cpuid_update,
 
-       .rdtscp_supported = vmx_rdtscp_supported,
-       .invpcid_supported = vmx_invpcid_supported,
-
-       .set_supported_cpuid = vmx_set_supported_cpuid,
-
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
        .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
        .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
 
-       .set_tdp_cr3 = vmx_set_cr3,
+       .load_mmu_pgd = vmx_load_mmu_pgd,
 
        .check_intercept = vmx_check_intercept,
        .handle_exit_irqoff = vmx_handle_exit_irqoff,
-       .mpx_supported = vmx_mpx_supported,
-       .xsaves_supported = vmx_xsaves_supported,
-       .umip_emulated = vmx_umip_emulated,
-       .pt_supported = vmx_pt_supported,
-       .pku_supported = vmx_pku_supported,
 
        .request_immediate_exit = vmx_request_immediate_exit,