Merge branch 'kvm-fixes' into 'next'
[platform/kernel/linux-rpi.git] arch/x86/kvm/vmx/vmx.c
index f4e9c31..7558967 100644
@@ -56,7 +56,6 @@
 #include "lapic.h"
 #include "mmu.h"
 #include "nested.h"
-#include "ops.h"
 #include "pmu.h"
 #include "trace.h"
 #include "vmcs.h"
@@ -149,8 +148,25 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
        RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
        RTIT_STATUS_BYTECNT))
 
-#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
-       (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
+/*
+ * List of MSRs that can be directly passed through to the guest.
+ * In addition to these, the x2APIC and PT MSRs are handled specially.
+ */
+static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
+       MSR_IA32_SPEC_CTRL,
+       MSR_IA32_PRED_CMD,
+       MSR_IA32_TSC,
+       MSR_FS_BASE,
+       MSR_GS_BASE,
+       MSR_KERNEL_GS_BASE,
+       MSR_IA32_SYSENTER_CS,
+       MSR_IA32_SYSENTER_ESP,
+       MSR_IA32_SYSENTER_EIP,
+       MSR_CORE_C1_RES,
+       MSR_CORE_C3_RESIDENCY,
+       MSR_CORE_C6_RESIDENCY,
+       MSR_CORE_C7_RESIDENCY,
+};
 
 /*
  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
@@ -344,9 +360,8 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = {
 };
 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 
-static bool guest_state_valid(struct kvm_vcpu *vcpu);
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
                                                          u32 msr, int type);
 
 void vmx_vmexit(void);
@@ -401,13 +416,6 @@ DEFINE_PER_CPU(struct vmcs *, current_vmcs);
  */
 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 
-/*
- * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
- * can find which vCPU should be waken up.
- */
-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
-
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);
 
@@ -450,9 +458,9 @@ static unsigned long host_idt_base;
  * will emulate SYSCALL in legacy mode if the vendor string in guest
  * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
  * support this emulation, IA32_STAR must always be included in
- * vmx_msr_index[], even in i386 builds.
+ * vmx_uret_msrs_list[], even in i386 builds.
  */
-const u32 vmx_msr_index[] = {
+static const u32 vmx_uret_msrs_list[] = {
 #ifdef CONFIG_X86_64
        MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
 #endif
@@ -626,36 +634,71 @@ static inline bool report_flexpriority(void)
        return flexpriority_enabled;
 }
 
-static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
+static int possible_passthrough_msr_slot(u32 msr)
+{
+       u32 i;
+
+       for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
+               if (vmx_possible_passthrough_msrs[i] == msr)
+                       return i;
+
+       return -ENOENT;
+}
+
+static bool is_valid_passthrough_msr(u32 msr)
+{
+       bool r;
+
+       switch (msr) {
+       case 0x800 ... 0x8ff:
+               /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
+               return true;
+       case MSR_IA32_RTIT_STATUS:
+       case MSR_IA32_RTIT_OUTPUT_BASE:
+       case MSR_IA32_RTIT_OUTPUT_MASK:
+       case MSR_IA32_RTIT_CR3_MATCH:
+       case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+               /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
+               return true;
+       }
+
+       r = possible_passthrough_msr_slot(msr) != -ENOENT;
+
+       WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
+
+       return r;
+}
+
+static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
 {
        int i;
 
-       for (i = 0; i < vmx->nmsrs; ++i)
-               if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
+       for (i = 0; i < vmx->nr_uret_msrs; ++i)
+               if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
                        return i;
        return -1;
 }
 
-struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
 {
        int i;
 
-       i = __find_msr_index(vmx, msr);
+       i = __vmx_find_uret_msr(vmx, msr);
        if (i >= 0)
-               return &vmx->guest_msrs[i];
+               return &vmx->guest_uret_msrs[i];
        return NULL;
 }
 
-static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
+static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
+                                 struct vmx_uret_msr *msr, u64 data)
 {
        int ret = 0;
 
        u64 old_msr_data = msr->data;
        msr->data = data;
-       if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
+       if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
                preempt_disable();
-               ret = kvm_set_shared_msr(msr->index, msr->data,
-                                        msr->mask);
+               ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
                preempt_enable();
                if (ret)
                        msr->data = old_msr_data;
@@ -840,7 +883,7 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
        vm_exit_controls_clearbit(vmx, exit);
 }
 
-int vmx_find_msr_index(struct vmx_msrs *m, u32 msr)
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
 {
        unsigned int i;
 
@@ -874,7 +917,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
                }
                break;
        }
-       i = vmx_find_msr_index(&m->guest, msr);
+       i = vmx_find_loadstore_msr_slot(&m->guest, msr);
        if (i < 0)
                goto skip_guest;
        --m->guest.nr;
@@ -882,7 +925,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
 
 skip_guest:
-       i = vmx_find_msr_index(&m->host, msr);
+       i = vmx_find_loadstore_msr_slot(&m->host, msr);
        if (i < 0)
                return;
 
@@ -941,12 +984,12 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
        }
 
-       i = vmx_find_msr_index(&m->guest, msr);
+       i = vmx_find_loadstore_msr_slot(&m->guest, msr);
        if (!entry_only)
-               j = vmx_find_msr_index(&m->host, msr);
+               j = vmx_find_loadstore_msr_slot(&m->host, msr);
 
-       if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) ||
-               (j < 0 &&  m->host.nr == NR_LOADSTORE_MSRS)) {
+       if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
+           (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
                printk_once(KERN_WARNING "Not enough msr switch entries. "
                                "Can't add msr %x\n", msr);
                return;
@@ -969,10 +1012,11 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
        m->host.val[j].value = host_val;
 }
 
-static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
+static bool update_transition_efer(struct vcpu_vmx *vmx)
 {
        u64 guest_efer = vmx->vcpu.arch.efer;
        u64 ignore_bits = 0;
+       int i;
 
        /* Shadow paging assumes NX to be available.  */
        if (!enable_ept)
@@ -1004,17 +1048,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
                else
                        clear_atomic_switch_msr(vmx, MSR_EFER);
                return false;
-       } else {
-               clear_atomic_switch_msr(vmx, MSR_EFER);
+       }
 
-               guest_efer &= ~ignore_bits;
-               guest_efer |= host_efer & ignore_bits;
+       i = __vmx_find_uret_msr(vmx, MSR_EFER);
+       if (i < 0)
+               return false;
 
-               vmx->guest_msrs[efer_offset].data = guest_efer;
-               vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+       clear_atomic_switch_msr(vmx, MSR_EFER);
 
-               return true;
-       }
+       guest_efer &= ~ignore_bits;
+       guest_efer |= host_efer & ignore_bits;
+
+       vmx->guest_uret_msrs[i].data = guest_efer;
+       vmx->guest_uret_msrs[i].mask = ~ignore_bits;
+
+       return true;
 }
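
The ignore_bits handling above is the usual select-under-mask merge: guest bits everywhere except the masked bits, which are taken from the host value. A minimal standalone sketch with illustrative values (not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Take @guest's bits everywhere except @mask, where @host's bits win. */
static uint64_t merge_under_mask(uint64_t guest, uint64_t host, uint64_t mask)
{
	return (guest & ~mask) | (host & mask);
}

int main(void)
{
	uint64_t guest = 0x0500;	/* e.g. a guest EFER-like value */
	uint64_t host  = 0x0d01;	/* host value differs in bits 0 and 11 */
	uint64_t mask  = 0x0801;	/* bits the guest's view is allowed to ignore */

	/* Keeps guest bits outside the mask and host bits inside it: prints 0x0d01. */
	printf("0x%04llx\n", (unsigned long long)merge_under_mask(guest, host, mask));
	return 0;
}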
 
 #ifdef CONFIG_X86_32
@@ -1052,6 +1100,12 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
               !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
 }
 
+static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
+{
+       /* The base must be 128-byte aligned and a legal physical address. */
+       return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f);
+}
+
 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
 {
        u32 i;
@@ -1156,12 +1210,12 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
         * when guest state is loaded. This happens when guest transitions
         * to/from long-mode by setting MSR_EFER.LMA.
         */
-       if (!vmx->guest_msrs_ready) {
-               vmx->guest_msrs_ready = true;
-               for (i = 0; i < vmx->save_nmsrs; ++i)
-                       kvm_set_shared_msr(vmx->guest_msrs[i].index,
-                                          vmx->guest_msrs[i].data,
-                                          vmx->guest_msrs[i].mask);
+       if (!vmx->guest_uret_msrs_loaded) {
+               vmx->guest_uret_msrs_loaded = true;
+               for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
+                       kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
+                                               vmx->guest_uret_msrs[i].data,
+                                               vmx->guest_uret_msrs[i].mask);
 
        }
 
@@ -1245,7 +1299,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
 #endif
        load_fixmap_gdt(raw_smp_processor_id());
        vmx->guest_state_loaded = false;
-       vmx->guest_msrs_ready = false;
+       vmx->guest_uret_msrs_loaded = false;
 }
 
 #ifdef CONFIG_X86_64
@@ -1268,62 +1322,6 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
 }
 #endif
 
-static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
-{
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-       struct pi_desc old, new;
-       unsigned int dest;
-
-       /*
-        * In case of hot-plug or hot-unplug, we may have to undo
-        * vmx_vcpu_pi_put even if there is no assigned device.  And we
-        * always keep PI.NDST up to date for simplicity: it makes the
-        * code easier, and CPU migration is not a fast path.
-        */
-       if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
-               return;
-
-       /*
-        * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
-        * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
-        * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
-        * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
-        * correctly.
-        */
-       if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
-               pi_clear_sn(pi_desc);
-               goto after_clear_sn;
-       }
-
-       /* The full case.  */
-       do {
-               old.control = new.control = pi_desc->control;
-
-               dest = cpu_physical_id(cpu);
-
-               if (x2apic_enabled())
-                       new.ndst = dest;
-               else
-                       new.ndst = (dest << 8) & 0xFF00;
-
-               new.sn = 0;
-       } while (cmpxchg64(&pi_desc->control, old.control,
-                          new.control) != old.control);
-
-after_clear_sn:
-
-       /*
-        * Clear SN before reading the bitmap.  The VT-d firmware
-        * writes the bitmap and reads SN atomically (5.2.3 in the
-        * spec), so it doesn't really have a memory barrier that
-        * pairs with this, but we cannot do that and we need one.
-        */
-       smp_mb__after_atomic();
-
-       if (!pi_is_pir_empty(pi_desc))
-               pi_set_on(pi_desc);
-}
-
 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
                        struct loaded_vmcs *buddy)
 {
@@ -1407,20 +1405,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        vmx->host_debugctlmsr = get_debugctlmsr();
 }
 
-static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
-{
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
-               !kvm_vcpu_apicv_active(vcpu))
-               return;
-
-       /* Set SN when the vCPU is preempted */
-       if (vcpu->preempted)
-               pi_set_sn(pi_desc);
-}
-
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
        vmx_vcpu_pi_put(vcpu);
@@ -1430,7 +1414,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 
 static bool emulation_required(struct kvm_vcpu *vcpu)
 {
-       return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+       return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
 }
 
 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -1456,7 +1440,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long old_rflags;
 
-       if (enable_unrestricted_guest) {
+       if (is_unrestricted_guest(vcpu)) {
                kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
                vmx->rflags = rflags;
                vmcs_writel(GUEST_RFLAGS, rflags);
@@ -1576,6 +1560,11 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
        return 0;
 }
 
+static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+{
+       return true;
+}
+
 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
        unsigned long rip, orig_rip;
@@ -1614,33 +1603,6 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 }
 
 /*
- * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
- * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
- * indicates whether exit to userspace is needed.
- */
-int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
-                             struct x86_exception *e)
-{
-       if (r == X86EMUL_PROPAGATE_FAULT) {
-               kvm_inject_emulated_page_fault(vcpu, e);
-               return 1;
-       }
-
-       /*
-        * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
-        * while handling a VMX instruction KVM could've handled the request
-        * correctly by exiting to userspace and performing I/O but there
-        * doesn't seem to be a real use-case behind such requests, just return
-        * KVM_EXIT_INTERNAL_ERROR for now.
-        */
-       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-       vcpu->run->internal.ndata = 0;
-
-       return 0;
-}
-
-/*
  * Recognizes a pending MTF VM-exit and records the nested state for later
  * delivery.
  */
@@ -1723,16 +1685,19 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
        vmx_clear_hlt(vcpu);
 }
 
-/*
- * Swap MSR entry in host/guest MSR entry array.
- */
-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
 {
-       struct shared_msr_entry tmp;
+       struct vmx_uret_msr tmp;
+       int from, to;
+
+       from = __vmx_find_uret_msr(vmx, msr);
+       if (from < 0)
+               return;
+       to = vmx->nr_active_uret_msrs++;
 
-       tmp = vmx->guest_msrs[to];
-       vmx->guest_msrs[to] = vmx->guest_msrs[from];
-       vmx->guest_msrs[from] = tmp;
+       tmp = vmx->guest_uret_msrs[to];
+       vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
+       vmx->guest_uret_msrs[from] = tmp;
 }
 
 /*
@@ -1742,38 +1707,26 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
  */
 static void setup_msrs(struct vcpu_vmx *vmx)
 {
-       int save_nmsrs, index;
-
-       save_nmsrs = 0;
+       vmx->guest_uret_msrs_loaded = false;
+       vmx->nr_active_uret_msrs = 0;
 #ifdef CONFIG_X86_64
        /*
         * The SYSCALL MSRs are only needed on long mode guests, and only
         * when EFER.SCE is set.
         */
        if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
-               index = __find_msr_index(vmx, MSR_STAR);
-               if (index >= 0)
-                       move_msr_up(vmx, index, save_nmsrs++);
-               index = __find_msr_index(vmx, MSR_LSTAR);
-               if (index >= 0)
-                       move_msr_up(vmx, index, save_nmsrs++);
-               index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
-               if (index >= 0)
-                       move_msr_up(vmx, index, save_nmsrs++);
+               vmx_setup_uret_msr(vmx, MSR_STAR);
+               vmx_setup_uret_msr(vmx, MSR_LSTAR);
+               vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
        }
 #endif
-       index = __find_msr_index(vmx, MSR_EFER);
-       if (index >= 0 && update_transition_efer(vmx, index))
-               move_msr_up(vmx, index, save_nmsrs++);
-       index = __find_msr_index(vmx, MSR_TSC_AUX);
-       if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
-               move_msr_up(vmx, index, save_nmsrs++);
-       index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
-       if (index >= 0)
-               move_msr_up(vmx, index, save_nmsrs++);
-
-       vmx->save_nmsrs = save_nmsrs;
-       vmx->guest_msrs_ready = false;
+       if (update_transition_efer(vmx))
+               vmx_setup_uret_msr(vmx, MSR_EFER);
+
+       if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
+               vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
+
+       vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
 
        if (cpu_has_vmx_msr_bitmap())
                vmx_update_msr_bitmap(&vmx->vcpu);
@@ -1843,7 +1796,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct shared_msr_entry *msr;
+       struct vmx_uret_msr *msr;
        u32 index;
 
        switch (msr_info->index) {
@@ -1864,7 +1817,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (!msr_info->host_initiated &&
                    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
                        return 1;
-               goto find_shared_msr;
+               goto find_uret_msr;
        case MSR_IA32_UMWAIT_CONTROL:
                if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
                        return 1;
@@ -1971,10 +1924,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
                        return 1;
-               goto find_shared_msr;
+               goto find_uret_msr;
        default:
-       find_shared_msr:
-               msr = find_msr_entry(vmx, msr_info->index);
+       find_uret_msr:
+               msr = vmx_find_uret_msr(vmx, msr_info->index);
                if (msr) {
                        msr_info->data = msr->data;
                        break;
@@ -2003,7 +1956,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct shared_msr_entry *msr;
+       struct vmx_uret_msr *msr;
        int ret = 0;
        u32 msr_index = msr_info->index;
        u64 data = msr_info->data;
@@ -2097,7 +2050,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 * in the merging. We update the vmcs01 here for L1 as well
                 * since it will end up touching the MSR anyway now.
                 */
-               vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
+               vmx_disable_intercept_for_msr(vcpu,
                                              MSR_IA32_SPEC_CTRL,
                                              MSR_TYPE_RW);
                break;
@@ -2107,7 +2060,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
                        return 1;
-               goto find_shared_msr;
+               goto find_uret_msr;
        case MSR_IA32_PRED_CMD:
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -2133,8 +2086,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 * vmcs02.msr_bitmap here since it gets completely overwritten
                 * in the merging.
                 */
-               vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
-                                             MSR_TYPE_W);
+               vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
                break;
        case MSR_IA32_CR_PAT:
                if (!kvm_pat_valid(data))
@@ -2184,7 +2136,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                vmcs_write64(GUEST_IA32_RTIT_CTL, data);
                vmx->pt_desc.guest.ctl = data;
-               pt_update_intercept_for_msr(vmx);
+               pt_update_intercept_for_msr(vcpu);
                break;
        case MSR_IA32_RTIT_STATUS:
                if (!pt_can_write_msr(vmx))
@@ -2209,7 +2161,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                    !intel_pt_validate_cap(vmx->pt_desc.caps,
                                           PT_CAP_single_range_output))
                        return 1;
-               if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)
+               if (!pt_output_base_valid(vcpu, data))
                        return 1;
                vmx->pt_desc.guest.output_base = data;
                break;
@@ -2244,13 +2196,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                /* Check reserved bit, higher 32 bits should be zero */
                if ((data >> 32) != 0)
                        return 1;
-               goto find_shared_msr;
+               goto find_uret_msr;
 
        default:
-       find_shared_msr:
-               msr = find_msr_entry(vmx, msr_index);
+       find_uret_msr:
+               msr = vmx_find_uret_msr(vmx, msr_index);
                if (msr)
-                       ret = vmx_set_guest_msr(vmx, msr, data);
+                       ret = vmx_set_guest_uret_msr(vmx, msr, data);
                else
                        ret = kvm_set_msr_common(vcpu, msr_info);
        }
@@ -2282,7 +2234,8 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
                vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
                break;
        case VCPU_EXREG_CR3:
-               if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
+               if (is_unrestricted_guest(vcpu) ||
+                   (enable_ept && is_paging(vcpu)))
                        vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
                break;
        case VCPU_EXREG_CR4:
@@ -2463,7 +2416,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                        SECONDARY_EXEC_UNRESTRICTED_GUEST |
                        SECONDARY_EXEC_PAUSE_LOOP_EXITING |
                        SECONDARY_EXEC_DESC |
-                       SECONDARY_EXEC_RDTSCP |
+                       SECONDARY_EXEC_ENABLE_RDTSCP |
                        SECONDARY_EXEC_ENABLE_INVPCID |
                        SECONDARY_EXEC_APIC_REGISTER_VIRT |
                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -2877,13 +2830,14 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
        kvm_mmu_reset_context(vcpu);
 }
 
-void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+       struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER);
 
+       /* Nothing to do if hardware doesn't support EFER. */
        if (!msr)
-               return;
+               return 0;
 
        vcpu->arch.efer = efer;
        if (efer & EFER_LMA) {
@@ -2895,6 +2849,7 @@ void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
                msr->data = efer & ~EFER_LME;
        }
        setup_msrs(vmx);
+       return 0;
 }
 
 #ifdef CONFIG_X86_64
@@ -3048,7 +3003,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        unsigned long hw_cr0;
 
        hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
-       if (enable_unrestricted_guest)
+       if (is_unrestricted_guest(vcpu))
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
        else {
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
@@ -3069,7 +3024,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        }
 #endif
 
-       if (enable_ept && !enable_unrestricted_guest)
+       if (enable_ept && !is_unrestricted_guest(vcpu))
                ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
 
        vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -3149,7 +3104,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        unsigned long hw_cr4;
 
        hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
-       if (enable_unrestricted_guest)
+       if (is_unrestricted_guest(vcpu))
                hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
        else if (vmx->rmode.vm86_active)
                hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
@@ -3184,7 +3139,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        vcpu->arch.cr4 = cr4;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
 
-       if (!enable_unrestricted_guest) {
+       if (!is_unrestricted_guest(vcpu)) {
                if (enable_ept) {
                        if (!is_paging(vcpu)) {
                                hw_cr4 &= ~X86_CR4_PAE;
@@ -3324,7 +3279,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
         * tree. Newer qemu binaries with that qemu fix would not need this
         * kvm hack.
         */
-       if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+       if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
                var->type |= 0x1; /* Accessed */
 
        vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
@@ -3513,11 +3468,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
  * not.
  * We assume that registers are always usable
  */
-static bool guest_state_valid(struct kvm_vcpu *vcpu)
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
 {
-       if (enable_unrestricted_guest)
-               return true;
-
        /* real mode guest state checks */
        if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
                if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
@@ -3703,11 +3655,52 @@ void free_vpid(int vpid)
        spin_unlock(&vmx_vpid_lock);
 }
 
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
-                                                         u32 msr, int type)
+static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+{
+       int f = sizeof(unsigned long);
+
+       if (msr <= 0x1fff)
+               __clear_bit(msr, msr_bitmap + 0x000 / f);
+       else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+               __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+}
+
+static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
 {
        int f = sizeof(unsigned long);
 
+       if (msr <= 0x1fff)
+               __clear_bit(msr, msr_bitmap + 0x800 / f);
+       else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+               __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+}
+
+static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+{
+       int f = sizeof(unsigned long);
+
+       if (msr <= 0x1fff)
+               __set_bit(msr, msr_bitmap + 0x000 / f);
+       else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+               __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+}
+
+static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
+{
+       int f = sizeof(unsigned long);
+
+       if (msr <= 0x1fff)
+               __set_bit(msr, msr_bitmap + 0x800 / f);
+       else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+               __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+}
+
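A minimal standalone sketch of the MSR-bitmap layout these helpers encode (low MSRs 0x0-0x1fff use the read/write quarters at offsets 0x000/0x800, high MSRs 0xc0000000-0xc0001fff use 0x400/0xc00, and the bit index within a quarter is msr & 0x1fff); the helper names below are illustrative and not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Byte offset of the bitmap quarter holding the read bit for @msr. */
static unsigned int msr_read_quarter(uint32_t msr)
{
	return msr <= 0x1fff ? 0x000 : 0x400;
}

/* Byte offset of the bitmap quarter holding the write bit for @msr. */
static unsigned int msr_write_quarter(uint32_t msr)
{
	return msr <= 0x1fff ? 0x800 : 0xc00;
}

int main(void)
{
	const uint32_t msrs[] = { 0x00000010, 0xc0000080 };	/* one low, one high MSR */
	unsigned int i;

	for (i = 0; i < 2; i++) {
		uint32_t msr = msrs[i];

		printf("MSR 0x%08x: bit %u, read at +0x%03x, write at +0x%03x\n",
		       msr, msr & 0x1fff, msr_read_quarter(msr), msr_write_quarter(msr));
	}
	return 0;
}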
+static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+                                                         u32 msr, int type)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+
        if (!cpu_has_vmx_msr_bitmap())
                return;
 
@@ -3715,36 +3708,44 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit
                evmcs_touch_msr_bitmap();
 
        /*
-        * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
-        * have the write-low and read-high bitmap offsets the wrong way round.
-        * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
-        */
-       if (msr <= 0x1fff) {
-               if (type & MSR_TYPE_R)
-                       /* read-low */
-                       __clear_bit(msr, msr_bitmap + 0x000 / f);
+        * Mark the desired intercept state in the shadow bitmap; this is
+        * needed for resync when the MSR filter changes.
+        */
+       if (is_valid_passthrough_msr(msr)) {
+               int idx = possible_passthrough_msr_slot(msr);
+
+               if (idx != -ENOENT) {
+                       if (type & MSR_TYPE_R)
+                               clear_bit(idx, vmx->shadow_msr_intercept.read);
+                       if (type & MSR_TYPE_W)
+                               clear_bit(idx, vmx->shadow_msr_intercept.write);
+               }
+       }
 
-               if (type & MSR_TYPE_W)
-                       /* write-low */
-                       __clear_bit(msr, msr_bitmap + 0x800 / f);
+       if ((type & MSR_TYPE_R) &&
+           !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
+               vmx_set_msr_bitmap_read(msr_bitmap, msr);
+               type &= ~MSR_TYPE_R;
+       }
 
-       } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
-               msr &= 0x1fff;
-               if (type & MSR_TYPE_R)
-                       /* read-high */
-                       __clear_bit(msr, msr_bitmap + 0x400 / f);
+       if ((type & MSR_TYPE_W) &&
+           !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
+               vmx_set_msr_bitmap_write(msr_bitmap, msr);
+               type &= ~MSR_TYPE_W;
+       }
 
-               if (type & MSR_TYPE_W)
-                       /* write-high */
-                       __clear_bit(msr, msr_bitmap + 0xc00 / f);
+       if (type & MSR_TYPE_R)
+               vmx_clear_msr_bitmap_read(msr_bitmap, msr);
 
-       }
+       if (type & MSR_TYPE_W)
+               vmx_clear_msr_bitmap_write(msr_bitmap, msr);
 }
 
-static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
                                                         u32 msr, int type)
 {
-       int f = sizeof(unsigned long);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
 
        if (!cpu_has_vmx_msr_bitmap())
                return;
@@ -3753,39 +3754,34 @@ static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitm
                evmcs_touch_msr_bitmap();
 
        /*
-        * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
-        * have the write-low and read-high bitmap offsets the wrong way round.
-        * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
-        */
-       if (msr <= 0x1fff) {
-               if (type & MSR_TYPE_R)
-                       /* read-low */
-                       __set_bit(msr, msr_bitmap + 0x000 / f);
-
-               if (type & MSR_TYPE_W)
-                       /* write-low */
-                       __set_bit(msr, msr_bitmap + 0x800 / f);
-
-       } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
-               msr &= 0x1fff;
-               if (type & MSR_TYPE_R)
-                       /* read-high */
-                       __set_bit(msr, msr_bitmap + 0x400 / f);
+        * Mark the desired intercept state in the shadow bitmap; this is
+        * needed for resync when the MSR filter changes.
+        */
+       if (is_valid_passthrough_msr(msr)) {
+               int idx = possible_passthrough_msr_slot(msr);
+
+               if (idx != -ENOENT) {
+                       if (type & MSR_TYPE_R)
+                               set_bit(idx, vmx->shadow_msr_intercept.read);
+                       if (type & MSR_TYPE_W)
+                               set_bit(idx, vmx->shadow_msr_intercept.write);
+               }
+       }
 
-               if (type & MSR_TYPE_W)
-                       /* write-high */
-                       __set_bit(msr, msr_bitmap + 0xc00 / f);
+       if (type & MSR_TYPE_R)
+               vmx_set_msr_bitmap_read(msr_bitmap, msr);
 
-       }
+       if (type & MSR_TYPE_W)
+               vmx_set_msr_bitmap_write(msr_bitmap, msr);
 }
 
-static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
-                                                     u32 msr, int type, bool value)
+static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+                                                     u32 msr, int type, bool value)
 {
        if (value)
-               vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+               vmx_enable_intercept_for_msr(vcpu, msr, type);
        else
-               vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+               vmx_disable_intercept_for_msr(vcpu, msr, type);
 }
 
 static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
@@ -3803,35 +3799,47 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
        return mode;
 }
 
-static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
-                                        u8 mode)
+static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
 {
+       unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
+       unsigned long read_intercept;
        int msr;
 
+       read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
+
        for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
-               unsigned word = msr / BITS_PER_LONG;
-               msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
-               msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+               unsigned int read_idx = msr / BITS_PER_LONG;
+               unsigned int write_idx = read_idx + (0x800 / sizeof(long));
+
+               msr_bitmap[read_idx] = read_intercept;
+               msr_bitmap[write_idx] = ~0ul;
        }
+}
 
-       if (mode & MSR_BITMAP_MODE_X2APIC) {
-               /*
-                * TPR reads and writes can be virtualized even if virtual interrupt
-                * delivery is not in use.
-                */
-               vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
-               if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
-                       vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
-                       vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
-                       vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
-               }
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
+{
+       if (!cpu_has_vmx_msr_bitmap())
+               return;
+
+       vmx_reset_x2apic_msrs(vcpu, mode);
+
+       /*
+        * TPR reads and writes can be virtualized even if virtual interrupt
+        * delivery is not in use.
+        */
+       vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
+                                 !(mode & MSR_BITMAP_MODE_X2APIC));
+
+       if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
+               vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
+               vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+               vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
        }
 }
 
 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
        u8 mode = vmx_msr_bitmap_mode(vcpu);
        u8 changed = mode ^ vmx->msr_bitmap_mode;
 
@@ -3839,30 +3847,24 @@ void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
                return;
 
        if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
-               vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
+               vmx_update_msr_bitmap_x2apic(vcpu, mode);
 
        vmx->msr_bitmap_mode = mode;
 }
 
-void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
 {
-       unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
        u32 i;
 
-       vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
-                                                       MSR_TYPE_RW, flag);
-       vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
-                                                       MSR_TYPE_RW, flag);
-       vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
-                                                       MSR_TYPE_RW, flag);
-       vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
-                                                       MSR_TYPE_RW, flag);
+       vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
+       vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
+       vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
+       vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
        for (i = 0; i < vmx->pt_desc.addr_range; i++) {
-               vmx_set_intercept_for_msr(msr_bitmap,
-                       MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
-               vmx_set_intercept_for_msr(msr_bitmap,
-                       MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
+               vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
+               vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
        }
 }
 
@@ -3886,6 +3888,29 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
        return ((rvi & 0xf0) > (vppr & 0xf0));
 }
 
+static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 i;
+
+       /*
+        * Set intercept permissions for all potentially passed through MSRs
+        * again. They will automatically get filtered through the MSR filter,
+        * so we are back in sync after this.
+        */
+       for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
+               u32 msr = vmx_possible_passthrough_msrs[i];
+               bool read = test_bit(i, vmx->shadow_msr_intercept.read);
+               bool write = test_bit(i, vmx->shadow_msr_intercept.write);
+
+               vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);
+               vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);
+       }
+
+       pt_update_intercept_for_msr(vcpu);
+       vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu));
+}
+
 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
                                                     bool nested)
 {
@@ -4043,13 +4068,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 
 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
 {
-       vmx->vcpu.arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS;
+       struct kvm_vcpu *vcpu = &vmx->vcpu;
+
+       vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
+                                         ~vcpu->arch.cr4_guest_rsvd_bits;
        if (!enable_ept)
-               vmx->vcpu.arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;
+               vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;
        if (is_guest_mode(&vmx->vcpu))
-               vmx->vcpu.arch.cr4_guest_owned_bits &=
-                       ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
-       vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+               vcpu->arch.cr4_guest_owned_bits &=
+                       ~get_vmcs12(vcpu)->cr4_guest_host_mask;
+       vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
 }
 
 u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
@@ -4114,6 +4142,61 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx)
        return exec_control;
 }
 
+/*
+ * Adjust a single secondary execution control bit to intercept/allow an
+ * instruction in the guest.  This is usually done based on whether or not a
+ * feature has been exposed to the guest in order to correctly emulate faults.
+ */
+static inline void
+vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
+                                 u32 control, bool enabled, bool exiting)
+{
+       /*
+        * If the control is for an opt-in feature, clear the control if the
+        * feature is not exposed to the guest, i.e. not enabled.  If the
+        * control is opt-out, i.e. an exiting control, clear the control if
+        * the feature _is_ exposed to the guest, i.e. exiting/interception is
+        * disabled for the associated instruction.  Note, the caller is
+        * responsible presetting exec_control to set all supported bits.
+        * responsible for presetting exec_control to set all supported bits.
+       if (enabled == exiting)
+               *exec_control &= ~control;
+
+       /*
+        * Update the nested MSR settings so that a nested VMM can/can't set
+        * controls for features that are/aren't exposed to the guest.
+        */
+       if (nested) {
+               if (enabled)
+                       vmx->nested.msrs.secondary_ctls_high |= control;
+               else
+                       vmx->nested.msrs.secondary_ctls_high &= ~control;
+       }
+}
+
+/*
+ * Wrapper macro for the common case of adjusting a secondary execution control
+ * based on a single guest CPUID bit, with a dedicated feature bit.  This also
+ * verifies that the control is actually supported by KVM and hardware.
+ */
+#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
+({                                                                      \
+       bool __enabled;                                                  \
+                                                                        \
+       if (cpu_has_vmx_##name()) {                                      \
+               __enabled = guest_cpuid_has(&(vmx)->vcpu,                \
+                                           X86_FEATURE_##feat_name);    \
+               vmx_adjust_secondary_exec_control(vmx, exec_control,     \
+                       SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
+       }                                                                \
+})
+
+/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
+#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
+       vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
+
+#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
+       vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
 
 static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 {
@@ -4154,7 +4237,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        if (!enable_pml)
                exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 
-       if (vmx_xsaves_supported()) {
+       if (cpu_has_vmx_xsaves()) {
                /* Exposing XSAVES only when XSAVE is exposed */
                bool xsaves_enabled =
                        boot_cpu_has(X86_FEATURE_XSAVE) &&
@@ -4163,101 +4246,29 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 
                vcpu->arch.xsaves_enabled = xsaves_enabled;
 
-               if (!xsaves_enabled)
-                       exec_control &= ~SECONDARY_EXEC_XSAVES;
-
-               if (nested) {
-                       if (xsaves_enabled)
-                               vmx->nested.msrs.secondary_ctls_high |=
-                                       SECONDARY_EXEC_XSAVES;
-                       else
-                               vmx->nested.msrs.secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_XSAVES;
-               }
-       }
-
-       if (cpu_has_vmx_rdtscp()) {
-               bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
-               if (!rdtscp_enabled)
-                       exec_control &= ~SECONDARY_EXEC_RDTSCP;
-
-               if (nested) {
-                       if (rdtscp_enabled)
-                               vmx->nested.msrs.secondary_ctls_high |=
-                                       SECONDARY_EXEC_RDTSCP;
-                       else
-                               vmx->nested.msrs.secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_RDTSCP;
-               }
-       }
-
-       if (cpu_has_vmx_invpcid()) {
-               /* Exposing INVPCID only when PCID is exposed */
-               bool invpcid_enabled =
-                       guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
-                       guest_cpuid_has(vcpu, X86_FEATURE_PCID);
-
-               if (!invpcid_enabled) {
-                       exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
-                       guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
-               }
-
-               if (nested) {
-                       if (invpcid_enabled)
-                               vmx->nested.msrs.secondary_ctls_high |=
-                                       SECONDARY_EXEC_ENABLE_INVPCID;
-                       else
-                               vmx->nested.msrs.secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_ENABLE_INVPCID;
-               }
-       }
-
-       if (vmx_rdrand_supported()) {
-               bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
-               if (rdrand_enabled)
-                       exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
-
-               if (nested) {
-                       if (rdrand_enabled)
-                               vmx->nested.msrs.secondary_ctls_high |=
-                                       SECONDARY_EXEC_RDRAND_EXITING;
-                       else
-                               vmx->nested.msrs.secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_RDRAND_EXITING;
-               }
+               vmx_adjust_secondary_exec_control(vmx, &exec_control,
+                                                 SECONDARY_EXEC_XSAVES,
+                                                 xsaves_enabled, false);
        }
 
-       if (vmx_rdseed_supported()) {
-               bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
-               if (rdseed_enabled)
-                       exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
+       vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
 
-               if (nested) {
-                       if (rdseed_enabled)
-                               vmx->nested.msrs.secondary_ctls_high |=
-                                       SECONDARY_EXEC_RDSEED_EXITING;
-                       else
-                               vmx->nested.msrs.secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_RDSEED_EXITING;
-               }
-       }
+       /*
+        * Expose INVPCID if and only if PCID is also exposed to the guest.
+        * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
+        * if CR4.PCIDE=0.  Enumerating CPUID.INVPCID=1 would lead to incorrect
+        * behavior from the guest perspective (it would expect #GP or #PF).
+        */
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
+               guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
+       vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
 
-       if (vmx_waitpkg_supported()) {
-               bool waitpkg_enabled =
-                       guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG);
 
-               if (!waitpkg_enabled)
-                       exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+       vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
+       vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
 
-               if (nested) {
-                       if (waitpkg_enabled)
-                               vmx->nested.msrs.secondary_ctls_high |=
-                                       SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
-                       else
-                               vmx->nested.msrs.secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
-               }
-       }
+       vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
+                                   ENABLE_USR_WAIT_PAUSE, false);
 
        vmx->secondary_exec_control = exec_control;
 }
@@ -4350,7 +4361,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
        if (vmx->vpid != 0)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
-       if (vmx_xsaves_supported())
+       if (cpu_has_vmx_xsaves())
                vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
 
        if (enable_pml) {
@@ -4659,7 +4670,7 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
                        vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
                        return false;
-               /* fall through */
+               fallthrough;
        case DB_VECTOR:
                return !(vcpu->guest_debug &
                        (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
@@ -4833,7 +4844,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
                }
                kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
                kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
-               /* fall through */
+               fallthrough;
        case BP_VECTOR:
                /*
                 * Update instruction length as we may reinject #BP from
@@ -5154,7 +5165,8 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
 
 static int handle_invd(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_instruction(vcpu, 0);
+       /* Treat an INVD instruction as a NOP and just skip it. */
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
 static int handle_invlpg(struct kvm_vcpu *vcpu)
@@ -5263,7 +5275,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
                                error_code =
                                        vmcs_read32(IDT_VECTORING_ERROR_CODE);
                        }
-                       /* fall through */
+                       fallthrough;
                case INTR_TYPE_SOFT_EXCEPTION:
                        kvm_clear_exception_queue(vcpu);
                        break;
@@ -5337,7 +5349,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
         * would also use advanced VM-exit information for EPT violations to
         * reconstruct the page fault error code.
         */
-       if (unlikely(allow_smaller_maxphyaddr && kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+       if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
                return kvm_emulate_instruction(vcpu, 0);
 
        return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@ -5448,25 +5460,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
        }
 }
 
-/*
- * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
- */
-static void wakeup_handler(void)
-{
-       struct kvm_vcpu *vcpu;
-       int cpu = smp_processor_id();
-
-       spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-       list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
-                       blocked_vcpu_list) {
-               struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-               if (pi_test_on(pi_desc) == 1)
-                       kvm_vcpu_kick(vcpu);
-       }
-       spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-}
-
 static void vmx_enable_tdp(void)
 {
        kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -5530,16 +5523,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
 {
        u32 vmx_instruction_info;
        unsigned long type;
-       bool pcid_enabled;
        gva_t gva;
-       struct x86_exception e;
-       unsigned i;
-       unsigned long roots_to_free = 0;
        struct {
                u64 pcid;
                u64 gla;
        } operand;
-       int r;
 
        if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
                kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5562,68 +5550,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
                                sizeof(operand), &gva))
                return 1;
 
-       r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
-       if (r != X86EMUL_CONTINUE)
-               return vmx_handle_memory_failure(vcpu, r, &e);
-
-       if (operand.pcid >> 12 != 0) {
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
-
-       pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
-
-       switch (type) {
-       case INVPCID_TYPE_INDIV_ADDR:
-               if ((!pcid_enabled && (operand.pcid != 0)) ||
-                   is_noncanonical_address(operand.gla, vcpu)) {
-                       kvm_inject_gp(vcpu, 0);
-                       return 1;
-               }
-               kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
-               return kvm_skip_emulated_instruction(vcpu);
-
-       case INVPCID_TYPE_SINGLE_CTXT:
-               if (!pcid_enabled && (operand.pcid != 0)) {
-                       kvm_inject_gp(vcpu, 0);
-                       return 1;
-               }
-
-               if (kvm_get_active_pcid(vcpu) == operand.pcid) {
-                       kvm_mmu_sync_roots(vcpu);
-                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-               }
-
-               for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
-                           == operand.pcid)
-                               roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
-               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
-               /*
-                * If neither the current cr3 nor any of the prev_roots use the
-                * given PCID, then nothing needs to be done here because a
-                * resync will happen anyway before switching to any other CR3.
-                */
-
-               return kvm_skip_emulated_instruction(vcpu);
-
-       case INVPCID_TYPE_ALL_NON_GLOBAL:
-               /*
-                * Currently, KVM doesn't mark global entries in the shadow
-                * page tables, so a non-global flush just degenerates to a
-                * global flush. If needed, we could optimize this later by
-                * keeping track of global entries in shadow page tables.
-                */
-
-               /* fall-through */
-       case INVPCID_TYPE_ALL_INCL_GLOBAL:
-               kvm_mmu_unload(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
-
-       default:
-               BUG(); /* We have already checked above that type <= 3 */
-       }
+       return kvm_handle_invpcid(vcpu, type, gva);
 }
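
Editor's note: the per-type INVPCID handling removed above is funnelled into a helper shared with the rest of x86 KVM. Below is a condensed reconstruction of what that helper presumably does, based purely on the logic deleted here; it is a sketch, not the actual common implementation, and kvm_handle_memory_failure() is an assumed common counterpart of the VMX-only vmx_handle_memory_failure() used before.

/*
 * Sketch of the shared helper, reconstructed from the code removed above.
 */
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
{
	unsigned long roots_to_free = 0;
	struct x86_exception e;
	bool pcid_enabled;
	unsigned i;
	struct {
		u64 pcid;
		u64 gla;
	} operand;
	int r;

	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e); /* assumed name */

	/* Bits 63:12 of the descriptor's PCID field are reserved. */
	if (operand.pcid >> 12 != 0) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);

	switch (type) {
	case INVPCID_TYPE_INDIV_ADDR:
		if ((!pcid_enabled && operand.pcid != 0) ||
		    is_noncanonical_address(operand.gla, vcpu)) {
			kvm_inject_gp(vcpu, 0);
			return 1;
		}
		kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
		return kvm_skip_emulated_instruction(vcpu);

	case INVPCID_TYPE_SINGLE_CTXT:
		if (!pcid_enabled && operand.pcid != 0) {
			kvm_inject_gp(vcpu, 0);
			return 1;
		}
		if (kvm_get_active_pcid(vcpu) == operand.pcid) {
			kvm_mmu_sync_roots(vcpu);
			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
		}
		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
			if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) ==
			    operand.pcid)
				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
		return kvm_skip_emulated_instruction(vcpu);

	case INVPCID_TYPE_ALL_NON_GLOBAL:
		/* A non-global flush degenerates to a full flush, as noted above. */
		fallthrough;
	case INVPCID_TYPE_ALL_INCL_GLOBAL:
		kvm_mmu_unload(vcpu);
		return kvm_skip_emulated_instruction(vcpu);

	default:
		BUG(); /* The caller has already checked that type <= 3. */
	}
}
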
 
 static int handle_pml_full(struct kvm_vcpu *vcpu)
@@ -5752,10 +5679,24 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 static const int kvm_vmx_max_exit_handlers =
        ARRAY_SIZE(kvm_vmx_exit_handlers);
 
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+                             u32 *intr_info, u32 *error_code)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
        *info1 = vmx_get_exit_qual(vcpu);
-       *info2 = vmx_get_intr_info(vcpu);
+       if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+               *info2 = vmx->idt_vectoring_info;
+               *intr_info = vmx_get_intr_info(vcpu);
+               if (is_exception_with_error_code(*intr_info))
+                       *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+               else
+                       *error_code = 0;
+       } else {
+               *info2 = 0;
+               *intr_info = 0;
+               *error_code = 0;
+       }
 }
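
Editor's note: the widened accessor now reports the IDT-vectoring info via *info2 and hands back the interrupt info and its error code separately, zeroing everything but the exit qualification on failed VM-Entry. A hypothetical call site, purely for illustration of the new signature:

/* Hypothetical caller, illustration only. */
static void example_dump_exit(struct kvm_vcpu *vcpu)
{
	u64 exit_qual, vect_info;
	u32 intr_info, error_code;

	vmx_get_exit_info(vcpu, &exit_qual, &vect_info, &intr_info, &error_code);
	pr_debug("qual=0x%llx vect=0x%llx intr=0x%x err=0x%x\n",
		 exit_qual, vect_info, intr_info, error_code);
}
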
 
 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
@@ -6389,14 +6330,6 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
        return max_irr;
 }
 
-static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-       return pi_test_on(pi_desc) ||
-               (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
-}
-
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
        if (!kvm_vcpu_apicv_active(vcpu))
@@ -6416,70 +6349,43 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
        memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
 }
 
+void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
+
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+{
+       unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+       gate_desc *desc = (gate_desc *)host_idt_base + vector;
+
+       kvm_before_interrupt(vcpu);
+       vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+       kvm_after_interrupt(vcpu);
+}
+
 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
 {
        u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
 
        /* if exit due to PF check for async PF */
-       if (is_page_fault(intr_info)) {
+       if (is_page_fault(intr_info))
                vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
        /* Handle machine checks before interrupts are enabled */
-       } else if (is_machine_check(intr_info)) {
+       else if (is_machine_check(intr_info))
                kvm_machine_check();
        /* We need to handle NMIs before interrupts are enabled */
-       } else if (is_nmi(intr_info)) {
-               kvm_before_interrupt(&vmx->vcpu);
-               asm("int $2");
-               kvm_after_interrupt(&vmx->vcpu);
-       }
+       else if (is_nmi(intr_info))
+               handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
 }
 
 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 {
-       unsigned int vector;
-       unsigned long entry;
-#ifdef CONFIG_X86_64
-       unsigned long tmp;
-#endif
-       gate_desc *desc;
        u32 intr_info = vmx_get_intr_info(vcpu);
 
        if (WARN_ONCE(!is_external_intr(intr_info),
            "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
                return;
 
-       vector = intr_info & INTR_INFO_VECTOR_MASK;
-       desc = (gate_desc *)host_idt_base + vector;
-       entry = gate_offset(desc);
-
-       kvm_before_interrupt(vcpu);
-
-       asm volatile(
-#ifdef CONFIG_X86_64
-               "mov %%rsp, %[sp]\n\t"
-               "and $-16, %%rsp\n\t"
-               "push %[ss]\n\t"
-               "push %[sp]\n\t"
-#endif
-               "pushf\n\t"
-               "push %[cs]\n\t"
-               CALL_NOSPEC
-               :
-#ifdef CONFIG_X86_64
-               [sp]"=&r"(tmp),
-#endif
-               ASM_CALL_CONSTRAINT
-               :
-               [thunk_target]"r"(entry),
-#ifdef CONFIG_X86_64
-               [ss]"i"(__KERNEL_DS),
-#endif
-               [cs]"i"(__KERNEL_CS)
-       );
-
-       kvm_after_interrupt(vcpu);
+       handle_interrupt_nmi_irqoff(vcpu, intr_info);
 }
-STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
 
 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
@@ -6585,7 +6491,7 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
                break;
        case INTR_TYPE_SOFT_EXCEPTION:
                vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
-               /* fall through */
+               fallthrough;
        case INTR_TYPE_HARD_EXCEPTION:
                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
                        u32 err = vmcs_read32(error_code_field);
@@ -6595,7 +6501,7 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
                break;
        case INTR_TYPE_SOFT_INTR:
                vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
-               /* fall through */
+               fallthrough;
        case INTR_TYPE_EXT_INTR:
                kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
                break;
@@ -6806,9 +6712,7 @@ reenter_guest:
        if (enable_preemption_timer)
                vmx_update_hv_timer(vcpu);
 
-       if (lapic_in_kernel(vcpu) &&
-               vcpu->arch.apic->lapic_timer.timer_advance_ns)
-               kvm_wait_lapic_expire(vcpu);
+       kvm_wait_lapic_expire(vcpu);
 
        /*
         * If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -6952,20 +6856,20 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
                        goto free_vpid;
        }
 
-       BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS);
+       BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
 
-       for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
-               u32 index = vmx_msr_index[i];
+       for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
+               u32 index = vmx_uret_msrs_list[i];
                u32 data_low, data_high;
-               int j = vmx->nmsrs;
+               int j = vmx->nr_uret_msrs;
 
                if (rdmsr_safe(index, &data_low, &data_high) < 0)
                        continue;
                if (wrmsr_safe(index, data_low, data_high) < 0)
                        continue;
 
-               vmx->guest_msrs[j].index = i;
-               vmx->guest_msrs[j].data = 0;
+               vmx->guest_uret_msrs[j].slot = i;
+               vmx->guest_uret_msrs[j].data = 0;
                switch (index) {
                case MSR_IA32_TSX_CTRL:
                        /*
@@ -6973,32 +6877,36 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
                         * let's avoid changing CPUID bits under the host
                         * kernel's feet.
                         */
-                       vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+                       vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
                        break;
                default:
-                       vmx->guest_msrs[j].mask = -1ull;
+                       vmx->guest_uret_msrs[j].mask = -1ull;
                        break;
                }
-               ++vmx->nmsrs;
+               ++vmx->nr_uret_msrs;
        }
 
        err = alloc_loaded_vmcs(&vmx->vmcs01);
        if (err < 0)
                goto free_pml;
 
+       /* The shadow MSR intercept bitmaps start with all ones, i.e. every MSR intercepted */
+       bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+       bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+
        msr_bitmap = vmx->vmcs01.msr_bitmap;
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+       vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
        if (kvm_cstate_in_guest(vcpu->kvm)) {
-               vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
-               vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
-               vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
-               vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+               vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+               vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+               vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+               vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
        }
        vmx->msr_bitmap_mode = 0;
 
@@ -7022,8 +6930,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        }
 
        if (nested)
-               nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
-                                          vmx_capability.ept);
+               memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
        else
                memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
 
@@ -7343,13 +7250,18 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                update_intel_pt_cfg(vcpu);
 
        if (boot_cpu_has(X86_FEATURE_RTM)) {
-               struct shared_msr_entry *msr;
-               msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
+               struct vmx_uret_msr *msr;
+               msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
                if (msr) {
                        bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
-                       vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
+                       vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
                }
        }
+
+       set_cr4_guest_host_mask(vmx);
+
+       /* Refresh #PF interception to account for MAXPHYADDR changes. */
+       update_exception_bitmap(vcpu);
 }
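
Editor's note: refreshing the exception bitmap here matters because, with allow_smaller_maxphyaddr, whether #PF must be intercepted depends on the guest's CPUID-reported MAXPHYADDR, which may have just changed. A sketch of the kind of predicate such an update would consult; the helper name and exact conditions are assumptions for illustration:

/* Assumed predicate, illustration only: intercept #PF when the guest
 * advertises a smaller MAXPHYADDR than the host, so KVM can emulate the
 * reserved-bit page-fault semantics the guest expects. */
static bool example_need_pf_intercept(struct kvm_vcpu *vcpu)
{
	if (!enable_ept)
		return true;

	return allow_smaller_maxphyaddr &&
	       cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
}
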
 
 static __init void vmx_set_cpu_caps(void)
@@ -7373,14 +7285,14 @@ static __init void vmx_set_cpu_caps(void)
 
        /* CPUID 0xD.1 */
        supported_xss = 0;
-       if (!vmx_xsaves_supported())
+       if (!cpu_has_vmx_xsaves())
                kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
 
        /* CPUID 0x80000001 */
        if (!cpu_has_vmx_rdtscp())
                kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
 
-       if (vmx_waitpkg_supported())
+       if (cpu_has_vmx_waitpkg())
                kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
 }
 
@@ -7436,7 +7348,7 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
         * Because it is marked as EmulateOnUD, we need to intercept it here.
         */
        case x86_intercept_rdtscp:
-               if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+               if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
                        exception->vector = UD_VECTOR;
                        exception->error_code_valid = false;
                        return X86EMUL_PROPAGATE_FAULT;
@@ -7568,107 +7480,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
        kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
 }
 
-static void __pi_post_block(struct kvm_vcpu *vcpu)
-{
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-       struct pi_desc old, new;
-       unsigned int dest;
-
-       do {
-               old.control = new.control = pi_desc->control;
-               WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
-                    "Wakeup handler not enabled while the VCPU is blocked\n");
-
-               dest = cpu_physical_id(vcpu->cpu);
-
-               if (x2apic_enabled())
-                       new.ndst = dest;
-               else
-                       new.ndst = (dest << 8) & 0xFF00;
-
-               /* set 'NV' to 'notification vector' */
-               new.nv = POSTED_INTR_VECTOR;
-       } while (cmpxchg64(&pi_desc->control, old.control,
-                          new.control) != old.control);
-
-       if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
-               spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-               list_del(&vcpu->blocked_vcpu_list);
-               spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-               vcpu->pre_pcpu = -1;
-       }
-}
-
-/*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- *   we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- *      'NDST' <-- vcpu->pre_pcpu
- *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- *   interrupt is posted for this vCPU, we cannot block it, in
- *   this case, return 1, otherwise, return 0.
- *
- */
-static int pi_pre_block(struct kvm_vcpu *vcpu)
-{
-       unsigned int dest;
-       struct pi_desc old, new;
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
-               !kvm_vcpu_apicv_active(vcpu))
-               return 0;
-
-       WARN_ON(irqs_disabled());
-       local_irq_disable();
-       if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
-               vcpu->pre_pcpu = vcpu->cpu;
-               spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-               list_add_tail(&vcpu->blocked_vcpu_list,
-                             &per_cpu(blocked_vcpu_on_cpu,
-                                      vcpu->pre_pcpu));
-               spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-       }
-
-       do {
-               old.control = new.control = pi_desc->control;
-
-               WARN((pi_desc->sn == 1),
-                    "Warning: SN field of posted-interrupts "
-                    "is set before blocking\n");
-
-               /*
-                * Since vCPU can be preempted during this process,
-                * vcpu->cpu could be different with pre_pcpu, we
-                * need to set pre_pcpu as the destination of wakeup
-                * notification event, then we can find the right vCPU
-                * to wakeup in wakeup handler if interrupts happen
-                * when the vCPU is in blocked state.
-                */
-               dest = cpu_physical_id(vcpu->pre_pcpu);
-
-               if (x2apic_enabled())
-                       new.ndst = dest;
-               else
-                       new.ndst = (dest << 8) & 0xFF00;
-
-               /* set 'NV' to 'wakeup vector' */
-               new.nv = POSTED_INTR_WAKEUP_VECTOR;
-       } while (cmpxchg64(&pi_desc->control, old.control,
-                          new.control) != old.control);
-
-       /* We should not block the vCPU if an interrupt is posted for it.  */
-       if (pi_test_on(pi_desc) == 1)
-               __pi_post_block(vcpu);
-
-       local_irq_enable();
-       return (vcpu->pre_pcpu == -1);
-}
-
 static int vmx_pre_block(struct kvm_vcpu *vcpu)
 {
        if (pi_pre_block(vcpu))
@@ -7680,17 +7491,6 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void pi_post_block(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->pre_pcpu == -1)
-               return;
-
-       WARN_ON(irqs_disabled());
-       local_irq_disable();
-       __pi_post_block(vcpu);
-       local_irq_enable();
-}
-
 static void vmx_post_block(struct kvm_vcpu *vcpu)
 {
        if (kvm_x86_ops.set_hv_timer)
@@ -7699,100 +7499,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
        pi_post_block(vcpu);
 }
 
-/*
- * vmx_update_pi_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
-                             uint32_t guest_irq, bool set)
-{
-       struct kvm_kernel_irq_routing_entry *e;
-       struct kvm_irq_routing_table *irq_rt;
-       struct kvm_lapic_irq irq;
-       struct kvm_vcpu *vcpu;
-       struct vcpu_data vcpu_info;
-       int idx, ret = 0;
-
-       if (!kvm_arch_has_assigned_device(kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP) ||
-               !kvm_vcpu_apicv_active(kvm->vcpus[0]))
-               return 0;
-
-       idx = srcu_read_lock(&kvm->irq_srcu);
-       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-       if (guest_irq >= irq_rt->nr_rt_entries ||
-           hlist_empty(&irq_rt->map[guest_irq])) {
-               pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
-                            guest_irq, irq_rt->nr_rt_entries);
-               goto out;
-       }
-
-       hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
-               if (e->type != KVM_IRQ_ROUTING_MSI)
-                       continue;
-               /*
-                * VT-d PI cannot support posting multicast/broadcast
-                * interrupts to a vCPU, we still use interrupt remapping
-                * for these kind of interrupts.
-                *
-                * For lowest-priority interrupts, we only support
-                * those with single CPU as the destination, e.g. user
-                * configures the interrupts via /proc/irq or uses
-                * irqbalance to make the interrupts single-CPU.
-                *
-                * We will support full lowest-priority interrupt later.
-                *
-                * In addition, we can only inject generic interrupts using
-                * the PI mechanism, refuse to route others through it.
-                */
-
-               kvm_set_msi_irq(kvm, e, &irq);
-               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
-                   !kvm_irq_is_postable(&irq)) {
-                       /*
-                        * Make sure the IRTE is in remapped mode if
-                        * we don't handle it in posted mode.
-                        */
-                       ret = irq_set_vcpu_affinity(host_irq, NULL);
-                       if (ret < 0) {
-                               printk(KERN_INFO
-                                  "failed to back to remapped mode, irq: %u\n",
-                                  host_irq);
-                               goto out;
-                       }
-
-                       continue;
-               }
-
-               vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
-               vcpu_info.vector = irq.vector;
-
-               trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
-                               vcpu_info.vector, vcpu_info.pi_desc_addr, set);
-
-               if (set)
-                       ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
-               else
-                       ret = irq_set_vcpu_affinity(host_irq, NULL);
-
-               if (ret < 0) {
-                       printk(KERN_INFO "%s: failed to update PI IRTE\n",
-                                       __func__);
-                       goto out;
-               }
-       }
-
-       ret = 0;
-out:
-       srcu_read_unlock(&kvm->irq_srcu, idx);
-       return ret;
-}
-
 static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 {
        if (vcpu->arch.mcg_cap & MCG_LMCE_P)
@@ -7850,11 +7556,6 @@ static void enable_smi_window(struct kvm_vcpu *vcpu)
        /* RSM will cause a vmexit anyway.  */
 }
 
-static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
-{
-       return false;
-}
-
 static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
 {
        return to_vmx(vcpu)->nested.vmxon;
@@ -7961,7 +7662,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
        .sync_pir_to_irr = vmx_sync_pir_to_irr,
        .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
-       .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
+       .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
 
        .set_tss_addr = vmx_set_tss_addr,
        .set_identity_map_addr = vmx_set_identity_map_addr,
@@ -7995,7 +7696,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .pmu_ops = &intel_pmu_ops,
        .nested_ops = &vmx_nested_ops,
 
-       .update_pi_irte = vmx_update_pi_irte,
+       .update_pi_irte = pi_update_irte,
 
 #ifdef CONFIG_X86_64
        .set_hv_timer = vmx_set_hv_timer,
@@ -8009,9 +7710,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .pre_leave_smm = vmx_pre_leave_smm,
        .enable_smi_window = enable_smi_window,
 
-       .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
+       .can_emulate_instruction = vmx_can_emulate_instruction,
        .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
        .migrate_timers = vmx_migrate_timers,
+
+       .msr_filter_changed = vmx_msr_filter_changed,
 };
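
Editor's note: the new .msr_filter_changed hook ties back to the shadow_msr_intercept bitmaps filled in vmx_create_vcpu(): KVM records which possible-passthrough MSRs it wants intercepted so that state can be re-applied whenever userspace installs a new MSR filter. A minimal sketch of such a hook, assuming a vmx_set_intercept_for_msr() helper that updates both the VMCS bitmap and the shadow state (that helper name is an assumption):

/* Minimal sketch, not the actual implementation. */
static void example_msr_filter_changed(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 i;

	/* Re-apply the recorded intercept state for every possible-passthrough MSR. */
	for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
		u32 msr = vmx_possible_passthrough_msrs[i];
		bool read = test_bit(i, vmx->shadow_msr_intercept.read);
		bool write = test_bit(i, vmx->shadow_msr_intercept.write);

		vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);   /* assumed helper */
		vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);  /* assumed helper */
	}
}
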
 
 static __init int hardware_setup(void)
@@ -8023,8 +7726,8 @@ static __init int hardware_setup(void)
        store_idt(&dt);
        host_idt_base = dt.address;
 
-       for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
-               kvm_define_shared_msr(i, vmx_msr_index[i]);
+       for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+               kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
 
        if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
                return -EIO;
@@ -8161,7 +7864,7 @@ static __init int hardware_setup(void)
                vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
        }
 
-       kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+       kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
 
        kvm_mce_cap_supported |= MCG_LMCE_P;
 
@@ -8300,8 +8003,8 @@ static int __init vmx_init(void)
 
        for_each_possible_cpu(cpu) {
                INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
-               INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
-               spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+
+               pi_init(cpu);
        }
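
Editor's note: the per-CPU blocked-vCPU bookkeeping removed throughout this patch now lives with the posted-interrupt code (pi_wakeup_handler(), pi_update_irte(), pi_has_pending_interrupt(), pi_init()). Assuming it moved largely verbatim, pi_init() would simply take over the two initializers vmx_init() used to perform here:

/* Sketch (assumption): the relocated per-CPU init, mirroring the lines removed above. */
void pi_init(int cpu)
{
	INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
	spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
}
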
 
 #ifdef CONFIG_KEXEC_CORE