Merge tag 'kvmarm-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm...
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5de2006..1b6d9ac 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -22,6 +22,7 @@
 #include "i8254.h"
 #include "tss.h"
 #include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
 #include "x86.h"
 #include "cpuid.h"
 #include "pmu.h"
@@ -81,7 +82,7 @@ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
 
 #define emul_to_vcpu(ctxt) \
-       container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+       ((struct kvm_vcpu *)(ctxt)->vcpu)
 
 /* EFER defaults:
  * - enable syscall per default because it's emulated by KVM
@@ -180,7 +181,17 @@ struct kvm_shared_msrs {
 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
 static struct kvm_shared_msrs __percpu *shared_msrs;
 
+#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+                               | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+                               | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+                               | XFEATURE_MASK_PKRU)
+
+u64 __read_mostly host_efer;
+EXPORT_SYMBOL_GPL(host_efer);
+
 static u64 __read_mostly host_xss;
+u64 __read_mostly supported_xss;
+EXPORT_SYMBOL_GPL(supported_xss);
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -226,10 +237,25 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 };
 
 u64 __read_mostly host_xcr0;
+u64 __read_mostly supported_xcr0;
+EXPORT_SYMBOL_GPL(supported_xcr0);
 
 struct kmem_cache *x86_fpu_cache;
 EXPORT_SYMBOL_GPL(x86_fpu_cache);
 
+static struct kmem_cache *x86_emulator_cache;
+
+static struct kmem_cache *kvm_alloc_emulator_cache(void)
+{
+       unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
+       unsigned int size = sizeof(struct x86_emulate_ctxt);
+
+       return kmem_cache_create_usercopy("x86_emulator", size,
+                                         __alignof__(struct x86_emulate_ctxt),
+                                         SLAB_ACCOUNT, useroffset,
+                                         size - useroffset, NULL);
+}
+
 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
@@ -350,6 +376,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        }
 
        kvm_lapic_set_base(vcpu, msr_info->data);
+       kvm_recalculate_apic_map(vcpu->kvm);
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
@@ -903,10 +930,10 @@ static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
 {
        u64 reserved_bits = __cr4_reserved_bits(cpu_has, c);
 
-       if (cpuid_ecx(0x7) & feature_bit(LA57))
+       if (kvm_cpu_cap_has(X86_FEATURE_LA57))
                reserved_bits &= ~X86_CR4_LA57;
 
-       if (kvm_x86_ops->umip_emulated())
+       if (kvm_cpu_cap_has(X86_FEATURE_UMIP))
                reserved_bits &= ~X86_CR4_UMIP;
 
        return reserved_bits;
@@ -1558,8 +1585,12 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
                ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
                ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
 
+               data &= ~(1 << 12);
+               kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
                kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
-               return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
+               kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
+               trace_kvm_apic_write(APIC_ICR, (u32)data);
+               return 0;
        }
 
        return 1;
@@ -1568,11 +1599,12 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
 enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
 {
        u32 msr = kvm_rcx_read(vcpu);
-       u64 data = kvm_read_edx_eax(vcpu);
+       u64 data;
        int ret = 0;
 
        switch (msr) {
        case APIC_BASE_MSR + (APIC_ICR >> 4):
+               data = kvm_read_edx_eax(vcpu);
                ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
                break;
        default:
@@ -2523,7 +2555,7 @@ static void kvmclock_sync_fn(struct work_struct *work)
 static bool can_set_mci_status(struct kvm_vcpu *vcpu)
 {
        /* McStatusWrEn enabled? */
-       if (guest_cpuid_is_amd(vcpu))
+       if (guest_cpuid_is_amd_or_hygon(vcpu))
                return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
 
        return false;
@@ -2798,12 +2830,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
                        return 1;
                /*
-                * We do support PT if kvm_x86_ops->pt_supported(), but we do
-                * not support IA32_XSS[bit 8]. Guests will have to use
-                * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT
-                * MSRs.
+                * KVM supports exposing PT to the guest, but does not support
+                * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
+                * XSAVES/XRSTORS to save/restore PT MSRs.
                 */
-               if (data != 0)
+               if (data & ~supported_xss)
                        return 1;
                vcpu->arch.ia32_xss = data;
                break;
@@ -3077,7 +3108,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
                return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
-               break;
        case MSR_IA32_TSCDEADLINE:
                msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
                break;
@@ -3160,7 +3190,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                return kvm_hv_get_msr_common(vcpu,
                                             msr_info->index, &msr_info->data,
                                             msr_info->host_initiated);
-               break;
        case MSR_IA32_BBL_CR_CTL3:
                /* This legacy MSR exists but isn't fully documented in current
                 * silicon.  It is however accessed by winxp in very narrow
@@ -3464,7 +3493,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
                r = 0;
                break;
        }
-       case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+       case KVM_X86_GET_MCE_CAP_SUPPORTED:
                r = -EFAULT;
                if (copy_to_user(argp, &kvm_mce_cap_supported,
                                 sizeof(kvm_mce_cap_supported)))
@@ -3496,9 +3525,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
        case KVM_GET_MSRS:
                r = msr_io(NULL, argp, do_get_msr_feature, 1);
                break;
-       }
        default:
                r = -EINVAL;
+               break;
        }
 out:
        return r;
@@ -4101,8 +4130,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
                 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
                 * with old userspace.
                 */
-               if (xstate_bv & ~kvm_supported_xcr0() ||
-                       mxcsr & ~mxcsr_feature_mask)
+               if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
                        return -EINVAL;
                load_xsave(vcpu, (u8 *)guest_xsave->region);
        } else {
@@ -4761,77 +4789,13 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
        return 0;
 }
 
-/**
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
- * @kvm: kvm instance
- * @log: slot id and address to which we copy the log
- *
- * Steps 1-4 below provide general overview of dirty page logging. See
- * kvm_get_dirty_log_protect() function description for additional details.
- *
- * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
- * always flush the TLB (step 4) even if previous step failed  and the dirty
- * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
- * does not preclude user space subsequent dirty log read. Flushing TLB ensures
- * writes will be marked dirty for next log read.
- *
- *   1. Take a snapshot of the bit and clear it if needed.
- *   2. Write protect the corresponding page.
- *   3. Copy the snapshot to the userspace.
- *   4. Flush TLB's if needed.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
-       bool flush = false;
-       int r;
-
-       mutex_lock(&kvm->slots_lock);
-
        /*
         * Flush potentially hardware-cached dirty pages to dirty_bitmap.
         */
        if (kvm_x86_ops->flush_log_dirty)
                kvm_x86_ops->flush_log_dirty(kvm);
-
-       r = kvm_get_dirty_log_protect(kvm, log, &flush);
-
-       /*
-        * All the TLBs can be flushed out of mmu lock, see the comments in
-        * kvm_mmu_slot_remove_write_access().
-        */
-       lockdep_assert_held(&kvm->slots_lock);
-       if (flush)
-               kvm_flush_remote_tlbs(kvm);
-
-       mutex_unlock(&kvm->slots_lock);
-       return r;
-}
-
-int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
-{
-       bool flush = false;
-       int r;
-
-       mutex_lock(&kvm->slots_lock);
-
-       /*
-        * Flush potentially hardware-cached dirty pages to dirty_bitmap.
-        */
-       if (kvm_x86_ops->flush_log_dirty)
-               kvm_x86_ops->flush_log_dirty(kvm);
-
-       r = kvm_clear_dirty_log_protect(kvm, log, &flush);
-
-       /*
-        * All the TLBs can be flushed out of mmu lock, see the comments in
-        * kvm_mmu_slot_remove_write_access().
-        */
-       lockdep_assert_held(&kvm->slots_lock);
-       if (flush)
-               kvm_flush_remote_tlbs(kvm);
-
-       mutex_unlock(&kvm->slots_lock);
-       return r;
 }
 
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
@@ -5260,28 +5224,28 @@ static void kvm_init_msr_list(void)
                                continue;
                        break;
                case MSR_TSC_AUX:
-                       if (!kvm_x86_ops->rdtscp_supported())
+                       if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
                                continue;
                        break;
                case MSR_IA32_RTIT_CTL:
                case MSR_IA32_RTIT_STATUS:
-                       if (!kvm_x86_ops->pt_supported())
+                       if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
                                continue;
                        break;
                case MSR_IA32_RTIT_CR3_MATCH:
-                       if (!kvm_x86_ops->pt_supported() ||
+                       if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
                            !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
                                continue;
                        break;
                case MSR_IA32_RTIT_OUTPUT_BASE:
                case MSR_IA32_RTIT_OUTPUT_MASK:
-                       if (!kvm_x86_ops->pt_supported() ||
+                       if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
                                (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
                                 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
                                continue;
                        break;
                case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
-                       if (!kvm_x86_ops->pt_supported() ||
+                       if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
                                msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
                                intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
                                continue;
@@ -5738,7 +5702,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
        int handled, ret;
        bool write = ops->write;
        struct kvm_mmio_fragment *frag;
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
 
        /*
         * If the exit was due to a NPF we may already have a GPA.
@@ -5747,10 +5711,9 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
         * operation using rep will only have the initial GPA from when the
         * NPF occurred.
         */
-       if (vcpu->arch.gpa_available &&
-           emulator_can_use_gpa(ctxt) &&
-           (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
-               gpa = vcpu->arch.gpa_val;
+       if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
+           (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
+               gpa = ctxt->gpa_val;
                ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
        } else {
                ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
@@ -5970,11 +5933,9 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
        return 0;
 }
 
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
-                                   int size, unsigned short port, void *val,
-                                   unsigned int count)
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+                          unsigned short port, void *val, unsigned int count)
 {
-       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        int ret;
 
        if (vcpu->arch.pio.count)
@@ -5994,17 +5955,29 @@ data_avail:
        return 0;
 }
 
-static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
-                                    int size, unsigned short port,
-                                    const void *val, unsigned int count)
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+                                   int size, unsigned short port, void *val,
+                                   unsigned int count)
 {
-       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+       return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
+}
 
+static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
+                           unsigned short port, const void *val,
+                           unsigned int count)
+{
        memcpy(vcpu->arch.pio_data, val, size * count);
        trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
        return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
 }
 
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+                                    int size, unsigned short port,
+                                    const void *val, unsigned int count)
+{
+       return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
+}
+
 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
        return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -6267,13 +6241,15 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
                              struct x86_instruction_info *info,
                              enum x86_intercept_stage stage)
 {
-       return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
+       return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage,
+                                           &ctxt->exception);
 }
 
 static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
-                       u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
+                             u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
+                             bool exact_only)
 {
-       return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
+       return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
 }
 
 static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
@@ -6400,7 +6376,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
 
 static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        if (ctxt->exception.vector == PF_VECTOR)
                return kvm_propagate_fault(vcpu, &ctxt->exception);
 
@@ -6412,13 +6388,31 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
        return false;
 }
 
+static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+       struct x86_emulate_ctxt *ctxt;
+
+       ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
+       if (!ctxt) {
+               pr_err("kvm: failed to allocate vcpu's emulator\n");
+               return NULL;
+       }
+
+       ctxt->vcpu = vcpu;
+       ctxt->ops = &emulate_ops;
+       vcpu->arch.emulate_ctxt = ctxt;
+
+       return ctxt;
+}
+
 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 {
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int cs_db, cs_l;
 
        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
+       ctxt->gpa_available = false;
        ctxt->eflags = kvm_get_rflags(vcpu);
        ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
 
@@ -6438,7 +6432,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 
 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 {
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int ret;
 
        init_emulate_ctxt(vcpu);
@@ -6494,10 +6488,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        gpa_t gpa = cr2_or_gpa;
        kvm_pfn_t pfn;
 
-       if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+       if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
                return false;
 
-       if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+       if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+           WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
                return false;
 
        if (!vcpu->arch.mmu->direct_map) {
@@ -6585,10 +6580,11 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
         */
        vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
 
-       if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+       if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
                return false;
 
-       if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+       if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+           WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
                return false;
 
        if (x86_page_table_writing_insn(ctxt))
@@ -6751,7 +6747,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                            int emulation_type, void *insn, int insn_len)
 {
        int r;
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        bool writeback = true;
        bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
@@ -6841,8 +6837,19 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        }
 
 restart:
-       /* Save the faulting GPA (cr2) in the address field */
-       ctxt->exception.address = cr2_or_gpa;
+       if (emulation_type & EMULTYPE_PF) {
+               /* Save the faulting GPA (cr2) in the address field */
+               ctxt->exception.address = cr2_or_gpa;
+
+               /* With shadow page tables, cr2 contains a GVA or nGPA. */
+               if (vcpu->arch.mmu->direct_map) {
+                       ctxt->gpa_available = true;
+                       ctxt->gpa_val = cr2_or_gpa;
+               }
+       } else {
+               /* Sanitize the address out of an abundance of paranoia. */
+               ctxt->exception.address = 0;
+       }
 
        r = x86_emulate_insn(ctxt);
 
@@ -6943,8 +6950,8 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
                            unsigned short port)
 {
        unsigned long val = kvm_rax_read(vcpu);
-       int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
-                                           size, port, &val, 1);
+       int ret = emulator_pio_out(vcpu, size, port, &val, 1);
+
        if (ret)
                return ret;
 
@@ -6980,11 +6987,10 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
        val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
 
        /*
-        * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
+        * Since vcpu->arch.pio.count == 1, let emulator_pio_in perform
         * the copy and tracing
         */
-       emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
-                                vcpu->arch.pio.port, &val, 1);
+       emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
        kvm_rax_write(vcpu, val);
 
        return kvm_skip_emulated_instruction(vcpu);
@@ -6999,8 +7005,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
        /* For size less than 4 we merge, else we zero extend */
        val = (size < 4) ? kvm_rax_read(vcpu) : 0;
 
-       ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
-                                      &val, 1);
+       ret = emulator_pio_in(vcpu, size, port, &val, 1);
        if (ret) {
                kvm_rax_write(vcpu, val);
                return ret;
@@ -7338,10 +7343,16 @@ int kvm_arch_init(void *opaque)
                goto out;
        }
 
+       x86_emulator_cache = kvm_alloc_emulator_cache();
+       if (!x86_emulator_cache) {
+               pr_err("kvm: failed to allocate cache for x86 emulator\n");
+               goto out_free_x86_fpu_cache;
+       }
+
        shared_msrs = alloc_percpu(struct kvm_shared_msrs);
        if (!shared_msrs) {
                printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
-               goto out_free_x86_fpu_cache;
+               goto out_free_x86_emulator_cache;
        }
 
        r = kvm_mmu_module_init();
@@ -7357,8 +7368,10 @@ int kvm_arch_init(void *opaque)
 
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
 
-       if (boot_cpu_has(X86_FEATURE_XSAVE))
+       if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+               supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+       }
 
        kvm_lapic_init();
        if (pi_inject_timer == -1)
@@ -7374,6 +7387,8 @@ int kvm_arch_init(void *opaque)
 
 out_free_percpu:
        free_percpu(shared_msrs);
+out_free_x86_emulator_cache:
+       kmem_cache_destroy(x86_emulator_cache);
 out_free_x86_fpu_cache:
        kmem_cache_destroy(x86_fpu_cache);
 out:
@@ -7631,7 +7646,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
+static int inject_pending_event(struct kvm_vcpu *vcpu)
 {
        int r;
 
@@ -7667,7 +7682,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
         * from L2 to L1.
         */
        if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
-               r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+               r = kvm_x86_ops->check_nested_events(vcpu);
                if (r != 0)
                        return r;
        }
@@ -7729,7 +7744,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
                 * KVM_REQ_EVENT only on certain events and not unconditionally?
                 */
                if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
-                       r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+                       r = kvm_x86_ops->check_nested_events(vcpu);
                        if (r != 0)
                                return r;
                }
@@ -8039,19 +8054,26 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
  */
 void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
 {
+       unsigned long old, new, expected;
+
        if (!kvm_x86_ops->check_apicv_inhibit_reasons ||
            !kvm_x86_ops->check_apicv_inhibit_reasons(bit))
                return;
 
-       if (activate) {
-               if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
-                   !kvm_apicv_activated(kvm))
-                       return;
-       } else {
-               if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
-                   kvm_apicv_activated(kvm))
-                       return;
-       }
+       old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
+       do {
+               expected = new = old;
+               if (activate)
+                       __clear_bit(bit, &new);
+               else
+                       __set_bit(bit, &new);
+               if (new == old)
+                       break;
+               old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
+       } while (old != expected);
+
+       if (!!old == !!new)
+               return;
 
        trace_kvm_apicv_update_request(activate, bit);
        if (kvm_x86_ops->pre_update_apicv_exec_ctrl)
@@ -8176,8 +8198,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                }
                if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
                        kvm_mmu_sync_roots(vcpu);
-               if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
-                       kvm_mmu_load_cr3(vcpu);
+               if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
+                       kvm_mmu_load_pgd(vcpu);
                if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
                        kvm_vcpu_flush_tlb(vcpu, true);
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
@@ -8262,7 +8284,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        goto out;
                }
 
-               if (inject_pending_event(vcpu, req_int_win) != 0)
+               if (inject_pending_event(vcpu) != 0)
                        req_immediate_exit = true;
                else {
                        /* Enable SMI/NMI/IRQ window open exits if needed.
@@ -8443,7 +8465,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (vcpu->arch.apic_attention)
                kvm_lapic_sync_from_vapic(vcpu);
 
-       vcpu->arch.gpa_available = false;
        r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath);
        return r;
 
@@ -8484,7 +8505,6 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
                break;
        default:
                return -EINTR;
-               break;
        }
        return 1;
 }
@@ -8492,7 +8512,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
        if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
-               kvm_x86_ops->check_nested_events(vcpu, false);
+               kvm_x86_ops->check_nested_events(vcpu);
 
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
@@ -8753,7 +8773,7 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
                 * that usually, but some badly designed PV devices (vmware
                 * backdoor interface) need this to work
                 */
-               emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
+               emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
        }
        regs->rax = kvm_rax_read(vcpu);
@@ -8939,7 +8959,7 @@ out:
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
                    int reason, bool has_error_code, u32 error_code)
 {
-       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int ret;
 
        init_emulate_ctxt(vcpu);
@@ -9271,7 +9291,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        struct page *page;
        int r;
 
-       vcpu->arch.emulate_ctxt.ops = &emulate_ops;
        if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        else
@@ -9309,11 +9328,14 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
                                GFP_KERNEL_ACCOUNT))
                goto fail_free_mce_banks;
 
+       if (!alloc_emulate_ctxt(vcpu))
+               goto free_wbinvd_dirty_mask;
+
        vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
                                                GFP_KERNEL_ACCOUNT);
        if (!vcpu->arch.user_fpu) {
                pr_err("kvm: failed to allocate userspace's fpu\n");
-               goto free_wbinvd_dirty_mask;
+               goto free_emulate_ctxt;
        }
 
        vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
@@ -9355,6 +9377,8 @@ free_guest_fpu:
        kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
 free_user_fpu:
        kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+free_emulate_ctxt:
+       kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
 free_wbinvd_dirty_mask:
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 fail_free_mce_banks:
@@ -9389,11 +9413,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
        mutex_unlock(&vcpu->mutex);
 
-       if (!kvmclock_periodic_sync)
-               return;
-
-       schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
-                                       KVMCLOCK_SYNC_PERIOD);
+       if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
+               schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+                                               KVMCLOCK_SYNC_PERIOD);
 }
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -9407,6 +9429,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
        kvm_x86_ops->vcpu_free(vcpu);
 
+       kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
        kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
        kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
@@ -9607,10 +9630,18 @@ int kvm_arch_hardware_setup(void)
 {
        int r;
 
+       rdmsrl_safe(MSR_EFER, &host_efer);
+
+       if (boot_cpu_has(X86_FEATURE_XSAVES))
+               rdmsrl(MSR_IA32_XSS, host_xss);
+
        r = kvm_x86_ops->hardware_setup();
        if (r != 0)
                return r;
 
+       if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+               supported_xss = 0;
+
        cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
 
        if (kvm_has_tsc_control) {
@@ -9627,9 +9658,6 @@ int kvm_arch_hardware_setup(void)
                kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
        }
 
-       if (boot_cpu_has(X86_FEATURE_XSAVES))
-               rdmsrl(MSR_IA32_XSS, host_xss);
-
        kvm_init_msr_list();
        return 0;
 }
@@ -9677,6 +9705,12 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
        kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+       kfree(kvm->arch.hyperv.hv_pa_pg);
+       vfree(kvm);
+}
+
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
        if (type)
@@ -9759,9 +9794,9 @@ void kvm_arch_sync_events(struct kvm *kvm)
 int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 {
        int i, r;
-       unsigned long hva;
+       unsigned long hva, uninitialized_var(old_npages);
        struct kvm_memslots *slots = kvm_memslots(kvm);
-       struct kvm_memory_slot *slot, old;
+       struct kvm_memory_slot *slot;
 
        /* Called with kvm->slots_lock held.  */
        if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
@@ -9769,7 +9804,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 
        slot = id_to_memslot(slots, id);
        if (size) {
-               if (slot->npages)
+               if (slot && slot->npages)
                        return -EEXIST;
 
                /*
@@ -9781,13 +9816,18 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
                if (IS_ERR((void *)hva))
                        return PTR_ERR((void *)hva);
        } else {
-               if (!slot->npages)
+               if (!slot || !slot->npages)
                        return 0;
 
-               hva = 0;
+               /*
+                * Stuff a non-canonical value to catch use-after-delete.  This
+                * ends up being 0 on 32-bit KVM, but there's no better
+                * alternative.
+                */
+               hva = (unsigned long)(0xdeadull << 48);
+               old_npages = slot->npages;
        }
 
-       old = *slot;
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                struct kvm_userspace_memory_region m;
 
@@ -9802,7 +9842,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
        }
 
        if (!size)
-               vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
+               vm_munmap(hva, old_npages * PAGE_SIZE);
 
        return 0;
 }
@@ -9841,34 +9881,36 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kvm_hv_destroy_vm(kvm);
 }
 
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-                          struct kvm_memory_slot *dont)
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
        int i;
 
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-               if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
-                       kvfree(free->arch.rmap[i]);
-                       free->arch.rmap[i] = NULL;
-               }
+               kvfree(slot->arch.rmap[i]);
+               slot->arch.rmap[i] = NULL;
+
                if (i == 0)
                        continue;
 
-               if (!dont || free->arch.lpage_info[i - 1] !=
-                            dont->arch.lpage_info[i - 1]) {
-                       kvfree(free->arch.lpage_info[i - 1]);
-                       free->arch.lpage_info[i - 1] = NULL;
-               }
+               kvfree(slot->arch.lpage_info[i - 1]);
+               slot->arch.lpage_info[i - 1] = NULL;
        }
 
-       kvm_page_track_free_memslot(free, dont);
+       kvm_page_track_free_memslot(slot);
 }
 
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
-                           unsigned long npages)
+static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
+                                     unsigned long npages)
 {
        int i;
 
+       /*
+        * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
+        * old arrays will be freed by __kvm_set_memory_region() if installing
+        * the new memslot is successful.
+        */
+       memset(&slot->arch, 0, sizeof(slot->arch));
+
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                struct kvm_lpage_info *linfo;
                unsigned long ugfn;
@@ -9899,11 +9941,9 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
                ugfn = slot->userspace_addr >> PAGE_SHIFT;
                /*
                 * If the gfn and userspace address are not aligned wrt each
-                * other, or if explicitly asked to, disable large page
-                * support for this slot
+                * other, disable large page support for this slot.
                 */
-               if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
-                   !kvm_largepages_enabled()) {
+               if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
                        unsigned long j;
 
                        for (j = 0; j < lpages; ++j)
@@ -9950,6 +9990,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                const struct kvm_userspace_memory_region *mem,
                                enum kvm_mr_change change)
 {
+       if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
+               return kvm_alloc_memslot_metadata(memslot,
+                                                 mem->memory_size >> PAGE_SHIFT);
        return 0;
 }
 
@@ -9958,7 +10001,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 {
        /* Still write protect RO slot */
        if (new->flags & KVM_MEM_READONLY) {
-               kvm_mmu_slot_remove_write_access(kvm, new);
+               kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL);
                return;
        }
 
@@ -9993,10 +10036,23 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
         * See the comments in fast_page_fault().
         */
        if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-               if (kvm_x86_ops->slot_enable_log_dirty)
+               if (kvm_x86_ops->slot_enable_log_dirty) {
                        kvm_x86_ops->slot_enable_log_dirty(kvm, new);
-               else
-                       kvm_mmu_slot_remove_write_access(kvm, new);
+               } else {
+                       int level =
+                               kvm_dirty_log_manual_protect_and_init_set(kvm) ?
+                               PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL;
+
+                       /*
+                        * If we're using initial-all-set, we don't need to
+                        * write-protect any small pages because they're
+                        * reported as dirty already.  However we still need
+                        * to write-protect huge pages so that the page split
+                        * can happen lazily on the first write to the huge
+                        * page.
+                        */
+                       kvm_mmu_slot_remove_write_access(kvm, new, level);
+               }
        } else {
                if (kvm_x86_ops->slot_disable_log_dirty)
                        kvm_x86_ops->slot_disable_log_dirty(kvm, new);
@@ -10005,7 +10061,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                                const struct kvm_userspace_memory_region *mem,
-                               const struct kvm_memory_slot *old,
+                               struct kvm_memory_slot *old,
                                const struct kvm_memory_slot *new,
                                enum kvm_mr_change change)
 {
@@ -10047,6 +10103,10 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
         */
        if (change != KVM_MR_DELETE)
                kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
+
+       /* Free the arrays associated with the old memslot. */
+       if (change == KVM_MR_MOVE)
+               kvm_arch_free_memslot(kvm, old);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -10191,7 +10251,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
                return;
 
        if (!vcpu->arch.mmu->direct_map &&
-             work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
+             work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
                return;
 
        kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
@@ -10514,4 +10574,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);