KVM: x86: Gracefully handle __vmalloc() failure during VM allocation
[platform/kernel/linux-rpi.git] arch/x86/kvm/vmx/vmx.c
index 5d21a4a..e0d16ba 100644
@@ -95,7 +95,7 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
-static bool __read_mostly enable_apicv = 1;
+bool __read_mostly enable_apicv = 1;
 module_param(enable_apicv, bool, S_IRUGO);
 
 /*
@@ -648,43 +648,15 @@ void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
 }
 
 #ifdef CONFIG_KEXEC_CORE
-/*
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
- */
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-
-static inline void crash_enable_local_vmclear(int cpu)
-{
-       cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline void crash_disable_local_vmclear(int cpu)
-{
-       cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline int crash_local_vmclear_enabled(int cpu)
-{
-       return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
 static void crash_vmclear_local_loaded_vmcss(void)
 {
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v;
 
-       if (!crash_local_vmclear_enabled(cpu))
-               return;
-
        list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
                            loaded_vmcss_on_cpu_link)
                vmcs_clear(v->vmcs);
 }
-#else
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
 #endif /* CONFIG_KEXEC_CORE */
 
 static void __loaded_vmcs_clear(void *arg)
@@ -696,19 +668,24 @@ static void __loaded_vmcs_clear(void *arg)
                return; /* vcpu migration can race with cpu offline */
        if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;
-       crash_disable_local_vmclear(cpu);
+
+       vmcs_clear(loaded_vmcs->vmcs);
+       if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+               vmcs_clear(loaded_vmcs->shadow_vmcs);
+
        list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
 
        /*
-        * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
-        * is before setting loaded_vmcs->vcpu to -1 which is done in
-        * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
-        * then adds the vmcs into percpu list before it is deleted.
+        * Ensure all writes to loaded_vmcs, including deleting it from its
+        * current percpu list, complete before setting loaded_vmcs->vcpu to
+        * -1, otherwise a different cpu can see vcpu == -1 first and add
+        * loaded_vmcs to its percpu list before it's deleted from this cpu's
+        * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
         */
        smp_wmb();
 
-       loaded_vmcs_init(loaded_vmcs);
-       crash_enable_local_vmclear(cpu);
+       loaded_vmcs->cpu = -1;
+       loaded_vmcs->launched = 0;
 }
 
 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -1268,6 +1245,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
        if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
                return;
 
+       /*
+        * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+        * PID.NDST: pi_post_block is the one expected to change PID.NDST and
+        * the wakeup handler expects the vCPU to be on the blocked_vcpu_list
+        * that matches PID.NDST.  Otherwise, the vCPU may not be woken up
+        * correctly.
+        */
+       if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
+               pi_clear_sn(pi_desc);
+               goto after_clear_sn;
+       }
+
        /* The full case.  */
        do {
                old.control = new.control = pi_desc->control;
@@ -1283,6 +1272,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
        } while (cmpxchg64(&pi_desc->control, old.control,
                           new.control) != old.control);
 
+after_clear_sn:
+
        /*
         * Clear SN before reading the bitmap.  The VT-d firmware
         * writes the bitmap and reads SN atomically (5.2.3 in the
@@ -1291,7 +1282,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
         */
        smp_mb__after_atomic();
 
-       if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
+       if (!pi_is_pir_empty(pi_desc))
                pi_set_on(pi_desc);
 }
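
The hunk above replaces the open-coded bitmap_empty() test with a pi_is_pir_empty() helper. A minimal sketch of that helper, assuming it sits next to the other pi_* accessors (e.g. arch/x86/kvm/vmx/vmx.h) and simply wraps the same test that was removed:

static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
{
        /* PIR is a 256-bit pending-vector bitmap, one bit per vector. */
        return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
}
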
 
@@ -1303,18 +1294,17 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
        if (!already_loaded) {
                loaded_vmcs_clear(vmx->loaded_vmcs);
                local_irq_disable();
-               crash_disable_local_vmclear(cpu);
 
                /*
-                * Read loaded_vmcs->cpu should be before fetching
-                * loaded_vmcs->loaded_vmcss_on_cpu_link.
-                * See the comments in __loaded_vmcs_clear().
+                * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+                * this cpu's percpu list, otherwise it may not yet be deleted
+                * from its previous cpu's percpu list.  Pairs with the
+                * smp_wmb() in __loaded_vmcs_clear().
                 */
                smp_rmb();
 
                list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
-               crash_enable_local_vmclear(cpu);
                local_irq_enable();
        }
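
The two new comments (here and in __loaded_vmcs_clear() above) describe a standard publish/consume barrier pairing. An illustrative sketch of the pattern, not the actual KVM code:

struct node { struct list_head link; int cpu; };

static void clear_on_old_cpu(struct node *n)        /* cf. __loaded_vmcs_clear() */
{
        list_del(&n->link);   /* remove from the old cpu's per-cpu list         */
        smp_wmb();            /* order the delete before publishing "no owner"  */
        n->cpu = -1;
}

static void load_on_new_cpu(struct node *n, struct list_head *my_list)
{                                                    /* cf. vmx_vcpu_load_vmcs() */
        /* caller has already observed n->cpu == -1 */
        smp_rmb();            /* pairs with the smp_wmb() above                  */
        list_add(&n->link, my_list);
}

The write barrier guarantees the node is off the old list before other cpus can see it as unowned; the read barrier guarantees a cpu that sees it unowned also sees the completed delete, so a VMCS can never sit on two per-cpu lists at once.
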
 
@@ -2126,6 +2116,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_num_address_ranges)))
                        return 1;
+               if (is_noncanonical_address(data, vcpu))
+                       return 1;
                if (index % 2)
                        vmx->pt_desc.guest.addr_b[index / 2] = data;
                else
@@ -2240,17 +2232,6 @@ static int hardware_enable(void)
        INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
        spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 
-       /*
-        * Now we can enable the vmclear operation in kdump
-        * since the loaded_vmcss_on_cpu list on this cpu
-        * has been initialized.
-        *
-        * Though the cpu is not in VMX operation now, there
-        * is no problem to enable the vmclear operation
-        * for the loaded_vmcss_on_cpu list is empty!
-        */
-       crash_enable_local_vmclear(cpu);
-
        rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
        test_bits = FEATURE_CONTROL_LOCKED;
@@ -2959,6 +2940,9 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static int get_ept_level(struct kvm_vcpu *vcpu)
 {
+       /* Nested EPT currently only supports 4-level walks. */
+       if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
+               return 4;
        if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
                return 5;
        return 4;
@@ -2981,6 +2965,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
 void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        struct kvm *kvm = vcpu->kvm;
+       bool update_guest_cr3 = true;
        unsigned long guest_cr3;
        u64 eptp;
 
@@ -2997,15 +2982,18 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                        spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
                }
 
-               if (enable_unrestricted_guest || is_paging(vcpu) ||
-                   is_guest_mode(vcpu))
+               /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
+               if (is_guest_mode(vcpu))
+                       update_guest_cr3 = false;
+               else if (enable_unrestricted_guest || is_paging(vcpu))
                        guest_cr3 = kvm_read_cr3(vcpu);
                else
                        guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
                ept_load_pdptrs(vcpu);
        }
 
-       vmcs_writel(GUEST_CR3, guest_cr3);
+       if (update_guest_cr3)
+               vmcs_writel(GUEST_CR3, guest_cr3);
 }
 
 int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -3830,24 +3818,29 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
  * 2. If target vcpu isn't running(root mode), kick it to pick up the
  * interrupt from PIR in next vmentry.
  */
-static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int r;
 
        r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
        if (!r)
-               return;
+               return 0;
+
+       if (!vcpu->arch.apicv_active)
+               return -1;
 
        if (pi_test_and_set_pir(vector, &vmx->pi_desc))
-               return;
+               return 0;
 
        /* If a previous notification has sent the IPI, nothing to do.  */
        if (pi_test_and_set_on(&vmx->pi_desc))
-               return;
+               return 0;
 
        if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
                kvm_vcpu_kick(vcpu);
+
+       return 0;
 }
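
With the hook now returning non-zero when APICv is inactive, the local APIC delivery path can fall back to the legacy IRR route. Roughly what the caller side is expected to look like (the matching lapic.c change belongs to a separate patch in this series; treat this as a sketch):

        if (kvm_x86_ops->deliver_posted_interrupt(vcpu, vector)) {
                /* Posted delivery unavailable: queue the vector in the virtual
                 * APIC's IRR and let the normal event-injection path handle it.
                 */
                kvm_lapic_set_irr(vector, apic);
                kvm_make_request(KVM_REQ_EVENT, vcpu);
                kvm_vcpu_kick(vcpu);
        }
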
 
 /*
@@ -4477,8 +4470,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-       return (!to_vmx(vcpu)->nested.nested_run_pending &&
-               vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+       if (to_vmx(vcpu)->nested.nested_run_pending)
+               return false;
+
+       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+               return true;
+
+       return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
@@ -6137,7 +6135,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
        if (pi_test_on(&vmx->pi_desc)) {
                pi_clear_on(&vmx->pi_desc);
                /*
-                * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+                * IOMMU can write to PID.ON, so the barrier matters even on UP.
                 * But on x86 this is just a compiler barrier anyway.
                 */
                smp_mb__after_atomic();
@@ -6167,7 +6165,10 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 
 static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
 {
-       return pi_test_on(vcpu_to_pi_desc(vcpu));
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+       return pi_test_on(pi_desc) ||
+               (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
 }
 
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -6653,6 +6654,10 @@ static struct kvm *vmx_vm_alloc(void)
        struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
                                            GFP_KERNEL_ACCOUNT | __GFP_ZERO,
                                            PAGE_KERNEL);
+
+       if (!kvm_vmx)
+               return NULL;
+
        return &kvm_vmx->kvm;
 }
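
This is the hunk matching the subject line. Without the check, a failed __vmalloc() makes vmx_vm_alloc() compute &kvm_vmx->kvm on a NULL pointer, which only happens to yield NULL because kvm is (currently) the first member of struct kvm_vmx. A small illustration of the hazard the explicit check removes (layout shown for illustration only):

struct kvm_vmx {
        struct kvm kvm;         /* first member, so &kvm_vmx->kvm == (void *)kvm_vmx */
        /* ... VMX-specific fields ... */
};

/* If kvm were not the first member, a failed allocation would return
 *      (char *)NULL + offsetof(struct kvm_vmx, kvm)
 * i.e. a small, non-NULL, bogus pointer.  Checking for NULL up front keeps
 * the error path independent of the structure layout.
 */
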
 
@@ -6776,8 +6781,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
        if (nested)
                nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
-                                          vmx_capability.ept,
-                                          kvm_vcpu_apicv_active(&vmx->vcpu));
+                                          vmx_capability.ept);
        else
                memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
 
@@ -6859,8 +6863,7 @@ static int __init vmx_check_processor_compat(void)
        if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
                return -EIO;
        if (nested)
-               nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
-                                          enable_apicv);
+               nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
        if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
                printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
                                smp_processor_id());
@@ -7106,6 +7109,40 @@ static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
        to_vmx(vcpu)->req_immediate_exit = true;
 }
 
+static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+                                 struct x86_instruction_info *info)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       unsigned short port;
+       bool intercept;
+       int size;
+
+       if (info->intercept == x86_intercept_in ||
+           info->intercept == x86_intercept_ins) {
+               port = info->src_val;
+               size = info->dst_bytes;
+       } else {
+               port = info->dst_val;
+               size = info->src_bytes;
+       }
+
+       /*
+        * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+        * VM-exits depend on the 'unconditional IO exiting' VM-execution
+        * control.
+        *
+        * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+        */
+       if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+               intercept = nested_cpu_has(vmcs12,
+                                          CPU_BASED_UNCOND_IO_EXITING);
+       else
+               intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
+
+       /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
+       return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+}
+
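
For the bitmap case, nested_vmx_check_io_bitmaps() (defined in the nested code, not shown in this diff) is expected to walk the vmcs12 I/O bitmaps. A sketch of that lookup based on the VMX spec — bitmap A covers ports 0x0000-0x7fff, bitmap B covers 0x8000-0xffff, one bit per port, and any set bit in the accessed range forces an exit; the helper name and exact shape here are illustrative:

static bool io_range_intercepted(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                                 unsigned short port, int size)
{
        unsigned int p = port;

        while (size-- > 0) {
                gpa_t bitmap;
                u8 byte;

                if (p >= 0x10000)       /* wrapping past port 0xffff always exits */
                        return true;
                bitmap = (p < 0x8000 ? vmcs12->io_bitmap_a : vmcs12->io_bitmap_b)
                         + (p & 0x7fff) / 8;
                if (kvm_vcpu_read_guest(vcpu, bitmap, &byte, 1))
                        return true;    /* unreadable bitmap: be conservative */
                if (byte & (1 << (p & 7)))
                        return true;
                p++;
        }
        return false;
}
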
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
                               enum x86_intercept_stage stage)
@@ -7113,19 +7150,45 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 
+       switch (info->intercept) {
        /*
         * RDPID causes #UD if disabled through secondary execution controls.
         * Because it is marked as EmulateOnUD, we need to intercept it here.
         */
-       if (info->intercept == x86_intercept_rdtscp &&
-           !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
-               ctxt->exception.vector = UD_VECTOR;
-               ctxt->exception.error_code_valid = false;
-               return X86EMUL_PROPAGATE_FAULT;
-       }
+       case x86_intercept_rdtscp:
+               if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+                       ctxt->exception.vector = UD_VECTOR;
+                       ctxt->exception.error_code_valid = false;
+                       return X86EMUL_PROPAGATE_FAULT;
+               }
+               break;
+
+       case x86_intercept_in:
+       case x86_intercept_ins:
+       case x86_intercept_out:
+       case x86_intercept_outs:
+               return vmx_check_intercept_io(vcpu, info);
+
+       case x86_intercept_lgdt:
+       case x86_intercept_lidt:
+       case x86_intercept_lldt:
+       case x86_intercept_ltr:
+       case x86_intercept_sgdt:
+       case x86_intercept_sidt:
+       case x86_intercept_sldt:
+       case x86_intercept_str:
+               if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+                       return X86EMUL_CONTINUE;
+
+               /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
+               break;
 
        /* TODO: check more intercepts... */
-       return X86EMUL_CONTINUE;
+       default:
+               break;
+       }
+
+       return X86EMUL_UNHANDLEABLE;
 }
 
 #ifdef CONFIG_X86_64
@@ -7710,7 +7773,7 @@ static __init int hardware_setup(void)
 
        if (nested) {
                nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
-                                          vmx_capability.ept, enable_apicv);
+                                          vmx_capability.ept);
 
                r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
                if (r)
@@ -7844,6 +7907,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .xsaves_supported = vmx_xsaves_supported,
        .umip_emulated = vmx_umip_emulated,
        .pt_supported = vmx_pt_supported,
+       .pku_supported = vmx_pku_supported,
 
        .request_immediate_exit = vmx_request_immediate_exit,