KVM: x86: Gracefully handle __vmalloc() failure during VM allocation

[platform/kernel/linux-rpi.git] / arch / x86 / kvm / vmx / vmx.c
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 8f01019..e0d16ba 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -95,7 +95,7 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
  static bool __read_mostly fasteoi = 1;
  module_param(fasteoi, bool, S_IRUGO);
  
-static bool __read_mostly enable_apicv = 1;
+bool __read_mostly enable_apicv = 1;
  module_param(enable_apicv, bool, S_IRUGO);
  
  /*
@@ -648,43 +648,15 @@ void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
  }
  
  #ifdef CONFIG_KEXEC_CORE
-/*
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
- */
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-
-static inline void crash_enable_local_vmclear(int cpu)
-{
-       cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline void crash_disable_local_vmclear(int cpu)
-{
-       cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline int crash_local_vmclear_enabled(int cpu)
-{
-       return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
  static void crash_vmclear_local_loaded_vmcss(void)
  {
         int cpu = raw_smp_processor_id();
         struct loaded_vmcs *v;
  
-       if (!crash_local_vmclear_enabled(cpu))
-               return;
-
         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
                             loaded_vmcss_on_cpu_link)
                 vmcs_clear(v->vmcs);
  }
-#else
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
  #endif /* CONFIG_KEXEC_CORE */
  
  static void __loaded_vmcs_clear(void *arg)
@@ -696,19 +668,24 @@ static void __loaded_vmcs_clear(void *arg)
                 return; /* vcpu migration can race with cpu offline */
         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
                 per_cpu(current_vmcs, cpu) = NULL;
-       crash_disable_local_vmclear(cpu);
+
+       vmcs_clear(loaded_vmcs->vmcs);
+       if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+               vmcs_clear(loaded_vmcs->shadow_vmcs);
+
         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
  
         /*
-        * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
-        * is before setting loaded_vmcs->vcpu to -1 which is done in
-        * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
-        * then adds the vmcs into percpu list before it is deleted.
+        * Ensure all writes to loaded_vmcs, including deleting it from its
+        * current percpu list, complete before setting loaded_vmcs->vcpu to
+        * -1, otherwise a different cpu can see vcpu == -1 first and add
+        * loaded_vmcs to its percpu list before it's deleted from this cpu's
+        * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
          */
         smp_wmb();
  
-       loaded_vmcs_init(loaded_vmcs);
-       crash_enable_local_vmclear(cpu);
+       loaded_vmcs->cpu = -1;
+       loaded_vmcs->launched = 0;
  }
  
  void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -969,17 +946,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
         u64 guest_efer = vmx->vcpu.arch.efer;
         u64 ignore_bits = 0;
  
-       if (!enable_ept) {
-               /*
-                * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
-                * host CPUID is more efficient than testing guest CPUID
-                * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
-                */
-               if (boot_cpu_has(X86_FEATURE_SMEP))
-                       guest_efer |= EFER_NX;
-               else if (!(guest_efer & EFER_NX))
-                       ignore_bits |= EFER_NX;
-       }
+       /* Shadow paging assumes NX to be available.  */
+       if (!enable_ept)
+               guest_efer |= EFER_NX;
  
         /*
          * LMA and LME handled by hardware; SCE meaningless outside long mode.
@@ -1276,6 +1245,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
         if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
                 return;
  
+       /*
+        * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+        * PID.NDST: pi_post_block is the one expected to change PID.NDST and
+        * the wakeup handler expects the vCPU to be on the blocked_vcpu_list
+        * that matches PID.NDST. Otherwise, a vCPU may not be able to be woken
+        * up correctly.
+        */
+       if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
+               pi_clear_sn(pi_desc);
+               goto after_clear_sn;
+       }
+
         /* The full case.  */
         do {
                 old.control = new.control = pi_desc->control;
@@ -1291,6 +1272,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
         } while (cmpxchg64(&pi_desc->control, old.control,
                            new.control) != old.control);
  
+after_clear_sn:
+
         /*
          * Clear SN before reading the bitmap.  The VT-d firmware
          * writes the bitmap and reads SN atomically (5.2.3 in the
@@ -1299,7 +1282,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
          */
         smp_mb__after_atomic();
  
-       if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
+       if (!pi_is_pir_empty(pi_desc))
                 pi_set_on(pi_desc);
  }
  
@@ -1311,18 +1294,17 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
         if (!already_loaded) {
                 loaded_vmcs_clear(vmx->loaded_vmcs);
                 local_irq_disable();
-               crash_disable_local_vmclear(cpu);
  
                 /*
-                * Read loaded_vmcs->cpu should be before fetching
-                * loaded_vmcs->loaded_vmcss_on_cpu_link.
-                * See the comments in __loaded_vmcs_clear().
+                * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+                * this cpu's percpu list, otherwise it may not yet be deleted
+                * from its previous cpu's percpu list.  Pairs with the
+                * smp_wmb() in __loaded_vmcs_clear().
                  */
                 smp_rmb();
  
                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
                          &per_cpu(loaded_vmcss_on_cpu, cpu));
-               crash_enable_local_vmclear(cpu);
                 local_irq_enable();
         }
  
@@ -2134,6 +2116,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                         (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
                                         PT_CAP_num_address_ranges)))
                         return 1;
+               if (is_noncanonical_address(data, vcpu))
+                       return 1;
                 if (index % 2)
                         vmx->pt_desc.guest.addr_b[index / 2] = data;
                 else
@@ -2248,17 +2232,6 @@ static int hardware_enable(void)
         INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
         spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
  
-       /*
-        * Now we can enable the vmclear operation in kdump
-        * since the loaded_vmcss_on_cpu list on this cpu
-        * has been initialized.
-        *
-        * Though the cpu is not in VMX operation now, there
-        * is no problem to enable the vmclear operation
-        * for the loaded_vmcss_on_cpu list is empty!
-        */
-       crash_enable_local_vmclear(cpu);
-
         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
  
         test_bits = FEATURE_CONTROL_LOCKED;
@@ -2967,6 +2940,9 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
  
  static int get_ept_level(struct kvm_vcpu *vcpu)
  {
+       /* Nested EPT currently only supports 4-level walks. */
+       if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
+               return 4;
         if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
                 return 5;
         return 4;
@@ -2989,6 +2965,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
  void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
  {
         struct kvm *kvm = vcpu->kvm;
+       bool update_guest_cr3 = true;
         unsigned long guest_cr3;
         u64 eptp;
  
@@ -3005,15 +2982,18 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                         spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
                 }
  
-               if (enable_unrestricted_guest || is_paging(vcpu) ||
-                   is_guest_mode(vcpu))
+               /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
+               if (is_guest_mode(vcpu))
+                       update_guest_cr3 = false;
+               else if (enable_unrestricted_guest || is_paging(vcpu))
                         guest_cr3 = kvm_read_cr3(vcpu);
                 else
                         guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
                 ept_load_pdptrs(vcpu);
         }
  
-       vmcs_writel(GUEST_CR3, guest_cr3);
+       if (update_guest_cr3)
+               vmcs_writel(GUEST_CR3, guest_cr3);
  }
  
  int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -3838,24 +3818,29 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
   * 2. If target vcpu isn't running(root mode), kick it to pick up the
   * interrupt from PIR in next vmentry.
   */
-static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
  {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
         int r;
  
         r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
         if (!r)
-               return;
+               return 0;
+
+       if (!vcpu->arch.apicv_active)
+               return -1;
  
         if (pi_test_and_set_pir(vector, &vmx->pi_desc))
-               return;
+               return 0;
  
         /* If a previous notification has sent the IPI, nothing to do.  */
         if (pi_test_and_set_on(&vmx->pi_desc))
-               return;
+               return 0;
  
         if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
                 kvm_vcpu_kick(vcpu);
+
+       return 0;
  }
  
  /*
@@ -4485,8 +4470,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
  
  static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
  {
-       return (!to_vmx(vcpu)->nested.nested_run_pending &&
-               vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+       if (to_vmx(vcpu)->nested.nested_run_pending)
+               return false;
+
+       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+               return true;
+
+       return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                         (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
  }
@@ -6145,7 +6135,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
         if (pi_test_on(&vmx->pi_desc)) {
                 pi_clear_on(&vmx->pi_desc);
                 /*
-                * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+                * IOMMU can write to PID.ON, so the barrier matters even on UP.
                  * But on x86 this is just a compiler barrier anyway.
                  */
                 smp_mb__after_atomic();
@@ -6175,7 +6165,10 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
  
  static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
  {
-       return pi_test_on(vcpu_to_pi_desc(vcpu));
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+       return pi_test_on(pi_desc) ||
+               (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
  }
  
  static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -6661,6 +6654,10 @@ static struct kvm *vmx_vm_alloc(void)
         struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
                                             GFP_KERNEL_ACCOUNT | __GFP_ZERO,
                                             PAGE_KERNEL);
+
+       if (!kvm_vmx)
+               return NULL;
+
         return &kvm_vmx->kvm;
  }
  
@@ -6784,8 +6781,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
  
         if (nested)
                 nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
-                                          vmx_capability.ept,
-                                          kvm_vcpu_apicv_active(&vmx->vcpu));
+                                          vmx_capability.ept);
         else
                 memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
  
@@ -6867,8 +6863,7 @@ static int __init vmx_check_processor_compat(void)
         if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
                 return -EIO;
         if (nested)
-               nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
-                                          enable_apicv);
+               nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
         if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
                 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
                                 smp_processor_id());
@@ -7114,6 +7109,40 @@ static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
         to_vmx(vcpu)->req_immediate_exit = true;
  }
  
+static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+                                 struct x86_instruction_info *info)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       unsigned short port;
+       bool intercept;
+       int size;
+
+       if (info->intercept == x86_intercept_in ||
+           info->intercept == x86_intercept_ins) {
+               port = info->src_val;
+               size = info->dst_bytes;
+       } else {
+               port = info->dst_val;
+               size = info->src_bytes;
+       }
+
+       /*
+        * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+        * VM-exits depend on the 'unconditional IO exiting' VM-execution
+        * control.
+        *
+        * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+        */
+       if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+               intercept = nested_cpu_has(vmcs12,
+                                          CPU_BASED_UNCOND_IO_EXITING);
+       else
+               intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
+
+       /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
+       return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+}
+
  static int vmx_check_intercept(struct kvm_vcpu *vcpu,
                                struct x86_instruction_info *info,
                                enum x86_intercept_stage stage)
@@ -7121,19 +7150,45 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
         struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
  
+       switch (info->intercept) {
         /*
          * RDPID causes #UD if disabled through secondary execution controls.
          * Because it is marked as EmulateOnUD, we need to intercept it here.
          */
-       if (info->intercept == x86_intercept_rdtscp &&
-           !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
-               ctxt->exception.vector = UD_VECTOR;
-               ctxt->exception.error_code_valid = false;
-               return X86EMUL_PROPAGATE_FAULT;
-       }
+       case x86_intercept_rdtscp:
+               if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+                       ctxt->exception.vector = UD_VECTOR;
+                       ctxt->exception.error_code_valid = false;
+                       return X86EMUL_PROPAGATE_FAULT;
+               }
+               break;
+
+       case x86_intercept_in:
+       case x86_intercept_ins:
+       case x86_intercept_out:
+       case x86_intercept_outs:
+               return vmx_check_intercept_io(vcpu, info);
+
+       case x86_intercept_lgdt:
+       case x86_intercept_lidt:
+       case x86_intercept_lldt:
+       case x86_intercept_ltr:
+       case x86_intercept_sgdt:
+       case x86_intercept_sidt:
+       case x86_intercept_sldt:
+       case x86_intercept_str:
+               if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+                       return X86EMUL_CONTINUE;
+
+               /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
+               break;
  
         /* TODO: check more intercepts... */
-       return X86EMUL_CONTINUE;
+       default:
+               break;
+       }
+
+       return X86EMUL_UNHANDLEABLE;
  }
  
  #ifdef CONFIG_X86_64
@@ -7718,7 +7773,7 @@ static __init int hardware_setup(void)
  
         if (nested) {
                 nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
-                                          vmx_capability.ept, enable_apicv);
+                                          vmx_capability.ept);
  
                 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
                 if (r)
@@ -7852,6 +7907,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
         .xsaves_supported = vmx_xsaves_supported,
         .umip_emulated = vmx_umip_emulated,
         .pt_supported = vmx_pt_supported,
+       .pku_supported = vmx_pku_supported,
  
         .request_immediate_exit = vmx_request_immediate_exit,