From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 19 Aug 2018 17:38:36 +0000 (-0700)
Subject: Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
X-Git-Tag: v4.19~357
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e61cf2e3a5b452cfefcb145021f5a8ea88735cc1;p=platform%2Fkernel%2Flinux-rpi.git

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull first set of KVM updates from Paolo Bonzini:
 "PPC:
   - minor code cleanups

  x86:
   - PCID emulation and CR3 caching for shadow page tables
   - nested VMX live migration
   - nested VMCS shadowing
   - optimized IPI hypercall
   - some optimizations

  ARM will come next week"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (85 commits)
  kvm: x86: Set highest physical address bits in non-present/reserved SPTEs
  KVM/x86: Use CC_SET()/CC_OUT in arch/x86/kvm/vmx.c
  KVM: X86: Implement PV IPIs in linux guest
  KVM: X86: Add kvm hypervisor init time platform setup callback
  KVM: X86: Implement "send IPI" hypercall
  KVM/x86: Move X86_CR4_OSXSAVE check into kvm_valid_sregs()
  KVM: x86: Skip pae_root shadow allocation if tdp enabled
  KVM/MMU: Combine flushing remote tlb in mmu_set_spte()
  KVM: vmx: skip VMWRITE of HOST_{FS,GS}_BASE when possible
  KVM: vmx: skip VMWRITE of HOST_{FS,GS}_SEL when possible
  KVM: vmx: always initialize HOST_{FS,GS}_BASE to zero during setup
  KVM: vmx: move struct host_state usage to struct loaded_vmcs
  KVM: vmx: compute need to reload FS/GS/LDT on demand
  KVM: nVMX: remove a misleading comment regarding vmcs02 fields
  KVM: vmx: rename __vmx_load_host_state() and vmx_save_host_state()
  KVM: vmx: add dedicated utility to access guest's kernel_gs_base
  KVM: vmx: track host_state.loaded using a loaded_vmcs pointer
  KVM: vmx: refactor segmentation code in vmx_save_host_state()
  kvm: nVMX: Fix fault priority for VMX operations
  kvm: nVMX: Fix fault vector for VMX operation at CPL > 0
  ...
---

e61cf2e3a5b452cfefcb145021f5a8ea88735cc1
diff --cc arch/x86/include/asm/kvm_host.h
index acebb80,c18958e..00ddb0c
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@@ -1418,7 -1457,10 +1462,11 @@@ int kvm_cpu_get_interrupt(struct kvm_vc
  void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
  void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
  
+ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+     		    unsigned long ipi_bitmap_high, int min,
+ 		    unsigned long icr, int op_64_bit);
+ 
 +u64 kvm_get_arch_capabilities(void);
  void kvm_define_shared_msr(unsigned index, u32 msr);
  int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
  
diff --cc arch/x86/include/asm/trace/hyperv.h
index 9c0d4b5,e1ffe61..2e6245a
--- a/arch/x86/include/asm/trace/hyperv.h
+++ b/arch/x86/include/asm/trace/hyperv.h
@@@ -28,21 -28,20 +28,35 @@@ TRACE_EVENT(hyperv_mmu_flush_tlb_others
  		      __entry->addr, __entry->end)
  	);
  
+ TRACE_EVENT(hyperv_nested_flush_guest_mapping,
+ 	    TP_PROTO(u64 as, int ret),
+ 	    TP_ARGS(as, ret),
+ 
+ 	    TP_STRUCT__entry(
+ 		    __field(u64, as)
+ 		    __field(int, ret)
+ 		    ),
+ 	    TP_fast_assign(__entry->as = as;
+ 			   __entry->ret = ret;
+ 		    ),
+ 	    TP_printk("address space %llx ret %d", __entry->as, __entry->ret)
+ 	);
+ 
 +TRACE_EVENT(hyperv_send_ipi_mask,
 +	    TP_PROTO(const struct cpumask *cpus,
 +		     int vector),
 +	    TP_ARGS(cpus, vector),
 +	    TP_STRUCT__entry(
 +		    __field(unsigned int, ncpus)
 +		    __field(int, vector)
 +		    ),
 +	    TP_fast_assign(__entry->ncpus = cpumask_weight(cpus);
 +			   __entry->vector = vector;
 +		    ),
 +	    TP_printk("ncpus %d vector %x",
 +		      __entry->ncpus, __entry->vector)
 +	);
 +
  #endif /* CONFIG_HYPERV */
  
  #undef TRACE_INCLUDE_PATH
diff --cc arch/x86/kernel/kvm.c
index 09aaabb,62cbd08..0f471bd
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@@ -611,6 -716,19 +703,20 @@@ static uint32_t __init kvm_detect(void
  	return kvm_cpuid_base();
  }
  
+ static void __init kvm_apic_init(void)
+ {
+ #if defined(CONFIG_SMP)
+ 	if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI))
+ 		kvm_setup_pv_ipi();
+ #endif
+ }
+ 
+ static void __init kvm_init_platform(void)
+ {
++	kvmclock_init();
+ 	x86_platform.apic_post_init = kvm_apic_init;
+ }
+ 
  const __initconst struct hypervisor_x86 x86_hyper_kvm = {
  	.name			= "KVM",
  	.detect			= kvm_detect,
diff --cc arch/x86/kvm/vmx.c
index 46b428c0,16f9373..1519f03
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@@ -188,150 -189,12 +189,156 @@@ module_param(ple_window_max, uint, 0444
  
  extern const ulong vmx_return;
  
 +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
 +static DEFINE_MUTEX(vmx_l1d_flush_mutex);
 +
 +/* Storage for pre module init parameter parsing */
 +static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
 +
 +static const struct {
 +	const char *option;
 +	enum vmx_l1d_flush_state cmd;
 +} vmentry_l1d_param[] = {
 +	{"auto",	VMENTER_L1D_FLUSH_AUTO},
 +	{"never",	VMENTER_L1D_FLUSH_NEVER},
 +	{"cond",	VMENTER_L1D_FLUSH_COND},
 +	{"always",	VMENTER_L1D_FLUSH_ALWAYS},
 +};
 +
 +#define L1D_CACHE_ORDER 4
 +static void *vmx_l1d_flush_pages;
 +
 +static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 +{
 +	struct page *page;
 +	unsigned int i;
 +
 +	if (!enable_ept) {
 +		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
 +		return 0;
 +	}
 +
 +       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
 +	       u64 msr;
 +
 +	       rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
 +	       if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
 +		       l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
 +		       return 0;
 +	       }
 +       }
 +
 +	/* If set to auto use the default l1tf mitigation method */
 +	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
 +		switch (l1tf_mitigation) {
 +		case L1TF_MITIGATION_OFF:
 +			l1tf = VMENTER_L1D_FLUSH_NEVER;
 +			break;
 +		case L1TF_MITIGATION_FLUSH_NOWARN:
 +		case L1TF_MITIGATION_FLUSH:
 +		case L1TF_MITIGATION_FLUSH_NOSMT:
 +			l1tf = VMENTER_L1D_FLUSH_COND;
 +			break;
 +		case L1TF_MITIGATION_FULL:
 +		case L1TF_MITIGATION_FULL_FORCE:
 +			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
 +			break;
 +		}
 +	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
 +		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
 +	}
 +
 +	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
 +	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
 +		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
 +		if (!page)
 +			return -ENOMEM;
 +		vmx_l1d_flush_pages = page_address(page);
 +
 +		/*
 +		 * Initialize each page with a different pattern in
 +		 * order to protect against KSM in the nested
 +		 * virtualization case.
 +		 */
 +		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
 +			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
 +			       PAGE_SIZE);
 +		}
 +	}
 +
 +	l1tf_vmx_mitigation = l1tf;
 +
 +	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
 +		static_branch_enable(&vmx_l1d_should_flush);
 +	else
 +		static_branch_disable(&vmx_l1d_should_flush);
 +
 +	if (l1tf == VMENTER_L1D_FLUSH_COND)
 +		static_branch_enable(&vmx_l1d_flush_cond);
 +	else
 +		static_branch_disable(&vmx_l1d_flush_cond);
 +	return 0;
 +}
 +
 +static int vmentry_l1d_flush_parse(const char *s)
 +{
 +	unsigned int i;
 +
 +	if (s) {
 +		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
 +			if (sysfs_streq(s, vmentry_l1d_param[i].option))
 +				return vmentry_l1d_param[i].cmd;
 +		}
 +	}
 +	return -EINVAL;
 +}
 +
 +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
 +{
 +	int l1tf, ret;
 +
 +	if (!boot_cpu_has(X86_BUG_L1TF))
 +		return 0;
 +
 +	l1tf = vmentry_l1d_flush_parse(s);
 +	if (l1tf < 0)
 +		return l1tf;
 +
 +	/*
 +	 * Has vmx_init() run already? If not then this is the pre init
 +	 * parameter parsing. In that case just store the value and let
 +	 * vmx_init() do the proper setup after enable_ept has been
 +	 * established.
 +	 */
 +	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
 +		vmentry_l1d_flush_param = l1tf;
 +		return 0;
 +	}
 +
 +	mutex_lock(&vmx_l1d_flush_mutex);
 +	ret = vmx_setup_l1d_flush(l1tf);
 +	mutex_unlock(&vmx_l1d_flush_mutex);
 +	return ret;
 +}
 +
 +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
 +{
 +	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
 +}
 +
 +static const struct kernel_param_ops vmentry_l1d_flush_ops = {
 +	.set = vmentry_l1d_flush_set,
 +	.get = vmentry_l1d_flush_get,
 +};
 +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 +
+ enum ept_pointers_status {
+ 	EPT_POINTERS_CHECK = 0,
+ 	EPT_POINTERS_MATCH = 1,
+ 	EPT_POINTERS_MISMATCH = 2
+ };
+ 
  struct kvm_vmx {
  	struct kvm kvm;
  
@@@ -937,21 -828,14 +977,13 @@@ struct vcpu_vmx 
  	 */
  	struct loaded_vmcs    vmcs01;
  	struct loaded_vmcs   *loaded_vmcs;
+ 	struct loaded_vmcs   *loaded_cpu_state;
  	bool                  __launched; /* temporary, used in vmx_vcpu_run */
  	struct msr_autoload {
 -		unsigned nr;
 -		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
 -		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
 +		struct vmx_msrs guest;
 +		struct vmx_msrs host;
  	} msr_autoload;
- 	struct {
- 		int           loaded;
- 		u16           fs_sel, gs_sel, ldt_sel;
- #ifdef CONFIG_X86_64
- 		u16           ds_sel, es_sel;
- #endif
- 		int           gs_ldt_reload_needed;
- 		int           fs_reload_needed;
- 		u64           msr_host_bndcfgs;
- 	} host_state;
+ 
  	struct {
  		int vm86_active;
  		ulong save_rflags;
@@@ -10647,37 -10779,12 +11021,39 @@@ free_vcpu
  	return ERR_PTR(err);
  }
  
 +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
 +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
 +
  static int vmx_vm_init(struct kvm *kvm)
  {
+ 	spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ 
  	if (!ple_gap)
  		kvm->arch.pause_in_guest = true;
 +
 +	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
 +		switch (l1tf_mitigation) {
 +		case L1TF_MITIGATION_OFF:
 +		case L1TF_MITIGATION_FLUSH_NOWARN:
 +			/* 'I explicitly don't care' is set */
 +			break;
 +		case L1TF_MITIGATION_FLUSH:
 +		case L1TF_MITIGATION_FLUSH_NOSMT:
 +		case L1TF_MITIGATION_FULL:
 +			/*
 +			 * Warn upon starting the first VM in a potentially
 +			 * insecure environment.
 +			 */
 +			if (cpu_smt_control == CPU_SMT_ENABLED)
 +				pr_warn_once(L1TF_MSG_SMT);
 +			if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
 +				pr_warn_once(L1TF_MSG_L1D);
 +			break;
 +		case L1TF_MITIGATION_FULL_FORCE:
 +			/* Flush is enforced */
 +			break;
 +		}
 +	}
  	return 0;
  }
  
@@@ -12164,16 -12375,26 +12644,29 @@@ static int nested_vmx_run(struct kvm_vc
  	 */
  
  	vmx->nested.nested_run_pending = 1;
- 	ret = enter_vmx_non_root_mode(vcpu);
+ 	ret = enter_vmx_non_root_mode(vcpu, &exit_qual);
  	if (ret) {
+ 		nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual);
  		vmx->nested.nested_run_pending = 0;
- 		return ret;
+ 		return 1;
  	}
  
 +	/* Hide L1D cache contents from the nested guest.  */
 +	vmx->vcpu.arch.l1tf_flush_l1d = true;
 +
  	/*
+ 	 * Must happen outside of enter_vmx_non_root_mode() as it will
+ 	 * also be used as part of restoring nVMX state for
+ 	 * snapshot restore (migration).
+ 	 *
+ 	 * In this flow, it is assumed that vmcs12 cache was
+ 	 * transferred as part of captured nVMX state and should
+ 	 * therefore not be read from guest memory (which may not
+ 	 * exist on destination host yet).
+ 	 */
+ 	nested_cache_shadow_vmcs12(vcpu, vmcs12);
+ 
+ 	/*
  	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
  	 * by event injection, halt vcpu.
  	 */