Merge branch 'kvm-fixes' into 'next'
author    Paolo Bonzini <pbonzini@redhat.com>
          Wed, 21 Oct 2020 22:05:58 +0000 (18:05 -0400)
committer Paolo Bonzini <pbonzini@redhat.com>
          Wed, 21 Oct 2020 22:05:58 +0000 (18:05 -0400)
Pick up bugfixes from 5.9; without them, various tests fail.

arch/arm64/kvm/hyp/nvhe/tlb.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/x86.c
tools/testing/selftests/kvm/x86_64/debug_regs.c

diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 544bca3..39ca71a 100644
@@ -31,7 +31,14 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
                isb();
        }
 
+       /*
+        * __load_guest_stage2() includes an ISB only when the AT
+        * workaround is applied. Take care of the opposite condition,
+        * ensuring that we always have an ISB, but not two ISBs back
+        * to back.
+        */
        __load_guest_stage2(mmu);
+       asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
 }
 
 static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
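
The comment added above is about ensuring that exactly one ISB runs on either configuration. A minimal sketch of the equivalent control flow, assuming cpus_have_final_cap() as the capability check (illustrative only, not the kernel implementation):

/* Illustrative: one ISB on each path, never two back to back. */
static inline void load_guest_stage2_one_isb(struct kvm_s2_mmu *mmu)
{
        if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
                __load_guest_stage2(mmu);       /* ends with an ISB for the workaround */
                /* the ALTERNATIVE above is patched into a NOP */
        } else {
                __load_guest_stage2(mmu);       /* contains no ISB */
                isb();                          /* the ALTERNATIVE's ISB runs here */
        }
}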
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index dc4fe57..cf951e5 100644
@@ -2274,6 +2274,12 @@ static int iret_interception(struct vcpu_svm *svm)
        return 1;
 }
 
+static int invd_interception(struct vcpu_svm *svm)
+{
+       /* Treat an INVD instruction as a NOP and just skip it. */
+       return kvm_skip_emulated_instruction(&svm->vcpu);
+}
+
 static int invlpg_interception(struct vcpu_svm *svm)
 {
        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
@@ -2891,7 +2897,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_RDPMC]                        = rdpmc_interception,
        [SVM_EXIT_CPUID]                        = cpuid_interception,
        [SVM_EXIT_IRET]                         = iret_interception,
-       [SVM_EXIT_INVD]                         = emulate_on_interception,
+       [SVM_EXIT_INVD]                         = invd_interception,
        [SVM_EXIT_PAUSE]                        = pause_interception,
        [SVM_EXIT_HLT]                          = halt_interception,
        [SVM_EXIT_INVLPG]                       = invlpg_interception,
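
For context, a hedged sketch of how the table above is consumed: the exit code indexes straight into svm_exit_handlers, so INVD now reaches the new skip-only handler instead of going through the instruction emulator (simplified; the real dispatch in svm.c performs additional checks):

static int dispatch_exit_sketch(struct vcpu_svm *svm)
{
        u32 exit_code = svm->vmcb->control.exit_code;

        /* Reject exit codes with no registered handler. */
        if (exit_code >= ARRAY_SIZE(svm_exit_handlers) ||
            !svm_exit_handlers[exit_code])
                return 0;

        return svm_exit_handlers[exit_code](svm);       /* e.g. invd_interception() */
}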
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 544a35b..7558967 100644
@@ -128,6 +128,9 @@ static bool __read_mostly enable_preemption_timer = 1;
 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 #endif
 
+extern bool __read_mostly allow_smaller_maxphyaddr;
+module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
+
 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
 #define KVM_VM_CR0_ALWAYS_ON                           \
@@ -834,6 +837,18 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
         */
        if (is_guest_mode(vcpu))
                eb |= get_vmcs12(vcpu)->exception_bitmap;
+        else {
+               /*
+                * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
+                * between guest and host.  In that case we only care about present
+                * faults.  For vmcs02, however, PFEC_MASK and PFEC_MATCH are set in
+                * prepare_vmcs02_rare.
+                */
+               bool selective_pf_trap = enable_ept && (eb & (1u << PF_VECTOR));
+               int mask = selective_pf_trap ? PFERR_PRESENT_MASK : 0;
+               vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
+               vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, mask);
+       }
 
        vmcs_write32(EXCEPTION_BITMAP, eb);
 }
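
A hedged sketch of the hardware behaviour the mask/match pair relies on, per the SDM description of the page-fault error-code mask and match fields: with mask = match = PFERR_PRESENT_MASK only present faults cause a VM exit, while with both fields zero every error code matches and the #PF bit in the exception bitmap alone decides.

/* Illustrative: does a guest #PF with this error code cause a VM exit? */
static bool pf_causes_vmexit(u32 error_code, u32 pfec_mask, u32 pfec_match,
                             bool pf_bit_in_exception_bitmap)
{
        bool match = (error_code & pfec_mask) == pfec_match;

        /* #PF bit set: exit on match.  #PF bit clear: exit on mismatch. */
        return pf_bit_in_exception_bitmap ? match : !match;
}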
@@ -4363,16 +4378,6 @@ static void init_vmcs(struct vcpu_vmx *vmx)
                vmx->pt_desc.guest.output_mask = 0x7F;
                vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
        }
-
-       /*
-        * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
-        * between guest and host.  In that case we only care about present
-        * faults.
-        */
-       if (enable_ept) {
-               vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK);
-               vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK);
-       }
 }
 
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -4814,6 +4819,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
                         * EPT will cause page fault only if we need to
                         * detect illegal GPAs.
                         */
+                       WARN_ON_ONCE(!allow_smaller_maxphyaddr);
                        kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
                        return 1;
                } else
@@ -5343,7 +5349,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
         * would also use advanced VM-exit information for EPT violations to
         * reconstruct the page fault error code.
         */
-       if (unlikely(kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
+       if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
                return kvm_emulate_instruction(vcpu, 0);
 
        return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
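
Both hunks above gate emulation on allow_smaller_maxphyaddr before checking for an illegal GPA. The check itself boils down to comparing the faulting address against the guest's CPUID-reported MAXPHYADDR; a hedged sketch (compare kvm_vcpu_is_illegal_gpa() in the KVM headers, whose exact form may differ):

/* Illustrative: a GPA is illegal if it uses bits at or above guest MAXPHYADDR. */
static inline bool gpa_is_illegal_sketch(u64 gpa, int guest_maxphyaddr)
{
        return (gpa >> guest_maxphyaddr) != 0;
}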
@@ -8008,11 +8014,12 @@ static int __init vmx_init(void)
        vmx_check_vmcs12_offsets();
 
        /*
-        * Intel processors don't have problems with
-        * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable
-        * it for VMX by default
+        * Shadow paging doesn't have a (further) performance penalty
+        * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
+        * by default
         */
-       allow_smaller_maxphyaddr = true;
+       if (!enable_ept)
+               allow_smaller_maxphyaddr = true;
 
        return 0;
 }
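
Together with the module_param() added earlier in this file, the default now depends on EPT, and it can presumably be overridden at kvm_intel load time (and read back through sysfs, given S_IRUGO). A hedged restatement of the resulting default:

/* Illustrative: the default chosen by vmx_init() after this merge. */
static bool default_allow_smaller_maxphyaddr(bool ept_enabled)
{
        /* Shadow paging intercepts #PF anyway, so the feature adds no extra cost. */
        return !ept_enabled;
}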
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 64ff2c0..f6f66e5 100644
@@ -470,7 +470,10 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
 
 static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
 {
-       return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
+       if (!enable_ept)
+               return true;
+
+       return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
 }
 
 static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
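
A self-contained user-space restatement of the new vmx_need_pf_intercept() policy, handy for sanity-checking the cases; the function and the example MAXPHYADDR values below are illustrative, not KVM code:

#include <stdbool.h>
#include <stdio.h>

static bool need_pf_intercept(bool enable_ept, bool allow_smaller_maxphyaddr,
                              int guest_maxphyaddr, int host_maxphyaddr)
{
        if (!enable_ept)
                return true;    /* shadow paging always intercepts #PF */
        return allow_smaller_maxphyaddr && guest_maxphyaddr < host_maxphyaddr;
}

int main(void)
{
        /* EPT on, parameter off: no interception even with a smaller guest MAXPHYADDR. */
        printf("%d\n", need_pf_intercept(true, false, 46, 52));        /* prints 0 */
        /* EPT on, parameter on: intercept #PF to emulate the smaller address space. */
        printf("%d\n", need_pf_intercept(true, true, 46, 52));         /* prints 1 */
        return 0;
}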
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 51f75c2..397f599 100644
@@ -194,7 +194,7 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs;
 u64 __read_mostly host_efer;
 EXPORT_SYMBOL_GPL(host_efer);
 
-bool __read_mostly allow_smaller_maxphyaddr;
+bool __read_mostly allow_smaller_maxphyaddr = 0;
 EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
 
 static u64 __read_mostly host_xss;
@@ -982,6 +982,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        unsigned long old_cr4 = kvm_read_cr4(vcpu);
        unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
                                   X86_CR4_SMEP;
+       unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE;
 
        if (kvm_valid_cr4(vcpu, cr4))
                return 1;
@@ -1009,7 +1010,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        if (kvm_x86_ops.set_cr4(vcpu, cr4))
                return 1;
 
-       if (((cr4 ^ old_cr4) & pdptr_bits) ||
+       if (((cr4 ^ old_cr4) & mmu_role_bits) ||
            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
                kvm_mmu_reset_context(vcpu);
 
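
The new mmu_role_bits set reflects that SMAP and PKE, like the paging-related bits, are folded into the cached MMU role, so toggling them has to rebuild the MMU context. A hedged sketch of the idea; field names are approximate, compare kvm_mmu_extended_role in the 5.9 sources:

/* Illustrative: CR4 bits baked into the cached MMU role. */
union mmu_role_cr4_bits_sketch {
        unsigned int word;
        struct {
                unsigned int cr4_pse  : 1;
                unsigned int cr4_pae  : 1;
                unsigned int cr4_smep : 1;
                unsigned int cr4_smap : 1;      /* now covered by mmu_role_bits */
                unsigned int cr4_pke  : 1;      /* now covered by mmu_role_bits */
        };
};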
@@ -3400,9 +3401,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 * even when not intercepted. AMD manual doesn't explicitly
                 * state this but appears to behave the same.
                 *
-                * Unconditionally return L1's TSC offset on userspace reads
-                * so that userspace reads and writes always operate on L1's
-                * offset, e.g. to ensure deterministic behavior for migration.
+                * On userspace reads and writes, however, we unconditionally
+                * return L1's TSC value to ensure backwards-compatible
+                * behavior for migration.
                 */
                u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
                                                            vcpu->arch.tsc_offset;
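
A hedged restatement of the selection the reworded comment describes, mirroring the ternary above it; the guest-visible MSR value then adds this offset to the scaled host TSC:

/* Illustrative: which offset a read of MSR_IA32_TSC uses. */
static u64 tsc_offset_for_read(bool host_initiated,
                               u64 l1_tsc_offset, u64 current_tsc_offset)
{
        if (host_initiated)                     /* userspace read, e.g. for migration */
                return l1_tsc_offset;           /* always L1's view */
        return current_tsc_offset;              /* L2's offset while L2 runs */
}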
diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c
index b8d14f9..2fc6b3a 100644
@@ -73,7 +73,7 @@ int main(void)
        int i;
        /* Instruction lengths starting at ss_start */
        int ss_size[4] = {
-               3,              /* xor */
+               2,              /* xor */
                2,              /* cpuid */
                5,              /* mov */
                2,              /* rdmsr */
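
For reference, the x86-64 encodings behind this one-byte correction (instruction lengths as assembled; the remaining table entries are unchanged):

/*
 *   31 c0             xor  %eax,%eax    -> 2 bytes (the corrected entry)
 *   48 31 c0          xor  %rax,%rax    -> 3 bytes (REX.W form, likely the source of the old value)
 *   0f a2             cpuid             -> 2 bytes
 *   b9 xx xx xx xx    mov  $imm32,%ecx  -> 5 bytes
 *   0f 32             rdmsr             -> 2 bytes
 */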