arch/x86/kvm/vmx/vmx.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine driver for Linux
   4  *
   5  * This module enables machines with Intel VT-x extensions to run virtual
   6  * machines without emulation or binary translation.
   7  *
   8  * Copyright (C) 2006 Qumranet, Inc.
   9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  10  *
  11  * Authors:
  12  *   Avi Kivity   <avi@qumranet.com>
  13  *   Yaniv Kamay  <yaniv@qumranet.com>
  14  */
  15
  16 #include <linux/highmem.h>
  17 #include <linux/hrtimer.h>
  18 #include <linux/kernel.h>
  19 #include <linux/kvm_host.h>
  20 #include <linux/module.h>
  21 #include <linux/moduleparam.h>
  22 #include <linux/mod_devicetable.h>
  23 #include <linux/mm.h>
  24 #include <linux/objtool.h>
  25 #include <linux/sched.h>
  26 #include <linux/sched/smt.h>
  27 #include <linux/slab.h>
  28 #include <linux/tboot.h>
  29 #include <linux/trace_events.h>
  30 #include <linux/entry-kvm.h>
  31
  32 #include <asm/apic.h>
  33 #include <asm/asm.h>
  34 #include <asm/cpu.h>
  35 #include <asm/cpu_device_id.h>
  36 #include <asm/debugreg.h>
  37 #include <asm/desc.h>
  38 #include <asm/fpu/api.h>
  39 #include <asm/fpu/xstate.h>
  40 #include <asm/idtentry.h>
  41 #include <asm/io.h>
  42 #include <asm/irq_remapping.h>
  43 #include <asm/kexec.h>
  44 #include <asm/perf_event.h>
  45 #include <asm/mmu_context.h>
  46 #include <asm/mshyperv.h>
  47 #include <asm/mwait.h>
  48 #include <asm/spec-ctrl.h>
  49 #include <asm/virtext.h>
  50 #include <asm/vmx.h>
  51
  52 #include "capabilities.h"
  53 #include "cpuid.h"
  54 #include "evmcs.h"
  55 #include "hyperv.h"
  56 #include "kvm_onhyperv.h"
  57 #include "irq.h"
  58 #include "kvm_cache_regs.h"
  59 #include "lapic.h"
  60 #include "mmu.h"
  61 #include "nested.h"
  62 #include "pmu.h"
  63 #include "sgx.h"
  64 #include "trace.h"
  65 #include "vmcs.h"
  66 #include "vmcs12.h"
  67 #include "vmx.h"
  68 #include "x86.h"
  69
  70 MODULE_AUTHOR("Qumranet");
  71 MODULE_LICENSE("GPL");
  72
  73 #ifdef MODULE
  74 static const struct x86_cpu_id vmx_cpu_id[] = {
  75         X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
  76         {}
  77 };
  78 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
  79 #endif
  80
  81 bool __read_mostly enable_vpid = 1;
  82 module_param_named(vpid, enable_vpid, bool, 0444);
  83
  84 static bool __read_mostly enable_vnmi = 1;
  85 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
  86
  87 bool __read_mostly flexpriority_enabled = 1;
  88 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
  89
  90 bool __read_mostly enable_ept = 1;
  91 module_param_named(ept, enable_ept, bool, S_IRUGO);
  92
  93 bool __read_mostly enable_unrestricted_guest = 1;
  94 module_param_named(unrestricted_guest,
  95                         enable_unrestricted_guest, bool, S_IRUGO);
  96
  97 bool __read_mostly enable_ept_ad_bits = 1;
  98 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
  99
 100 static bool __read_mostly emulate_invalid_guest_state = true;
 101 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 102
 103 static bool __read_mostly fasteoi = 1;
 104 module_param(fasteoi, bool, S_IRUGO);
 105
 106 module_param(enable_apicv, bool, S_IRUGO);
 107
 108 /*
 109  * If nested=1, nested virtualization is supported, i.e., guests may use
 110  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 111  * use VMX instructions.
 112  */
 113 static bool __read_mostly nested = 1;
 114 module_param(nested, bool, S_IRUGO);
 115
 116 bool __read_mostly enable_pml = 1;
 117 module_param_named(pml, enable_pml, bool, S_IRUGO);
 118
 119 static bool __read_mostly dump_invalid_vmcs = 0;
 120 module_param(dump_invalid_vmcs, bool, 0644);
 121
 122 #define MSR_BITMAP_MODE_X2APIC          1
 123 #define MSR_BITMAP_MODE_X2APIC_APICV    2
 124
 125 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 126
 127 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
 128 static int __read_mostly cpu_preemption_timer_multi;
 129 static bool __read_mostly enable_preemption_timer = 1;
 130 #ifdef CONFIG_X86_64
 131 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 132 #endif
 133
 134 extern bool __read_mostly allow_smaller_maxphyaddr;
 135 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
 136
 137 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
 138 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
 139 #define KVM_VM_CR0_ALWAYS_ON                            \
 140         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
 141
 142 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
 143 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
 144 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
 145
 146 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
 147
 148 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
 149         RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
 150         RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
 151         RTIT_STATUS_BYTECNT))
 152
 153 /*
 154  * List of MSRs that can be directly passed to the guest.
 155  * In addition to these x2apic and PT MSRs are handled specially.
 156  */
 157 static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
 158         MSR_IA32_SPEC_CTRL,
 159         MSR_IA32_PRED_CMD,
 160         MSR_IA32_TSC,
 161 #ifdef CONFIG_X86_64
 162         MSR_FS_BASE,
 163         MSR_GS_BASE,
 164         MSR_KERNEL_GS_BASE,
 165         MSR_IA32_XFD,
 166         MSR_IA32_XFD_ERR,
 167 #endif
 168         MSR_IA32_SYSENTER_CS,
 169         MSR_IA32_SYSENTER_ESP,
 170         MSR_IA32_SYSENTER_EIP,
 171         MSR_CORE_C1_RES,
 172         MSR_CORE_C3_RESIDENCY,
 173         MSR_CORE_C6_RESIDENCY,
 174         MSR_CORE_C7_RESIDENCY,
 175 };
 176
 177 /*
 178  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 179  * ple_gap:    upper bound on the amount of time between two successive
 180  *             executions of PAUSE in a loop. Also indicate if ple enabled.
 181  *             According to test, this time is usually smaller than 128 cycles.
 182  * ple_window: upper bound on the amount of time a guest is allowed to execute
 183  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 184  *             less than 2^12 cycles
 185  * Time is measured based on a counter that runs at the same rate as the TSC,
 186  * refer SDM volume 3b section 21.6.13 & 22.1.3.
 187  */
 188 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
 189 module_param(ple_gap, uint, 0444);
 190
 191 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 192 module_param(ple_window, uint, 0444);
 193
 194 /* Default doubles per-vcpu window every exit. */
 195 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 196 module_param(ple_window_grow, uint, 0444);
 197
 198 /* Default resets per-vcpu window every exit to ple_window. */
 199 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 200 module_param(ple_window_shrink, uint, 0444);
 201
 202 /* Default is to compute the maximum so we can never overflow. */
 203 static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 204 module_param(ple_window_max, uint, 0444);
 205
 206 /* Default is SYSTEM mode, 1 for host-guest mode */
 207 int __read_mostly pt_mode = PT_MODE_SYSTEM;
 208 module_param(pt_mode, int, S_IRUGO);
 209
 210 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 211 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
 212 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
 213
 214 /* Storage for pre module init parameter parsing */
 215 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
 216
 217 static const struct {
 218         const char *option;
 219         bool for_parse;
 220 } vmentry_l1d_param[] = {
 221         [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
 222         [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
 223         [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
 224         [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
 225         [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
 226         [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
 227 };
 228
 229 #define L1D_CACHE_ORDER 4
 230 static void *vmx_l1d_flush_pages;
 231
 232 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 233 {
 234         struct page *page;
 235         unsigned int i;
 236
 237         if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
 238                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
 239                 return 0;
 240         }
 241
 242         if (!enable_ept) {
 243                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
 244                 return 0;
 245         }
 246
 247         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
 248                 u64 msr;
 249
 250                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
 251                 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
 252                         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
 253                         return 0;
 254                 }
 255         }
 256
 257         /* If set to auto use the default l1tf mitigation method */
 258         if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
 259                 switch (l1tf_mitigation) {
 260                 case L1TF_MITIGATION_OFF:
 261                         l1tf = VMENTER_L1D_FLUSH_NEVER;
 262                         break;
 263                 case L1TF_MITIGATION_FLUSH_NOWARN:
 264                 case L1TF_MITIGATION_FLUSH:
 265                 case L1TF_MITIGATION_FLUSH_NOSMT:
 266                         l1tf = VMENTER_L1D_FLUSH_COND;
 267                         break;
 268                 case L1TF_MITIGATION_FULL:
 269                 case L1TF_MITIGATION_FULL_FORCE:
 270                         l1tf = VMENTER_L1D_FLUSH_ALWAYS;
 271                         break;
 272                 }
 273         } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
 274                 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
 275         }
 276
 277         if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
 278             !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
 279                 /*
 280                  * This allocation for vmx_l1d_flush_pages is not tied to a VM
 281                  * lifetime and so should not be charged to a memcg.
 282                  */
 283                 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
 284                 if (!page)
 285                         return -ENOMEM;
 286                 vmx_l1d_flush_pages = page_address(page);
 287
 288                 /*
 289                  * Initialize each page with a different pattern in
 290                  * order to protect against KSM in the nested
 291                  * virtualization case.
 292                  */
 293                 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
 294                         memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
 295                                PAGE_SIZE);
 296                 }
 297         }
 298
 299         l1tf_vmx_mitigation = l1tf;
 300
 301         if (l1tf != VMENTER_L1D_FLUSH_NEVER)
 302                 static_branch_enable(&vmx_l1d_should_flush);
 303         else
 304                 static_branch_disable(&vmx_l1d_should_flush);
 305
 306         if (l1tf == VMENTER_L1D_FLUSH_COND)
 307                 static_branch_enable(&vmx_l1d_flush_cond);
 308         else
 309                 static_branch_disable(&vmx_l1d_flush_cond);
 310         return 0;
 311 }
 312
 313 static int vmentry_l1d_flush_parse(const char *s)
 314 {
 315         unsigned int i;
 316
 317         if (s) {
 318                 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
 319                         if (vmentry_l1d_param[i].for_parse &&
 320                             sysfs_streq(s, vmentry_l1d_param[i].option))
 321                                 return i;
 322                 }
 323         }
 324         return -EINVAL;
 325 }
 326
 327 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
 328 {
 329         int l1tf, ret;
 330
 331         l1tf = vmentry_l1d_flush_parse(s);
 332         if (l1tf < 0)
 333                 return l1tf;
 334
 335         if (!boot_cpu_has(X86_BUG_L1TF))
 336                 return 0;
 337
 338         /*
 339          * Has vmx_init() run already? If not then this is the pre init
 340          * parameter parsing. In that case just store the value and let
 341          * vmx_init() do the proper setup after enable_ept has been
 342          * established.
 343          */
 344         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
 345                 vmentry_l1d_flush_param = l1tf;
 346                 return 0;
 347         }
 348
 349         mutex_lock(&vmx_l1d_flush_mutex);
 350         ret = vmx_setup_l1d_flush(l1tf);
 351         mutex_unlock(&vmx_l1d_flush_mutex);
 352         return ret;
 353 }
 354
 355 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
 356 {
 357         if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
 358                 return sprintf(s, "???\n");
 359
 360         return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
 361 }
 362
 363 static const struct kernel_param_ops vmentry_l1d_flush_ops = {
 364         .set = vmentry_l1d_flush_set,
 365         .get = vmentry_l1d_flush_get,
 366 };
 367 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 368
 369 static u32 vmx_segment_access_rights(struct kvm_segment *var);
 370
 371 void vmx_vmexit(void);
 372
 373 #define vmx_insn_failed(fmt...)         \
 374 do {                                    \
 375         WARN_ONCE(1, fmt);              \
 376         pr_warn_ratelimited(fmt);       \
 377 } while (0)
 378
 379 asmlinkage void vmread_error(unsigned long field, bool fault)
 380 {
 381         if (fault)
 382                 kvm_spurious_fault();
 383         else
 384                 vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
 385 }
 386
 387 noinline void vmwrite_error(unsigned long field, unsigned long value)
 388 {
 389         vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
 390                         field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
 391 }
 392
 393 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
 394 {
 395         vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
 396 }
 397
 398 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
 399 {
 400         vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
 401 }
 402
 403 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
 404 {
 405         vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
 406                         ext, vpid, gva);
 407 }
 408
 409 noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
 410 {
 411         vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
 412                         ext, eptp, gpa);
 413 }
 414
 415 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 416 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 417 /*
 418  * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 419  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 420  */
 421 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 422
 423 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 424 static DEFINE_SPINLOCK(vmx_vpid_lock);
 425
 426 struct vmcs_config vmcs_config;
 427 struct vmx_capability vmx_capability;
 428
 429 #define VMX_SEGMENT_FIELD(seg)                                  \
 430         [VCPU_SREG_##seg] = {                                   \
 431                 .selector = GUEST_##seg##_SELECTOR,             \
 432                 .base = GUEST_##seg##_BASE,                     \
 433                 .limit = GUEST_##seg##_LIMIT,                   \
 434                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
 435         }
 436
 437 static const struct kvm_vmx_segment_field {
 438         unsigned selector;
 439         unsigned base;
 440         unsigned limit;
 441         unsigned ar_bytes;
 442 } kvm_vmx_segment_fields[] = {
 443         VMX_SEGMENT_FIELD(CS),
 444         VMX_SEGMENT_FIELD(DS),
 445         VMX_SEGMENT_FIELD(ES),
 446         VMX_SEGMENT_FIELD(FS),
 447         VMX_SEGMENT_FIELD(GS),
 448         VMX_SEGMENT_FIELD(SS),
 449         VMX_SEGMENT_FIELD(TR),
 450         VMX_SEGMENT_FIELD(LDTR),
 451 };
 452
 453 static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
 454 {
 455         vmx->segment_cache.bitmask = 0;
 456 }
 457
 458 static unsigned long host_idt_base;
 459
 460 #if IS_ENABLED(CONFIG_HYPERV)
 461 static bool __read_mostly enlightened_vmcs = true;
 462 module_param(enlightened_vmcs, bool, 0444);
 463
 464 static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
 465 {
 466         struct hv_enlightened_vmcs *evmcs;
 467         struct hv_partition_assist_pg **p_hv_pa_pg =
 468                         &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
 469         /*
 470          * Synthetic VM-Exit is not enabled in current code and so All
 471          * evmcs in singe VM shares same assist page.
 472          */
 473         if (!*p_hv_pa_pg)
 474                 *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
 475
 476         if (!*p_hv_pa_pg)
 477                 return -ENOMEM;
 478
 479         evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
 480
 481         evmcs->partition_assist_page =
 482                 __pa(*p_hv_pa_pg);
 483         evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
 484         evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
 485
 486         return 0;
 487 }
 488
 489 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 490
 491 /*
 492  * Comment's format: document - errata name - stepping - processor name.
 493  * Refer from
 494  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 495  */
 496 static u32 vmx_preemption_cpu_tfms[] = {
 497 /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
 498 0x000206E6,
 499 /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
 500 /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
 501 /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
 502 0x00020652,
 503 /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
 504 0x00020655,
 505 /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
 506 /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
 507 /*
 508  * 320767.pdf - AAP86  - B1 -
 509  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 510  */
 511 0x000106E5,
 512 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
 513 0x000106A0,
 514 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
 515 0x000106A1,
 516 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
 517 0x000106A4,
 518  /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
 519  /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
 520  /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
 521 0x000106A5,
 522  /* Xeon E3-1220 V2 */
 523 0x000306A8,
 524 };
 525
 526 static inline bool cpu_has_broken_vmx_preemption_timer(void)
 527 {
 528         u32 eax = cpuid_eax(0x00000001), i;
 529
 530         /* Clear the reserved bits */
 531         eax &= ~(0x3U << 14 | 0xfU << 28);
 532         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
 533                 if (eax == vmx_preemption_cpu_tfms[i])
 534                         return true;
 535
 536         return false;
 537 }
 538
 539 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
 540 {
 541         return flexpriority_enabled && lapic_in_kernel(vcpu);
 542 }
 543
 544 static inline bool report_flexpriority(void)
 545 {
 546         return flexpriority_enabled;
 547 }
 548
 549 static int possible_passthrough_msr_slot(u32 msr)
 550 {
 551         u32 i;
 552
 553         for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
 554                 if (vmx_possible_passthrough_msrs[i] == msr)
 555                         return i;
 556
 557         return -ENOENT;
 558 }
 559
 560 static bool is_valid_passthrough_msr(u32 msr)
 561 {
 562         bool r;
 563
 564         switch (msr) {
 565         case 0x800 ... 0x8ff:
 566                 /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
 567                 return true;
 568         case MSR_IA32_RTIT_STATUS:
 569         case MSR_IA32_RTIT_OUTPUT_BASE:
 570         case MSR_IA32_RTIT_OUTPUT_MASK:
 571         case MSR_IA32_RTIT_CR3_MATCH:
 572         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
 573                 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
 574         case MSR_LBR_SELECT:
 575         case MSR_LBR_TOS:
 576         case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
 577         case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
 578         case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
 579         case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
 580         case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
 581                 /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
 582                 return true;
 583         }
 584
 585         r = possible_passthrough_msr_slot(msr) != -ENOENT;
 586
 587         WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
 588
 589         return r;
 590 }
 591
 592 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
 593 {
 594         int i;
 595
 596         i = kvm_find_user_return_msr(msr);
 597         if (i >= 0)
 598                 return &vmx->guest_uret_msrs[i];
 599         return NULL;
 600 }
 601
 602 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
 603                                   struct vmx_uret_msr *msr, u64 data)
 604 {
 605         unsigned int slot = msr - vmx->guest_uret_msrs;
 606         int ret = 0;
 607
 608         if (msr->load_into_hardware) {
 609                 preempt_disable();
 610                 ret = kvm_set_user_return_msr(slot, data, msr->mask);
 611                 preempt_enable();
 612         }
 613         if (!ret)
 614                 msr->data = data;
 615         return ret;
 616 }
 617
 618 #ifdef CONFIG_KEXEC_CORE
 619 static void crash_vmclear_local_loaded_vmcss(void)
 620 {
 621         int cpu = raw_smp_processor_id();
 622         struct loaded_vmcs *v;
 623
 624         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
 625                             loaded_vmcss_on_cpu_link)
 626                 vmcs_clear(v->vmcs);
 627 }
 628 #endif /* CONFIG_KEXEC_CORE */
 629
 630 static void __loaded_vmcs_clear(void *arg)
 631 {
 632         struct loaded_vmcs *loaded_vmcs = arg;
 633         int cpu = raw_smp_processor_id();
 634
 635         if (loaded_vmcs->cpu != cpu)
 636                 return; /* vcpu migration can race with cpu offline */
 637         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
 638                 per_cpu(current_vmcs, cpu) = NULL;
 639
 640         vmcs_clear(loaded_vmcs->vmcs);
 641         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
 642                 vmcs_clear(loaded_vmcs->shadow_vmcs);
 643
 644         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
 645
 646         /*
 647          * Ensure all writes to loaded_vmcs, including deleting it from its
 648          * current percpu list, complete before setting loaded_vmcs->vcpu to
 649          * -1, otherwise a different cpu can see vcpu == -1 first and add
 650          * loaded_vmcs to its percpu list before it's deleted from this cpu's
 651          * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
 652          */
 653         smp_wmb();
 654
 655         loaded_vmcs->cpu = -1;
 656         loaded_vmcs->launched = 0;
 657 }
 658
 659 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 660 {
 661         int cpu = loaded_vmcs->cpu;
 662
 663         if (cpu != -1)
 664                 smp_call_function_single(cpu,
 665                          __loaded_vmcs_clear, loaded_vmcs, 1);
 666 }
 667
 668 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
 669                                        unsigned field)
 670 {
 671         bool ret;
 672         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
 673
 674         if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
 675                 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
 676                 vmx->segment_cache.bitmask = 0;
 677         }
 678         ret = vmx->segment_cache.bitmask & mask;
 679         vmx->segment_cache.bitmask |= mask;
 680         return ret;
 681 }
 682
 683 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
 684 {
 685         u16 *p = &vmx->segment_cache.seg[seg].selector;
 686
 687         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
 688                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
 689         return *p;
 690 }
 691
 692 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
 693 {
 694         ulong *p = &vmx->segment_cache.seg[seg].base;
 695
 696         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
 697                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
 698         return *p;
 699 }
 700
 701 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
 702 {
 703         u32 *p = &vmx->segment_cache.seg[seg].limit;
 704
 705         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
 706                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
 707         return *p;
 708 }
 709
 710 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
 711 {
 712         u32 *p = &vmx->segment_cache.seg[seg].ar;
 713
 714         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
 715                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
 716         return *p;
 717 }
 718
 719 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
 720 {
 721         u32 eb;
 722
 723         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 724              (1u << DB_VECTOR) | (1u << AC_VECTOR);
 725         /*
 726          * Guest access to VMware backdoor ports could legitimately
 727          * trigger #GP because of TSS I/O permission bitmap.
 728          * We intercept those #GP and allow access to them anyway
 729          * as VMware does.
 730          */
 731         if (enable_vmware_backdoor)
 732                 eb |= (1u << GP_VECTOR);
 733         if ((vcpu->guest_debug &
 734              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
 735             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
 736                 eb |= 1u << BP_VECTOR;
 737         if (to_vmx(vcpu)->rmode.vm86_active)
 738                 eb = ~0;
 739         if (!vmx_need_pf_intercept(vcpu))
 740                 eb &= ~(1u << PF_VECTOR);
 741
 742         /* When we are running a nested L2 guest and L1 specified for it a
 743          * certain exception bitmap, we must trap the same exceptions and pass
 744          * them to L1. When running L2, we will only handle the exceptions
 745          * specified above if L1 did not want them.
 746          */
 747         if (is_guest_mode(vcpu))
 748                 eb |= get_vmcs12(vcpu)->exception_bitmap;
 749         else {
 750                 int mask = 0, match = 0;
 751
 752                 if (enable_ept && (eb & (1u << PF_VECTOR))) {
 753                         /*
 754                          * If EPT is enabled, #PF is currently only intercepted
 755                          * if MAXPHYADDR is smaller on the guest than on the
 756                          * host.  In that case we only care about present,
 757                          * non-reserved faults.  For vmcs02, however, PFEC_MASK
 758                          * and PFEC_MATCH are set in prepare_vmcs02_rare.
 759                          */
 760                         mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
 761                         match = PFERR_PRESENT_MASK;
 762                 }
 763                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
 764                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
 765         }
 766
 767         /*
 768          * Disabling xfd interception indicates that dynamic xfeatures
 769          * might be used in the guest. Always trap #NM in this case
 770          * to save guest xfd_err timely.
 771          */
 772         if (vcpu->arch.xfd_no_write_intercept)
 773                 eb |= (1u << NM_VECTOR);
 774
 775         vmcs_write32(EXCEPTION_BITMAP, eb);
 776 }
 777
 778 /*
 779  * Check if MSR is intercepted for currently loaded MSR bitmap.
 780  */
 781 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
 782 {
 783         if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
 784                 return true;
 785
 786         return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap,
 787                                          MSR_IA32_SPEC_CTRL);
 788 }
 789
 790 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
 791                 unsigned long entry, unsigned long exit)
 792 {
 793         vm_entry_controls_clearbit(vmx, entry);
 794         vm_exit_controls_clearbit(vmx, exit);
 795 }
 796
 797 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
 798 {
 799         unsigned int i;
 800
 801         for (i = 0; i < m->nr; ++i) {
 802                 if (m->val[i].index == msr)
 803                         return i;
 804         }
 805         return -ENOENT;
 806 }
 807
 808 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 809 {
 810         int i;
 811         struct msr_autoload *m = &vmx->msr_autoload;
 812
 813         switch (msr) {
 814         case MSR_EFER:
 815                 if (cpu_has_load_ia32_efer()) {
 816                         clear_atomic_switch_msr_special(vmx,
 817                                         VM_ENTRY_LOAD_IA32_EFER,
 818                                         VM_EXIT_LOAD_IA32_EFER);
 819                         return;
 820                 }
 821                 break;
 822         case MSR_CORE_PERF_GLOBAL_CTRL:
 823                 if (cpu_has_load_perf_global_ctrl()) {
 824                         clear_atomic_switch_msr_special(vmx,
 825                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
 826                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
 827                         return;
 828                 }
 829                 break;
 830         }
 831         i = vmx_find_loadstore_msr_slot(&m->guest, msr);
 832         if (i < 0)
 833                 goto skip_guest;
 834         --m->guest.nr;
 835         m->guest.val[i] = m->guest.val[m->guest.nr];
 836         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
 837
 838 skip_guest:
 839         i = vmx_find_loadstore_msr_slot(&m->host, msr);
 840         if (i < 0)
 841                 return;
 842
 843         --m->host.nr;
 844         m->host.val[i] = m->host.val[m->host.nr];
 845         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
 846 }
 847
 848 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
 849                 unsigned long entry, unsigned long exit,
 850                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
 851                 u64 guest_val, u64 host_val)
 852 {
 853         vmcs_write64(guest_val_vmcs, guest_val);
 854         if (host_val_vmcs != HOST_IA32_EFER)
 855                 vmcs_write64(host_val_vmcs, host_val);
 856         vm_entry_controls_setbit(vmx, entry);
 857         vm_exit_controls_setbit(vmx, exit);
 858 }
 859
 860 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 861                                   u64 guest_val, u64 host_val, bool entry_only)
 862 {
 863         int i, j = 0;
 864         struct msr_autoload *m = &vmx->msr_autoload;
 865
 866         switch (msr) {
 867         case MSR_EFER:
 868                 if (cpu_has_load_ia32_efer()) {
 869                         add_atomic_switch_msr_special(vmx,
 870                                         VM_ENTRY_LOAD_IA32_EFER,
 871                                         VM_EXIT_LOAD_IA32_EFER,
 872                                         GUEST_IA32_EFER,
 873                                         HOST_IA32_EFER,
 874                                         guest_val, host_val);
 875                         return;
 876                 }
 877                 break;
 878         case MSR_CORE_PERF_GLOBAL_CTRL:
 879                 if (cpu_has_load_perf_global_ctrl()) {
 880                         add_atomic_switch_msr_special(vmx,
 881                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
 882                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
 883                                         GUEST_IA32_PERF_GLOBAL_CTRL,
 884                                         HOST_IA32_PERF_GLOBAL_CTRL,
 885                                         guest_val, host_val);
 886                         return;
 887                 }
 888                 break;
 889         case MSR_IA32_PEBS_ENABLE:
 890                 /* PEBS needs a quiescent period after being disabled (to write
 891                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
 892                  * provide that period, so a CPU could write host's record into
 893                  * guest's memory.
 894                  */
 895                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
 896         }
 897
 898         i = vmx_find_loadstore_msr_slot(&m->guest, msr);
 899         if (!entry_only)
 900                 j = vmx_find_loadstore_msr_slot(&m->host, msr);
 901
 902         if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
 903             (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
 904                 printk_once(KERN_WARNING "Not enough msr switch entries. "
 905                                 "Can't add msr %x\n", msr);
 906                 return;
 907         }
 908         if (i < 0) {
 909                 i = m->guest.nr++;
 910                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
 911         }
 912         m->guest.val[i].index = msr;
 913         m->guest.val[i].value = guest_val;
 914
 915         if (entry_only)
 916                 return;
 917
 918         if (j < 0) {
 919                 j = m->host.nr++;
 920                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
 921         }
 922         m->host.val[j].index = msr;
 923         m->host.val[j].value = host_val;
 924 }
 925
 926 static bool update_transition_efer(struct vcpu_vmx *vmx)
 927 {
 928         u64 guest_efer = vmx->vcpu.arch.efer;
 929         u64 ignore_bits = 0;
 930         int i;
 931
 932         /* Shadow paging assumes NX to be available.  */
 933         if (!enable_ept)
 934                 guest_efer |= EFER_NX;
 935
 936         /*
 937          * LMA and LME handled by hardware; SCE meaningless outside long mode.
 938          */
 939         ignore_bits |= EFER_SCE;
 940 #ifdef CONFIG_X86_64
 941         ignore_bits |= EFER_LMA | EFER_LME;
 942         /* SCE is meaningful only in long mode on Intel */
 943         if (guest_efer & EFER_LMA)
 944                 ignore_bits &= ~(u64)EFER_SCE;
 945 #endif
 946
 947         /*
 948          * On EPT, we can't emulate NX, so we must switch EFER atomically.
 949          * On CPUs that support "load IA32_EFER", always switch EFER
 950          * atomically, since it's faster than switching it manually.
 951          */
 952         if (cpu_has_load_ia32_efer() ||
 953             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
 954                 if (!(guest_efer & EFER_LMA))
 955                         guest_efer &= ~EFER_LME;
 956                 if (guest_efer != host_efer)
 957                         add_atomic_switch_msr(vmx, MSR_EFER,
 958                                               guest_efer, host_efer, false);
 959                 else
 960                         clear_atomic_switch_msr(vmx, MSR_EFER);
 961                 return false;
 962         }
 963
 964         i = kvm_find_user_return_msr(MSR_EFER);
 965         if (i < 0)
 966                 return false;
 967
 968         clear_atomic_switch_msr(vmx, MSR_EFER);
 969
 970         guest_efer &= ~ignore_bits;
 971         guest_efer |= host_efer & ignore_bits;
 972
 973         vmx->guest_uret_msrs[i].data = guest_efer;
 974         vmx->guest_uret_msrs[i].mask = ~ignore_bits;
 975
 976         return true;
 977 }
 978
 979 #ifdef CONFIG_X86_32
 980 /*
 981  * On 32-bit kernels, VM exits still load the FS and GS bases from the
 982  * VMCS rather than the segment table.  KVM uses this helper to figure
 983  * out the current bases to poke them into the VMCS before entry.
 984  */
 985 static unsigned long segment_base(u16 selector)
 986 {
 987         struct desc_struct *table;
 988         unsigned long v;
 989
 990         if (!(selector & ~SEGMENT_RPL_MASK))
 991                 return 0;
 992
 993         table = get_current_gdt_ro();
 994
 995         if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
 996                 u16 ldt_selector = kvm_read_ldt();
 997
 998                 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
 999                         return 0;
1000
1001                 table = (struct desc_struct *)segment_base(ldt_selector);
1002         }
1003         v = get_desc_base(&table[selector >> 3]);
1004         return v;
1005 }
1006 #endif
1007
1008 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1009 {
1010         return vmx_pt_mode_is_host_guest() &&
1011                !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1012 }
1013
/*
 * Validate a value destined for the PT output base.
 *
 * Returns the result of kvm_vcpu_is_legal_aligned_gpa() for @base with an
 * alignment of 128.
 */
static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
        /* The base must be 128-byte aligned and a legal physical address. */
        return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}
1019
1020 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1021 {
1022         u32 i;
1023
1024         wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1025         wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1026         wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1027         wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1028         for (i = 0; i < addr_range; i++) {
1029                 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1030                 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1031         }
1032 }
1033
1034 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1035 {
1036         u32 i;
1037
1038         rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1039         rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1040         rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1041         rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1042         for (i = 0; i < addr_range; i++) {
1043                 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1044                 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1045         }
1046 }
1047
1048 static void pt_guest_enter(struct vcpu_vmx *vmx)
1049 {
1050         if (vmx_pt_mode_is_system())
1051                 return;
1052
1053         /*
1054          * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1055          * Save host state before VM entry.
1056          */
1057         rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1058         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1059                 wrmsrl(MSR_IA32_RTIT_CTL, 0);
1060                 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1061                 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1062         }
1063 }
1064
1065 static void pt_guest_exit(struct vcpu_vmx *vmx)
1066 {
1067         if (vmx_pt_mode_is_system())
1068                 return;
1069
1070         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1071                 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1072                 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1073         }
1074
1075         /*
1076          * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
1077          * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
1078          */
1079         if (vmx->pt_desc.host.ctl)
1080                 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1081 }
1082
1083 void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3,
1084                              u16 fs_sel, u16 gs_sel,
1085                              unsigned long fs_base, unsigned long gs_base)
1086 {
1087         if (unlikely(cr3 != host->cr3)) {
1088                 vmcs_writel(HOST_CR3, cr3);
1089                 host->cr3 = cr3;
1090         }
1091         if (unlikely(fs_sel != host->fs_sel)) {
1092                 if (!(fs_sel & 7))
1093                         vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1094                 else
1095                         vmcs_write16(HOST_FS_SELECTOR, 0);
1096                 host->fs_sel = fs_sel;
1097         }
1098         if (unlikely(gs_sel != host->gs_sel)) {
1099                 if (!(gs_sel & 7))
1100                         vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1101                 else
1102                         vmcs_write16(HOST_GS_SELECTOR, 0);
1103                 host->gs_sel = gs_sel;
1104         }
1105         if (unlikely(fs_base != host->fs_base)) {
1106                 vmcs_writel(HOST_FS_BASE, fs_base);
1107                 host->fs_base = fs_base;
1108         }
1109         if (unlikely(gs_base != host->gs_base)) {
1110                 vmcs_writel(HOST_GS_BASE, gs_base);
1111                 host->gs_base = gs_base;
1112         }
1113 }
1114
/*
 * Prepare host-side state for running the guest.
 *
 * Clears req_immediate_exit, pushes the guest's user-return MSRs into
 * hardware if that has not been done yet, and syncs vmcs12 to the shadow VMCS
 * when requested.  Then, unless guest state is already loaded, it snapshots
 * the host's LDT/segment selectors and FS/GS bases, records them in the
 * loaded VMCS's host state, and marks guest state as loaded.  On 64-bit
 * kernels it also saves the host's MSR_KERNEL_GS_BASE value and writes the
 * guest's value to the MSR.
 */
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
        int cpu = raw_smp_processor_id();
#endif
        unsigned long fs_base, gs_base;
        u16 fs_sel, gs_sel;
        int i;

        vmx->req_immediate_exit = false;

        /*
         * Note that guest MSRs to be saved/restored can also be changed
         * when guest state is loaded. This happens when guest transitions
         * to/from long-mode by setting MSR_EFER.LMA.
         */
        if (!vmx->guest_uret_msrs_loaded) {
                vmx->guest_uret_msrs_loaded = true;
                for (i = 0; i < kvm_nr_uret_msrs; ++i) {
                        /* Skip MSRs not flagged for loading into hardware. */
                        if (!vmx->guest_uret_msrs[i].load_into_hardware)
                                continue;

                        kvm_set_user_return_msr(i,
                                                vmx->guest_uret_msrs[i].data,
                                                vmx->guest_uret_msrs[i].mask);
                }
        }

        if (vmx->nested.need_vmcs12_to_shadow_sync)
                nested_sync_vmcs12_to_shadow(vcpu);

        /* Host state was already captured for this load cycle. */
        if (vmx->guest_state_loaded)
                return;

        host_state = &vmx->loaded_vmcs->host_state;

        /*
         * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
         * allow segment selectors with cpl > 0 or ti == 1.
         */
        host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
        savesegment(ds, host_state->ds_sel);
        savesegment(es, host_state->es_sel);

        gs_base = cpu_kernelmode_gs_base(cpu);
        if (likely(is_64bit_mm(current->mm))) {
                /* Take selectors and bases from current->thread. */
                current_save_fsgs();
                fs_sel = current->thread.fsindex;
                gs_sel = current->thread.gsindex;
                fs_base = current->thread.fsbase;
                vmx->msr_host_kernel_gs_base = current->thread.gsbase;
        } else {
                /* Otherwise read the selectors and MSRs directly. */
                savesegment(fs, fs_sel);
                savesegment(gs, gs_sel);
                fs_base = read_msr(MSR_FS_BASE);
                vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
        }

        /* The host value was saved above; install the guest's value. */
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
        savesegment(fs, fs_sel);
        savesegment(gs, gs_sel);
        fs_base = segment_base(fs_sel);
        gs_base = segment_base(gs_sel);
#endif

        vmx_set_vmcs_host_state(host_state, __get_current_cr3_fast(),
                                fs_sel, gs_sel, fs_base, gs_base);

        vmx->guest_state_loaded = true;
}
1190
/*
 * Undo vmx_prepare_switch_to_guest(): restore the host state recorded in the
 * loaded VMCS's host_state.  Does nothing if guest state is not loaded.
 *
 * Bumps the host_state_reload statistic and, on 64-bit kernels, saves the
 * guest's MSR_KERNEL_GS_BASE before restoring the host's value.  Clears both
 * guest_state_loaded and guest_uret_msrs_loaded on the way out.
 */
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
        struct vmcs_host_state *host_state;

        if (!vmx->guest_state_loaded)
                return;

        host_state = &vmx->loaded_vmcs->host_state;

        ++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
        /* Capture the guest's current value before the host's is restored. */
        rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
        /*
         * Reload the LDT and GS when an LDT was in use or the saved GS
         * selector has any of its low three bits set.
         */
        if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
                kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
                load_gs_index(host_state->gs_sel);
#else
                loadsegment(gs, host_state->gs_sel);
#endif
        }
        /* Same for FS: reload only if its low three bits are non-zero. */
        if (host_state->fs_sel & 7)
                loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
        if (unlikely(host_state->ds_sel | host_state->es_sel)) {
                loadsegment(ds, host_state->ds_sel);
                loadsegment(es, host_state->es_sel);
        }
#endif
        invalidate_tss_limit();
#ifdef CONFIG_X86_64
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
        load_fixmap_gdt(raw_smp_processor_id());
        vmx->guest_state_loaded = false;
        vmx->guest_uret_msrs_loaded = false;
}
1229
1230 #ifdef CONFIG_X86_64
/*
 * Return the guest's MSR_KERNEL_GS_BASE value.
 *
 * If guest state is currently loaded, the cached copy is first refreshed
 * from the hardware MSR.  The check and the MSR read run with preemption
 * disabled, presumably so guest_state_loaded cannot change in between.
 */
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
        preempt_disable();
        if (vmx->guest_state_loaded)
                rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
        preempt_enable();
        return vmx->msr_guest_kernel_gs_base;
}
1239
/*
 * Set the guest's MSR_KERNEL_GS_BASE value to @data.
 *
 * If guest state is currently loaded, the hardware MSR is written as well;
 * the cached copy is always updated.  The check and the MSR write run with
 * preemption disabled, presumably so guest_state_loaded cannot change in
 * between.
 */
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
        preempt_disable();
        if (vmx->guest_state_loaded)
                wrmsrl(MSR_KERNEL_GS_BASE, data);
        preempt_enable();
        vmx->msr_guest_kernel_gs_base = data;
}
1248 #endif
1249
/*
 * Make vmx->loaded_vmcs the current VMCS on @cpu.
 *
 * If the VMCS was last loaded on a different CPU, it is cleared, added to
 * @cpu's loaded_vmcss_on_cpu list, a TLB flush is requested and the per-CPU
 * host fields (TR base, GDTR base and, where applicable, SYSENTER ESP) are
 * rewritten.  If the VMCS is not already @cpu's current VMCS it is loaded;
 * an indirect branch prediction barrier is issued unless @buddy is non-NULL
 * and its VMCS is the one that was previously current.
 */
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
                        struct loaded_vmcs *buddy)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
        struct vmcs *prev;

        if (!already_loaded) {
                loaded_vmcs_clear(vmx->loaded_vmcs);
                local_irq_disable();

                /*
                 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
                 * this cpu's percpu list, otherwise it may not yet be deleted
                 * from its previous cpu's percpu list.  Pairs with the
                 * smp_wmb() in __loaded_vmcs_clear().
                 */
                smp_rmb();

                list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
                local_irq_enable();
        }

        prev = per_cpu(current_vmcs, cpu);
        if (prev != vmx->loaded_vmcs->vmcs) {
                per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
                vmcs_load(vmx->loaded_vmcs->vmcs);

                /*
                 * No indirect branch prediction barrier needed when switching
                 * the active VMCS within a guest, e.g. on nested VM-Enter.
                 * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
                 */
                if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
                        indirect_branch_prediction_barrier();
        }

        if (!already_loaded) {
                void *gdt = get_current_gdt_ro();

                /*
                 * Flush all EPTP/VPID contexts, the new pCPU may have stale
                 * TLB entries from its previous association with the vCPU.
                 */
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

                /*
                 * Linux uses per-cpu TSS and GDT, so set these when switching
                 * processors.  See 22.2.4.
                 */
                vmcs_writel(HOST_TR_BASE,
                            (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
                vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

                if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
                        /* 22.2.3 */
                        vmcs_writel(HOST_IA32_SYSENTER_ESP,
                                    (unsigned long)(cpu_entry_stack(cpu) + 1));
                }

                /* Record the new home CPU only after all fields are set. */
                vmx->loaded_vmcs->cpu = cpu;
        }
}
1314
/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 *
 * Loads the vCPU's VMCS on @cpu (with no buddy VMCS), runs the
 * posted-interrupt load hook and snapshots the host's debugctl MSR value
 * into vmx->host_debugctlmsr.
 */
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        vmx_vcpu_load_vmcs(vcpu, cpu, NULL);

        vmx_vcpu_pi_load(vcpu, cpu);

        vmx->host_debugctlmsr = get_debugctlmsr();
}
1329
/*
 * Counterpart of vmx_vcpu_load(): run the posted-interrupt put hook, then
 * restore host state via vmx_prepare_switch_to_host().
 */
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        vmx_vcpu_pi_put(vcpu);
        vmx_prepare_switch_to_host(vmx);
}
1336
1337 bool vmx_emulation_required(struct kvm_vcpu *vcpu)
1338 {
1339         return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
1340 }
1341
1342 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1343 {
1344         struct vcpu_vmx *vmx = to_vmx(vcpu);
1345         unsigned long rflags, save_rflags;
1346
1347         if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
1348                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1349                 rflags = vmcs_readl(GUEST_RFLAGS);
1350                 if (vmx->rmode.vm86_active) {
1351                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1352                         save_rflags = vmx->rmode.save_rflags;
1353                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1354                 }
1355                 vmx->rflags = rflags;
1356         }
1357         return vmx->rflags;
1358 }
1359
1360 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1361 {
1362         struct vcpu_vmx *vmx = to_vmx(vcpu);
1363         unsigned long old_rflags;
1364
1365         if (is_unrestricted_guest(vcpu)) {
1366                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1367                 vmx->rflags = rflags;
1368                 vmcs_writel(GUEST_RFLAGS, rflags);
1369                 return;
1370         }
1371
1372         old_rflags = vmx_get_rflags(vcpu);
1373         vmx->rflags = rflags;
1374         if (vmx->rmode.vm86_active) {
1375                 vmx->rmode.save_rflags = rflags;
1376                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1377         }
1378         vmcs_writel(GUEST_RFLAGS, rflags);
1379
1380         if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1381                 vmx->emulation_required = vmx_emulation_required(vcpu);
1382 }
1383
1384 static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
1385 {
1386         return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
1387 }
1388
1389 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1390 {
1391         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1392         int ret = 0;
1393
1394         if (interruptibility & GUEST_INTR_STATE_STI)
1395                 ret |= KVM_X86_SHADOW_INT_STI;
1396         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1397                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1398
1399         return ret;
1400 }
1401
1402 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1403 {
1404         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1405         u32 interruptibility = interruptibility_old;
1406
1407         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1408
1409         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1410                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1411         else if (mask & KVM_X86_SHADOW_INT_STI)
1412                 interruptibility |= GUEST_INTR_STATE_STI;
1413
1414         if ((interruptibility != interruptibility_old))
1415                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1416 }
1417
/*
 * Validate a value the guest wants to write to IA32_RTIT_CTL.
 *
 * Returns 0 if @data is acceptable and 1 if the write must be rejected.  The
 * checks are made against the vCPU's PT description (vmx->pt_desc): the
 * reserved-bit mask, the cached guest RTIT_CTL, the PT capabilities and the
 * number of address ranges.
 */
static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long value;

        /*
         * Any MSR write that attempts to change bits marked reserved will
         * cause a #GP fault.
         */
        if (data & vmx->pt_desc.ctl_bitmask)
                return 1;

        /*
         * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
         * result in a #GP unless the same write also clears TraceEn.
         */
        if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
                ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
                return 1;

        /*
         * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears both ToPA
         * and FabricEn would cause #GP, if
         * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
         */
        if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
                !(data & RTIT_CTL_FABRIC_EN) &&
                !intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_single_range_output))
                return 1;

        /*
         * MTCFreq, CycThresh and PSBFreq encoding checks: any MSR write that
         * utilizes encodings marked reserved will cause a #GP fault.  Each
         * capability value is used as a bitmap of the allowed encodings.
         */
        value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
                        !test_bit((data & RTIT_CTL_MTC_RANGE) >>
                        RTIT_CTL_MTC_RANGE_OFFSET, &value))
                return 1;
        value = intel_pt_validate_cap(vmx->pt_desc.caps,
                                                PT_CAP_cycle_thresholds);
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
                        !test_bit((data & RTIT_CTL_CYC_THRESH) >>
                        RTIT_CTL_CYC_THRESH_OFFSET, &value))
                return 1;
        value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
                        !test_bit((data & RTIT_CTL_PSB_FREQ) >>
                        RTIT_CTL_PSB_FREQ_OFFSET, &value))
                return 1;

        /*
         * Using an ADDRn_CFG field whose address range is not available, or
         * an encoding greater than 2, will cause a #GP fault.
         */
        value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
        if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
                return 1;
        value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
        if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
                return 1;
        value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
        if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
                return 1;
        value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
        if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
                return 1;

        return 0;
}
1489
1490 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
1491 {
1492         /*
1493          * Emulation of instructions in SGX enclaves is impossible as RIP does
1494          * not point  tthe failing instruction, and even if it did, the code
1495          * stream is inaccessible.  Inject #UD instead of exiting to userspace
1496          * so that guest userspace can't DoS the guest simply by triggering
1497          * emulation (enclaves are CPL3 only).
1498          */
1499         if (to_vmx(vcpu)->exit_reason.enclave_mode) {
1500                 kvm_queue_exception(vcpu, UD_VECTOR);
1501                 return false;
1502         }
1503         return true;
1504 }
1505
/*
 * Advance the guest's RIP past the instruction that caused the VM-Exit and
 * clear any interrupt shadow.
 *
 * Returns 1 on success and 0 if the emulator-based skip (used for EPT
 * misconfig exits when running under a hypervisor) fails.
 */
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
        union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
        unsigned long rip, orig_rip;
        u32 instr_len;

        /*
         * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
         * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
         * set when EPT misconfig occurs.  In practice, real hardware updates
         * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
         * (namely Hyper-V) don't set it due to it being undefined behavior,
         * i.e. we end up advancing IP with some random value.
         */
        if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
            exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
                instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);

                /*
                 * Emulating an enclave's instructions isn't supported as KVM
                 * cannot access the enclave's memory or its true RIP, e.g. the
                 * vmcs.GUEST_RIP points at the exit point of the enclave, not
                 * the RIP that actually triggered the VM-Exit.  But, because
                 * most instructions that cause VM-Exit will #UD in an enclave,
                 * most instruction-based VM-Exits simply do not occur.
                 *
                 * There are a few exceptions, notably the debug instructions
                 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
                 * and generate #DB/#BP as expected, which KVM might intercept.
                 * But again, the CPU does the dirty work and saves an instr
                 * length of zero so VMMs don't shoot themselves in the foot.
                 * WARN if KVM tries to skip a non-zero length instruction on
                 * a VM-Exit from an enclave.
                 */
                if (!instr_len)
                        goto rip_updated;

                WARN(exit_reason.enclave_mode,
                     "KVM: skipping instruction after SGX enclave VM-Exit");

                orig_rip = kvm_rip_read(vcpu);
                rip = orig_rip + instr_len;
#ifdef CONFIG_X86_64
                /*
                 * We need to mask out the high 32 bits of RIP if not in 64-bit
                 * mode, but just finding out that we are in 64-bit mode is
                 * quite expensive.  Only do it if there was a carry, i.e. if
                 * exactly bits 31 and 32 differ between the old and new RIP.
                 */
                if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
                        rip = (u32)rip;
#endif
                kvm_rip_write(vcpu, rip);
        } else {
                /* The length field can't be trusted; let the emulator skip. */
                if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
                        return 0;
        }

rip_updated:
        /* skipping an emulated instruction also counts */
        vmx_set_interrupt_shadow(vcpu, 0);

        return 1;
}
1569
1570 /*
1571  * Recognizes a pending MTF VM-exit and records the nested state for later
1572  * delivery.
1573  */
1574 static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1575 {
1576         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1577         struct vcpu_vmx *vmx = to_vmx(vcpu);
1578
1579         if (!is_guest_mode(vcpu))
1580                 return;
1581
1582         /*
1583          * Per the SDM, MTF takes priority over debug-trap exceptions besides
1584          * T-bit traps. As instruction emulation is completed (i.e. at the
1585          * instruction boundary), any #DB exception pending delivery must be a
1586          * debug-trap. Record the pending MTF state to be delivered in
1587          * vmx_check_nested_events().
1588          */
1589         if (nested_cpu_has_mtf(vmcs12) &&
1590             (!vcpu->arch.exception.pending ||
1591              vcpu->arch.exception.nr == DB_VECTOR))
1592                 vmx->nested.mtf_pending = true;
1593         else
1594                 vmx->nested.mtf_pending = false;
1595 }
1596
/*
 * Skip the current guest instruction: first update the pending-MTF state for
 * nested guests, then advance RIP.  Returns the result of
 * skip_emulated_instruction().
 */
static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
        vmx_update_emulated_instruction(vcpu);
        return skip_emulated_instruction(vcpu);
}
1602
1603 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1604 {
1605         /*
1606          * Ensure that we clear the HLT state in the VMCS.  We don't need to
1607          * explicitly skip the instruction because if the HLT state is set,
1608          * then the instruction is already executing and RIP has already been
1609          * advanced.
1610          */
1611         if (kvm_hlt_in_guest(vcpu->kvm) &&
1612                         vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1613                 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1614 }
1615
/*
 * Inject the exception described by vcpu->arch.exception into the guest.
 *
 * In vm86 (real-mode emulation) mode the exception is injected through
 * kvm_inject_realmode_interrupt().  Otherwise the VM-Entry interruption
 * information field is programmed (plus the error code and, for soft
 * exceptions, the instruction length), and the HLT activity state is cleared.
 */
static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        /* Snapshot the exception fields before the payload is delivered. */
        unsigned nr = vcpu->arch.exception.nr;
        bool has_error_code = vcpu->arch.exception.has_error_code;
        u32 error_code = vcpu->arch.exception.error_code;
        u32 intr_info = nr | INTR_INFO_VALID_MASK;

        kvm_deliver_exception_payload(vcpu);

        if (has_error_code) {
                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
        }

        if (vmx->rmode.vm86_active) {
                /* Soft exceptions advance EIP by the exiting instruction's length. */
                int inc_eip = 0;
                if (kvm_exception_is_soft(nr))
                        inc_eip = vcpu->arch.event_exit_inst_len;
                kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
                return;
        }

        WARN_ON_ONCE(vmx->emulation_required);

        if (kvm_exception_is_soft(nr)) {
                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
                             vmx->vcpu.arch.event_exit_inst_len);
                intr_info |= INTR_TYPE_SOFT_EXCEPTION;
        } else
                intr_info |= INTR_TYPE_HARD_EXCEPTION;

        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);

        vmx_clear_hlt(vcpu);
}
1652
1653 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
1654                                bool load_into_hardware)
1655 {
1656         struct vmx_uret_msr *uret_msr;
1657
1658         uret_msr = vmx_find_uret_msr(vmx, msr);
1659         if (!uret_msr)
1660                 return;
1661
1662         uret_msr->load_into_hardware = load_into_hardware;
1663 }
1664
1665 /*
1666  * Configuring user return MSRs to automatically save, load, and restore MSRs
1667  * that need to be shoved into hardware when running the guest.  Note, omitting
1668  * an MSR here does _NOT_ mean it's not emulated, only that it will not be
1669  * loaded into hardware when running the guest.
1670  */
1671 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
1672 {
1673 #ifdef CONFIG_X86_64
1674         bool load_syscall_msrs;
1675
1676         /*
1677          * The SYSCALL MSRs are only needed on long mode guests, and only
1678          * when EFER.SCE is set.
1679          */
1680         load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
1681                             (vmx->vcpu.arch.efer & EFER_SCE);
1682
1683         vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
1684         vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
1685         vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
1686 #endif
1687         vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
1688
1689         vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
1690                            guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
1691                            guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
1692
1693         /*
1694          * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
1695          * kernel and old userspace.  If those guests run on a tsx=off host, do
1696          * allow guests to use TSX_CTRL, but don't change the value in hardware
1697          * so that TSX remains always disabled.
1698          */
1699         vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
1700
1701         /*
1702          * The set of MSRs to load may have changed, reload MSRs before the
1703          * next VM-Enter.
1704          */
1705         vmx->guest_uret_msrs_loaded = false;
1706 }
1707
1708 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1709 {
1710         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1711
1712         if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
1713                 return vmcs12->tsc_offset;
1714
1715         return 0;
1716 }
1717
1718 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1719 {
1720         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1721
1722         if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
1723             nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
1724                 return vmcs12->tsc_multiplier;
1725
1726         return kvm_default_tsc_scaling_ratio;
1727 }
1728
1729 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1730 {
1731         vmcs_write64(TSC_OFFSET, offset);
1732 }
1733
1734 static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1735 {
1736         vmcs_write64(TSC_MULTIPLIER, multiplier);
1737 }
1738
1739 /*
1740  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1741  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1742  * all guests if the "nested" module option is off, and can also be disabled
1743  * for a single guest by disabling its VMX cpuid bit.
1744  */
1745 bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1746 {
1747         return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
1748 }
1749
1750 static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
1751                                                  uint64_t val)
1752 {
1753         uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
1754
1755         return !(val & ~valid_bits);
1756 }
1757
1758 static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
1759 {
1760         switch (msr->index) {
1761         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1762                 if (!nested)
1763                         return 1;
1764                 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1765         case MSR_IA32_PERF_CAPABILITIES:
1766                 msr->data = vmx_get_perf_capabilities();
1767                 return 0;
1768         default:
1769                 return KVM_MSR_RET_INVALID;
1770         }
1771 }
1772
1773 /*
1774  * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
1775  * Returns 0 on success, non-0 otherwise.
1776  * Assumes vcpu_load() was already called.
1777  */
1778 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1779 {
1780         struct vcpu_vmx *vmx = to_vmx(vcpu);
1781         struct vmx_uret_msr *msr;
1782         u32 index;
1783
1784         switch (msr_info->index) {
1785 #ifdef CONFIG_X86_64
1786         case MSR_FS_BASE:
1787                 msr_info->data = vmcs_readl(GUEST_FS_BASE);
1788                 break;
1789         case MSR_GS_BASE:
1790                 msr_info->data = vmcs_readl(GUEST_GS_BASE);
1791                 break;
1792         case MSR_KERNEL_GS_BASE:
1793                 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
1794                 break;
1795 #endif
1796         case MSR_EFER:
1797                 return kvm_get_msr_common(vcpu, msr_info);
1798         case MSR_IA32_TSX_CTRL:
1799                 if (!msr_info->host_initiated &&
1800                     !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
1801                         return 1;
1802                 goto find_uret_msr;
1803         case MSR_IA32_UMWAIT_CONTROL:
1804                 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
1805                         return 1;
1806
1807                 msr_info->data = vmx->msr_ia32_umwait_control;
1808                 break;
1809         case MSR_IA32_SPEC_CTRL:
1810                 if (!msr_info->host_initiated &&
1811                     !guest_has_spec_ctrl_msr(vcpu))
1812                         return 1;
1813
1814                 msr_info->data = to_vmx(vcpu)->spec_ctrl;
1815                 break;
1816         case MSR_IA32_SYSENTER_CS:
1817                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
1818                 break;
1819         case MSR_IA32_SYSENTER_EIP:
1820                 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
1821                 break;
1822         case MSR_IA32_SYSENTER_ESP:
1823                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
1824                 break;
1825         case MSR_IA32_BNDCFGS:
1826                 if (!kvm_mpx_supported() ||
1827                     (!msr_info->host_initiated &&
1828                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
1829                         return 1;
1830                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
1831                 break;
1832         case MSR_IA32_MCG_EXT_CTL:
1833                 if (!msr_info->host_initiated &&
1834                     !(vmx->msr_ia32_feature_control &
1835                       FEAT_CTL_LMCE_ENABLED))
1836                         return 1;
1837                 msr_info->data = vcpu->arch.mcg_ext_ctl;
1838                 break;
1839         case MSR_IA32_FEAT_CTL:
1840                 msr_info->data = vmx->msr_ia32_feature_control;
1841                 break;
1842         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
1843                 if (!msr_info->host_initiated &&
1844                     !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
1845                         return 1;
1846                 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
1847                         [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
1848                 break;
1849         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1850                 if (!nested_vmx_allowed(vcpu))
1851                         return 1;
1852                 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
1853                                     &msr_info->data))
1854                         return 1;
1855                 /*
1856                  * Enlightened VMCS v1 doesn't have certain VMCS fields but
1857                  * instead of just ignoring the features, different Hyper-V
1858                  * versions are either trying to use them and fail or do some
1859                  * sanity checking and refuse to boot. Filter all unsupported
1860                  * features out.
1861                  */
1862                 if (!msr_info->host_initiated &&
1863                     vmx->nested.enlightened_vmcs_enabled)
1864                         nested_evmcs_filter_control_msr(msr_info->index,
1865                                                         &msr_info->data);
1866                 break;
1867         case MSR_IA32_RTIT_CTL:
1868                 if (!vmx_pt_mode_is_host_guest())
1869                         return 1;
1870                 msr_info->data = vmx->pt_desc.guest.ctl;
1871                 break;
1872         case MSR_IA32_RTIT_STATUS:
1873                 if (!vmx_pt_mode_is_host_guest())
1874                         return 1;
1875                 msr_info->data = vmx->pt_desc.guest.status;
1876                 break;
1877         case MSR_IA32_RTIT_CR3_MATCH:
1878                 if (!vmx_pt_mode_is_host_guest() ||
1879                         !intel_pt_validate_cap(vmx->pt_desc.caps,
1880                                                 PT_CAP_cr3_filtering))
1881                         return 1;
1882                 msr_info->data = vmx->pt_desc.guest.cr3_match;
1883                 break;
1884         case MSR_IA32_RTIT_OUTPUT_BASE:
1885                 if (!vmx_pt_mode_is_host_guest() ||
1886                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
1887                                         PT_CAP_topa_output) &&
1888                          !intel_pt_validate_cap(vmx->pt_desc.caps,
1889                                         PT_CAP_single_range_output)))
1890                         return 1;
1891                 msr_info->data = vmx->pt_desc.guest.output_base;
1892                 break;
1893         case MSR_IA32_RTIT_OUTPUT_MASK:
1894                 if (!vmx_pt_mode_is_host_guest() ||
1895                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
1896                                         PT_CAP_topa_output) &&
1897                          !intel_pt_validate_cap(vmx->pt_desc.caps,
1898                                         PT_CAP_single_range_output)))
1899                         return 1;
1900                 msr_info->data = vmx->pt_desc.guest.output_mask;
1901                 break;
1902         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
1903                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
1904                 if (!vmx_pt_mode_is_host_guest() ||
1905                     (index >= 2 * vmx->pt_desc.num_address_ranges))
1906                         return 1;
1907                 if (index % 2)
1908                         msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
1909                 else
1910                         msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
1911                 break;
1912         case MSR_IA32_DEBUGCTLMSR:
1913                 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
1914                 break;
1915         default:
1916         find_uret_msr:
1917                 msr = vmx_find_uret_msr(vmx, msr_info->index);
1918                 if (msr) {
1919                         msr_info->data = msr->data;
1920                         break;
1921                 }
1922                 return kvm_get_msr_common(vcpu, msr_info);
1923         }
1924
1925         return 0;
1926 }
1927
1928 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
1929                                                     u64 data)
1930 {
1931 #ifdef CONFIG_X86_64
1932         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
1933                 return (u32)data;
1934 #endif
1935         return (unsigned long)data;
1936 }
1937
1938 static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
1939 {
1940         u64 debugctl = vmx_supported_debugctl();
1941
1942         if (!intel_pmu_lbr_is_enabled(vcpu))
1943                 debugctl &= ~DEBUGCTLMSR_LBR_MASK;
1944
1945         if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
1946                 debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
1947
1948         return debugctl;
1949 }
1950
1951 /*
1952  * Writes msr value into the appropriate "register".
1953  * Returns 0 on success, non-0 otherwise.
1954  * Assumes vcpu_load() was already called.
1955  */
1956 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1957 {
1958         struct vcpu_vmx *vmx = to_vmx(vcpu);
1959         struct vmx_uret_msr *msr;
1960         int ret = 0;
1961         u32 msr_index = msr_info->index;
1962         u64 data = msr_info->data;
1963         u32 index;
1964
1965         switch (msr_index) {
1966         case MSR_EFER:
1967                 ret = kvm_set_msr_common(vcpu, msr_info);
1968                 break;
1969 #ifdef CONFIG_X86_64
1970         case MSR_FS_BASE:
1971                 vmx_segment_cache_clear(vmx);
1972                 vmcs_writel(GUEST_FS_BASE, data);
1973                 break;
1974         case MSR_GS_BASE:
1975                 vmx_segment_cache_clear(vmx);
1976                 vmcs_writel(GUEST_GS_BASE, data);
1977                 break;
1978         case MSR_KERNEL_GS_BASE:
1979                 vmx_write_guest_kernel_gs_base(vmx, data);
1980                 break;
1981         case MSR_IA32_XFD:
1982                 ret = kvm_set_msr_common(vcpu, msr_info);
1983                 /*
1984                  * Always intercepting WRMSR could incur non-negligible
1985                  * overhead given xfd might be changed frequently in
1986                  * guest context switch. Disable write interception
1987                  * upon the first write with a non-zero value (indicating
1988                  * potential usage on dynamic xfeatures). Also update
1989                  * exception bitmap to trap #NM for proper virtualization
1990                  * of guest xfd_err.
1991                  */
1992                 if (!ret && data) {
1993                         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
1994                                                       MSR_TYPE_RW);
1995                         vcpu->arch.xfd_no_write_intercept = true;
1996                         vmx_update_exception_bitmap(vcpu);
1997                 }
1998                 break;
1999 #endif
2000         case MSR_IA32_SYSENTER_CS:
2001                 if (is_guest_mode(vcpu))
2002                         get_vmcs12(vcpu)->guest_sysenter_cs = data;
2003                 vmcs_write32(GUEST_SYSENTER_CS, data);
2004                 break;
2005         case MSR_IA32_SYSENTER_EIP:
2006                 if (is_guest_mode(vcpu)) {
2007                         data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2008                         get_vmcs12(vcpu)->guest_sysenter_eip = data;
2009                 }
2010                 vmcs_writel(GUEST_SYSENTER_EIP, data);
2011                 break;
2012         case MSR_IA32_SYSENTER_ESP:
2013                 if (is_guest_mode(vcpu)) {
2014                         data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2015                         get_vmcs12(vcpu)->guest_sysenter_esp = data;
2016                 }
2017                 vmcs_writel(GUEST_SYSENTER_ESP, data);
2018                 break;
2019         case MSR_IA32_DEBUGCTLMSR: {
2020                 u64 invalid = data & ~vcpu_supported_debugctl(vcpu);
2021                 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
2022                         if (report_ignored_msrs)
2023                                 vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
2024                                             __func__, data);
2025                         data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2026                         invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2027                 }
2028
2029                 if (invalid)
2030                         return 1;
2031
2032                 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2033                                                 VM_EXIT_SAVE_DEBUG_CONTROLS)
2034                         get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2035
2036                 vmcs_write64(GUEST_IA32_DEBUGCTL, data);
2037                 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
2038                     (data & DEBUGCTLMSR_LBR))
2039                         intel_pmu_create_guest_lbr_event(vcpu);
2040                 return 0;
2041         }
2042         case MSR_IA32_BNDCFGS:
2043                 if (!kvm_mpx_supported() ||
2044                     (!msr_info->host_initiated &&
2045                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2046                         return 1;
2047                 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
2048                     (data & MSR_IA32_BNDCFGS_RSVD))
2049                         return 1;
2050                 vmcs_write64(GUEST_BNDCFGS, data);
2051                 break;
2052         case MSR_IA32_UMWAIT_CONTROL:
2053                 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2054                         return 1;
2055
2056                 /* The reserved bit 1 and non-32 bit [63:32] should be zero */
2057                 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2058                         return 1;
2059
2060                 vmx->msr_ia32_umwait_control = data;
2061                 break;
2062         case MSR_IA32_SPEC_CTRL:
2063                 if (!msr_info->host_initiated &&
2064                     !guest_has_spec_ctrl_msr(vcpu))
2065                         return 1;
2066
2067                 if (kvm_spec_ctrl_test_value(data))
2068                         return 1;
2069
2070                 vmx->spec_ctrl = data;
2071                 if (!data)
2072                         break;
2073
2074                 /*
2075                  * For non-nested:
2076                  * When it's written (to non-zero) for the first time, pass
2077                  * it through.
2078                  *
2079                  * For nested:
2080                  * The handling of the MSR bitmap for L2 guests is done in
2081                  * nested_vmx_prepare_msr_bitmap. We should not touch the
2082                  * vmcs02.msr_bitmap here since it gets completely overwritten
2083                  * in the merging. We update the vmcs01 here for L1 as well
2084                  * since it will end up touching the MSR anyway now.
2085                  */
2086                 vmx_disable_intercept_for_msr(vcpu,
2087                                               MSR_IA32_SPEC_CTRL,
2088                                               MSR_TYPE_RW);
2089                 break;
2090         case MSR_IA32_TSX_CTRL:
2091                 if (!msr_info->host_initiated &&
2092                     !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2093                         return 1;
2094                 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2095                         return 1;
2096                 goto find_uret_msr;
2097         case MSR_IA32_PRED_CMD:
2098                 if (!msr_info->host_initiated &&
2099                     !guest_has_pred_cmd_msr(vcpu))
2100                         return 1;
2101
2102                 if (data & ~PRED_CMD_IBPB)
2103                         return 1;
2104                 if (!boot_cpu_has(X86_FEATURE_IBPB))
2105                         return 1;
2106                 if (!data)
2107                         break;
2108
2109                 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2110
2111                 /*
2112                  * For non-nested:
2113                  * When it's written (to non-zero) for the first time, pass
2114                  * it through.
2115                  *
2116                  * For nested:
2117                  * The handling of the MSR bitmap for L2 guests is done in
2118                  * nested_vmx_prepare_msr_bitmap. We should not touch the
2119                  * vmcs02.msr_bitmap here since it gets completely overwritten
2120                  * in the merging.
2121                  */
2122                 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
2123                 break;
2124         case MSR_IA32_CR_PAT:
2125                 if (!kvm_pat_valid(data))
2126                         return 1;
2127
2128                 if (is_guest_mode(vcpu) &&
2129                     get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2130                         get_vmcs12(vcpu)->guest_ia32_pat = data;
2131
2132                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2133                         vmcs_write64(GUEST_IA32_PAT, data);
2134                         vcpu->arch.pat = data;
2135                         break;
2136                 }
2137                 ret = kvm_set_msr_common(vcpu, msr_info);
2138                 break;
2139         case MSR_IA32_MCG_EXT_CTL:
2140                 if ((!msr_info->host_initiated &&
2141                      !(to_vmx(vcpu)->msr_ia32_feature_control &
2142                        FEAT_CTL_LMCE_ENABLED)) ||
2143                     (data & ~MCG_EXT_CTL_LMCE_EN))
2144                         return 1;
2145                 vcpu->arch.mcg_ext_ctl = data;
2146                 break;
2147         case MSR_IA32_FEAT_CTL:
2148                 if (!vmx_feature_control_msr_valid(vcpu, data) ||
2149                     (to_vmx(vcpu)->msr_ia32_feature_control &
2150                      FEAT_CTL_LOCKED && !msr_info->host_initiated))
2151                         return 1;
2152                 vmx->msr_ia32_feature_control = data;
2153                 if (msr_info->host_initiated && data == 0)
2154                         vmx_leave_nested(vcpu);
2155
2156                 /* SGX may be enabled/disabled by guest's firmware */
2157                 vmx_write_encls_bitmap(vcpu, NULL);
2158                 break;
2159         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2160                 /*
2161                  * On real hardware, the LE hash MSRs are writable before
2162                  * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
2163                  * at which point SGX related bits in IA32_FEATURE_CONTROL
2164                  * become writable.
2165                  *
2166                  * KVM does not emulate SGX activation for simplicity, so
2167                  * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
2168                  * is unlocked.  This is technically not architectural
2169                  * behavior, but it's close enough.
2170                  */
2171                 if (!msr_info->host_initiated &&
2172                     (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
2173                     ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
2174                     !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
2175                         return 1;
2176                 vmx->msr_ia32_sgxlepubkeyhash
2177                         [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
2178                 break;
2179         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2180                 if (!msr_info->host_initiated)
2181                         return 1; /* they are read-only */
2182                 if (!nested_vmx_allowed(vcpu))
2183                         return 1;
2184                 return vmx_set_vmx_msr(vcpu, msr_index, data);
2185         case MSR_IA32_RTIT_CTL:
2186                 if (!vmx_pt_mode_is_host_guest() ||
2187                         vmx_rtit_ctl_check(vcpu, data) ||
2188                         vmx->nested.vmxon)
2189                         return 1;
2190                 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2191                 vmx->pt_desc.guest.ctl = data;
2192                 pt_update_intercept_for_msr(vcpu);
2193                 break;
2194         case MSR_IA32_RTIT_STATUS:
2195                 if (!pt_can_write_msr(vmx))
2196                         return 1;
2197                 if (data & MSR_IA32_RTIT_STATUS_MASK)
2198                         return 1;
2199                 vmx->pt_desc.guest.status = data;
2200                 break;
2201         case MSR_IA32_RTIT_CR3_MATCH:
2202                 if (!pt_can_write_msr(vmx))
2203                         return 1;
2204                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2205                                            PT_CAP_cr3_filtering))
2206                         return 1;
2207                 vmx->pt_desc.guest.cr3_match = data;
2208                 break;
2209         case MSR_IA32_RTIT_OUTPUT_BASE:
2210                 if (!pt_can_write_msr(vmx))
2211                         return 1;
2212                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2213                                            PT_CAP_topa_output) &&
2214                     !intel_pt_validate_cap(vmx->pt_desc.caps,
2215                                            PT_CAP_single_range_output))
2216                         return 1;
2217                 if (!pt_output_base_valid(vcpu, data))
2218                         return 1;
2219                 vmx->pt_desc.guest.output_base = data;
2220                 break;
2221         case MSR_IA32_RTIT_OUTPUT_MASK:
2222                 if (!pt_can_write_msr(vmx))
2223                         return 1;
2224                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2225                                            PT_CAP_topa_output) &&
2226                     !intel_pt_validate_cap(vmx->pt_desc.caps,
2227                                            PT_CAP_single_range_output))
2228                         return 1;
2229                 vmx->pt_desc.guest.output_mask = data;
2230                 break;
2231         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2232                 if (!pt_can_write_msr(vmx))
2233                         return 1;
2234                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2235                 if (index >= 2 * vmx->pt_desc.num_address_ranges)
2236                         return 1;
2237                 if (is_noncanonical_address(data, vcpu))
2238                         return 1;
2239                 if (index % 2)
2240                         vmx->pt_desc.guest.addr_b[index / 2] = data;
2241                 else
2242                         vmx->pt_desc.guest.addr_a[index / 2] = data;
2243                 break;
2244         case MSR_IA32_PERF_CAPABILITIES:
2245                 if (data && !vcpu_to_pmu(vcpu)->version)
2246                         return 1;
2247                 if (data & PMU_CAP_LBR_FMT) {
2248                         if ((data & PMU_CAP_LBR_FMT) !=
2249                             (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
2250                                 return 1;
2251                         if (!intel_pmu_lbr_is_compatible(vcpu))
2252                                 return 1;
2253                 }
2254                 ret = kvm_set_msr_common(vcpu, msr_info);
2255                 break;
2256
2257         default:
2258         find_uret_msr:
2259                 msr = vmx_find_uret_msr(vmx, msr_index);
2260                 if (msr)
2261                         ret = vmx_set_guest_uret_msr(vmx, msr, data);
2262                 else
2263                         ret = kvm_set_msr_common(vcpu, msr_info);
2264         }
2265
2266         return ret;
2267 }
2268
2269 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2270 {
2271         unsigned long guest_owned_bits;
2272
2273         kvm_register_mark_available(vcpu, reg);
2274
2275         switch (reg) {
2276         case VCPU_REGS_RSP:
2277                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2278                 break;
2279         case VCPU_REGS_RIP:
2280                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2281                 break;
2282         case VCPU_EXREG_PDPTR:
2283                 if (enable_ept)
2284                         ept_save_pdptrs(vcpu);
2285                 break;
2286         case VCPU_EXREG_CR0:
2287                 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2288
2289                 vcpu->arch.cr0 &= ~guest_owned_bits;
2290                 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2291                 break;
2292         case VCPU_EXREG_CR3:
2293                 /*
2294                  * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
2295                  * CR3 is loaded into hardware, not the guest's CR3.
2296                  */
2297                 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
2298                         vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2299                 break;
2300         case VCPU_EXREG_CR4:
2301                 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2302
2303                 vcpu->arch.cr4 &= ~guest_owned_bits;
2304                 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2305                 break;
2306         default:
2307                 KVM_BUG_ON(1, vcpu->kvm);
2308                 break;
2309         }
2310 }
2311
2312 static __init int cpu_has_kvm_support(void)
2313 {
2314         return cpu_has_vmx();
2315 }
2316
2317 static __init int vmx_disabled_by_bios(void)
2318 {
2319         return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2320                !boot_cpu_has(X86_FEATURE_VMX);
2321 }
2322
2323 static int kvm_cpu_vmxon(u64 vmxon_pointer)
2324 {
2325         u64 msr;
2326
2327         cr4_set_bits(X86_CR4_VMXE);
2328
2329         asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
2330                           _ASM_EXTABLE(1b, %l[fault])
2331                           : : [vmxon_pointer] "m"(vmxon_pointer)
2332                           : : fault);
2333         return 0;
2334
2335 fault:
2336         WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2337                   rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
2338         cr4_clear_bits(X86_CR4_VMXE);
2339
2340         return -EFAULT;
2341 }
2342
2343 static int hardware_enable(void)
2344 {
2345         int cpu = raw_smp_processor_id();
2346         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2347         int r;
2348
2349         if (cr4_read_shadow() & X86_CR4_VMXE)
2350                 return -EBUSY;
2351
2352         /*
2353          * This can happen if we hot-added a CPU but failed to allocate
2354          * VP assist page for it.
2355          */
2356         if (static_branch_unlikely(&enable_evmcs) &&
2357             !hv_get_vp_assist_page(cpu))
2358                 return -EFAULT;
2359
2360         intel_pt_handle_vmx(1);
2361
2362         r = kvm_cpu_vmxon(phys_addr);
2363         if (r) {
2364                 intel_pt_handle_vmx(0);
2365                 return r;
2366         }
2367
2368         if (enable_ept)
2369                 ept_sync_global();
2370
2371         return 0;
2372 }
2373
2374 static void vmclear_local_loaded_vmcss(void)
2375 {
2376         int cpu = raw_smp_processor_id();
2377         struct loaded_vmcs *v, *n;
2378
2379         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2380                                  loaded_vmcss_on_cpu_link)
2381                 __loaded_vmcs_clear(v);
2382 }
2383
/*
 * Counterpart of hardware_enable(): clear all VMCSs loaded on this CPU,
 * execute VMXOFF (reporting a spurious fault if cpu_vmxoff() returns
 * non-zero) and finally call intel_pt_handle_vmx(0).
 */
static void hardware_disable(void)
{
        int vmxoff_failed;

        vmclear_local_loaded_vmcss();

        vmxoff_failed = cpu_vmxoff();
        if (vmxoff_failed)
                kvm_spurious_fault();

        intel_pt_handle_vmx(0);
}
2393
2394 /*
2395  * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2396  * directly instead of going through cpu_has(), to ensure KVM is trapping
2397  * ENCLS whenever it's supported in hardware.  It does not matter whether
2398  * the host OS supports or has enabled SGX.
2399  */
2400 static bool cpu_has_sgx(void)
2401 {
2402         return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2403 }
2404
2405 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
2406                                       u32 msr, u32 *result)
2407 {
2408         u32 vmx_msr_low, vmx_msr_high;
2409         u32 ctl = ctl_min | ctl_opt;
2410
2411         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2412
2413         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2414         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
2415
2416         /* Ensure minimum (required) set of control bits are supported. */
2417         if (ctl_min & ~ctl)
2418                 return -EIO;
2419
2420         *result = ctl;
2421         return 0;
2422 }
2423
2424 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2425                                     struct vmx_capability *vmx_cap)
2426 {
2427         u32 vmx_msr_low, vmx_msr_high;
2428         u32 min, opt, min2, opt2;
2429         u32 _pin_based_exec_control = 0;
2430         u32 _cpu_based_exec_control = 0;
2431         u32 _cpu_based_2nd_exec_control = 0;
2432         u32 _vmexit_control = 0;
2433         u32 _vmentry_control = 0;
2434
2435         memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2436         min = CPU_BASED_HLT_EXITING |
2437 #ifdef CONFIG_X86_64
2438               CPU_BASED_CR8_LOAD_EXITING |
2439               CPU_BASED_CR8_STORE_EXITING |
2440 #endif
2441               CPU_BASED_CR3_LOAD_EXITING |
2442               CPU_BASED_CR3_STORE_EXITING |
2443               CPU_BASED_UNCOND_IO_EXITING |
2444               CPU_BASED_MOV_DR_EXITING |
2445               CPU_BASED_USE_TSC_OFFSETTING |
2446               CPU_BASED_MWAIT_EXITING |
2447               CPU_BASED_MONITOR_EXITING |
2448               CPU_BASED_INVLPG_EXITING |
2449               CPU_BASED_RDPMC_EXITING;
2450
2451         opt = CPU_BASED_TPR_SHADOW |
2452               CPU_BASED_USE_MSR_BITMAPS |
2453               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2454         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
2455                                 &_cpu_based_exec_control) < 0)
2456                 return -EIO;
2457 #ifdef CONFIG_X86_64
2458         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2459                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
2460                                            ~CPU_BASED_CR8_STORE_EXITING;
2461 #endif
2462         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2463                 min2 = 0;
2464                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2465                         SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2466                         SECONDARY_EXEC_WBINVD_EXITING |
2467                         SECONDARY_EXEC_ENABLE_VPID |
2468                         SECONDARY_EXEC_ENABLE_EPT |
2469                         SECONDARY_EXEC_UNRESTRICTED_GUEST |
2470                         SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2471                         SECONDARY_EXEC_DESC |
2472                         SECONDARY_EXEC_ENABLE_RDTSCP |
2473                         SECONDARY_EXEC_ENABLE_INVPCID |
2474                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
2475                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2476                         SECONDARY_EXEC_SHADOW_VMCS |
2477                         SECONDARY_EXEC_XSAVES |
2478                         SECONDARY_EXEC_RDSEED_EXITING |
2479                         SECONDARY_EXEC_RDRAND_EXITING |
2480                         SECONDARY_EXEC_ENABLE_PML |
2481                         SECONDARY_EXEC_TSC_SCALING |
2482                         SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2483                         SECONDARY_EXEC_PT_USE_GPA |
2484                         SECONDARY_EXEC_PT_CONCEAL_VMX |
2485                         SECONDARY_EXEC_ENABLE_VMFUNC |
2486                         SECONDARY_EXEC_BUS_LOCK_DETECTION;
2487                 if (cpu_has_sgx())
2488                         opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
2489                 if (adjust_vmx_controls(min2, opt2,
2490                                         MSR_IA32_VMX_PROCBASED_CTLS2,
2491                                         &_cpu_based_2nd_exec_control) < 0)
2492                         return -EIO;
2493         }
2494 #ifndef CONFIG_X86_64
2495         if (!(_cpu_based_2nd_exec_control &
2496                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2497                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2498 #endif
2499
2500         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2501                 _cpu_based_2nd_exec_control &= ~(
2502                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2503                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2504                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2505
2506         rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2507                 &vmx_cap->ept, &vmx_cap->vpid);
2508
2509         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
2510                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
2511                    enabled */
2512                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
2513                                              CPU_BASED_CR3_STORE_EXITING |
2514                                              CPU_BASED_INVLPG_EXITING);
2515         } else if (vmx_cap->ept) {
2516                 vmx_cap->ept = 0;
2517                 pr_warn_once("EPT CAP should not exist if not support "
2518                                 "1-setting enable EPT VM-execution control\n");
2519         }
2520         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2521                 vmx_cap->vpid) {
2522                 vmx_cap->vpid = 0;
2523                 pr_warn_once("VPID CAP should not exist if not support "
2524                                 "1-setting enable VPID VM-execution control\n");
2525         }
2526
2527         min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
2528 #ifdef CONFIG_X86_64
2529         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2530 #endif
2531         opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
2532               VM_EXIT_LOAD_IA32_PAT |
2533               VM_EXIT_LOAD_IA32_EFER |
2534               VM_EXIT_CLEAR_BNDCFGS |
2535               VM_EXIT_PT_CONCEAL_PIP |
2536               VM_EXIT_CLEAR_IA32_RTIT_CTL;
2537         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2538                                 &_vmexit_control) < 0)
2539                 return -EIO;
2540
2541         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2542         opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
2543                  PIN_BASED_VMX_PREEMPTION_TIMER;
2544         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2545                                 &_pin_based_exec_control) < 0)
2546                 return -EIO;
2547
2548         if (cpu_has_broken_vmx_preemption_timer())
2549                 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2550         if (!(_cpu_based_2nd_exec_control &
2551                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2552                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2553
2554         min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
2555         opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
2556               VM_ENTRY_LOAD_IA32_PAT |
2557               VM_ENTRY_LOAD_IA32_EFER |
2558               VM_ENTRY_LOAD_BNDCFGS |
2559               VM_ENTRY_PT_CONCEAL_PIP |
2560               VM_ENTRY_LOAD_IA32_RTIT_CTL;
2561         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
2562                                 &_vmentry_control) < 0)
2563                 return -EIO;
2564
2565         /*
2566          * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2567          * can't be used due to an errata where VM Exit may incorrectly clear
2568          * IA32_PERF_GLOBAL_CTRL[34:32].  Workaround the errata by using the
2569          * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2570          */
2571         if (boot_cpu_data.x86 == 0x6) {
2572                 switch (boot_cpu_data.x86_model) {
2573                 case 26: /* AAK155 */
2574                 case 30: /* AAP115 */
2575                 case 37: /* AAT100 */
2576                 case 44: /* BC86,AAY89,BD102 */
2577                 case 46: /* BA97 */
2578                         _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
2579                         _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
2580                         pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2581                                         "does not work properly. Using workaround\n");
2582                         break;
2583                 default:
2584                         break;
2585                 }
2586         }
2587
2588
2589         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2590
2591         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2592         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2593                 return -EIO;
2594
2595 #ifdef CONFIG_X86_64
2596         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2597         if (vmx_msr_high & (1u<<16))
2598                 return -EIO;
2599 #endif
2600
2601         /* Require Write-Back (WB) memory type for VMCS accesses. */
2602         if (((vmx_msr_high >> 18) & 15) != 6)
2603                 return -EIO;
2604
2605         vmcs_conf->size = vmx_msr_high & 0x1fff;
2606         vmcs_conf->order = get_order(vmcs_conf->size);
2607         vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
2608
2609         vmcs_conf->revision_id = vmx_msr_low;
2610
2611         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2612         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2613         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2614         vmcs_conf->vmexit_ctrl         = _vmexit_control;
2615         vmcs_conf->vmentry_ctrl        = _vmentry_control;
2616
2617 #if IS_ENABLED(CONFIG_HYPERV)
2618         if (enlightened_vmcs)
2619                 evmcs_sanitize_exec_ctrls(vmcs_conf);
2620 #endif
2621
2622         return 0;
2623 }
2624
2625 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
2626 {
2627         int node = cpu_to_node(cpu);
2628         struct page *pages;
2629         struct vmcs *vmcs;
2630
2631         pages = __alloc_pages_node(node, flags, vmcs_config.order);
2632         if (!pages)
2633                 return NULL;
2634         vmcs = page_address(pages);
2635         memset(vmcs, 0, vmcs_config.size);
2636
2637         /* KVM supports Enlightened VMCS v1 only */
2638         if (static_branch_unlikely(&enable_evmcs))
2639                 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2640         else
2641                 vmcs->hdr.revision_id = vmcs_config.revision_id;
2642
2643         if (shadow)
2644                 vmcs->hdr.shadow_vmcs = 1;
2645         return vmcs;
2646 }
2647
2648 void free_vmcs(struct vmcs *vmcs)
2649 {
2650         free_pages((unsigned long)vmcs, vmcs_config.order);
2651 }
2652
2653 /*
2654  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2655  */
2656 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2657 {
2658         if (!loaded_vmcs->vmcs)
2659                 return;
2660         loaded_vmcs_clear(loaded_vmcs);
2661         free_vmcs(loaded_vmcs->vmcs);
2662         loaded_vmcs->vmcs = NULL;
2663         if (loaded_vmcs->msr_bitmap)
2664                 free_page((unsigned long)loaded_vmcs->msr_bitmap);
2665         WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2666 }
2667
2668 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2669 {
2670         loaded_vmcs->vmcs = alloc_vmcs(false);
2671         if (!loaded_vmcs->vmcs)
2672                 return -ENOMEM;
2673
2674         vmcs_clear(loaded_vmcs->vmcs);
2675
2676         loaded_vmcs->shadow_vmcs = NULL;
2677         loaded_vmcs->hv_timer_soft_disabled = false;
2678         loaded_vmcs->cpu = -1;
2679         loaded_vmcs->launched = 0;
2680
2681         if (cpu_has_vmx_msr_bitmap()) {
2682                 loaded_vmcs->msr_bitmap = (unsigned long *)
2683                                 __get_free_page(GFP_KERNEL_ACCOUNT);
2684                 if (!loaded_vmcs->msr_bitmap)
2685                         goto out_vmcs;
2686                 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2687         }
2688
2689         memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
2690         memset(&loaded_vmcs->controls_shadow, 0,
2691                 sizeof(struct vmcs_controls_shadow));
2692
2693         return 0;
2694
2695 out_vmcs:
2696         free_loaded_vmcs(loaded_vmcs);
2697         return -ENOMEM;
2698 }
2699
2700 static void free_kvm_area(void)
2701 {
2702         int cpu;
2703
2704         for_each_possible_cpu(cpu) {
2705                 free_vmcs(per_cpu(vmxarea, cpu));
2706                 per_cpu(vmxarea, cpu) = NULL;
2707         }
2708 }
2709
2710 static __init int alloc_kvm_area(void)
2711 {
2712         int cpu;
2713
2714         for_each_possible_cpu(cpu) {
2715                 struct vmcs *vmcs;
2716
2717                 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
2718                 if (!vmcs) {
2719                         free_kvm_area();
2720                         return -ENOMEM;
2721                 }
2722
2723                 /*
2724                  * When eVMCS is enabled, alloc_vmcs_cpu() sets
2725                  * vmcs->revision_id to KVM_EVMCS_VERSION instead of
2726                  * revision_id reported by MSR_IA32_VMX_BASIC.
2727                  *
2728                  * However, even though not explicitly documented by
2729                  * TLFS, VMXArea passed as VMXON argument should
2730                  * still be marked with revision_id reported by
2731                  * physical CPU.
2732                  */
2733                 if (static_branch_unlikely(&enable_evmcs))
2734                         vmcs->hdr.revision_id = vmcs_config.revision_id;
2735
2736                 per_cpu(vmxarea, cpu) = vmcs;
2737         }
2738         return 0;
2739 }
2740
2741 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
2742                 struct kvm_segment *save)
2743 {
2744         if (!emulate_invalid_guest_state) {
2745                 /*
2746                  * CS and SS RPL should be equal during guest entry according
2747                  * to VMX spec, but in reality it is not always so. Since vcpu
2748                  * is in the middle of the transition from real mode to
2749                  * protected mode it is safe to assume that RPL 0 is a good
2750                  * default value.
2751                  */
2752                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
2753                         save->selector &= ~SEGMENT_RPL_MASK;
2754                 save->dpl = save->selector & SEGMENT_RPL_MASK;
2755                 save->s = 1;
2756         }
2757         __vmx_set_segment(vcpu, save, seg);
2758 }
2759
2760 static void enter_pmode(struct kvm_vcpu *vcpu)
2761 {
2762         unsigned long flags;
2763         struct vcpu_vmx *vmx = to_vmx(vcpu);
2764
2765         /*
2766          * Update real mode segment cache. It may be not up-to-date if segment
2767          * register was written while vcpu was in a guest mode.
2768          */
2769         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2770         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2771         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2772         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2773         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2774         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2775
2776         vmx->rmode.vm86_active = 0;
2777
2778         __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2779
2780         flags = vmcs_readl(GUEST_RFLAGS);
2781         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2782         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2783         vmcs_writel(GUEST_RFLAGS, flags);
2784
2785         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
2786                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
2787
2788         vmx_update_exception_bitmap(vcpu);
2789
2790         fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2791         fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2792         fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2793         fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2794         fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2795         fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2796 }
2797
2798 static void fix_rmode_seg(int seg, struct kvm_segment *save)
2799 {
2800         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2801         struct kvm_segment var = *save;
2802
2803         var.dpl = 0x3;
2804         if (seg == VCPU_SREG_CS)
2805                 var.type = 0x3;
2806
2807         if (!emulate_invalid_guest_state) {
2808                 var.selector = var.base >> 4;
2809                 var.base = var.base & 0xffff0;
2810                 var.limit = 0xffff;
2811                 var.g = 0;
2812                 var.db = 0;
2813                 var.present = 1;
2814                 var.s = 1;
2815                 var.l = 0;
2816                 var.unusable = 0;
2817                 var.type = 0x3;
2818                 var.avl = 0;
2819                 if (save->base & 0xf)
2820                         printk_once(KERN_WARNING "kvm: segment base is not "
2821                                         "paragraph aligned when entering "
2822                                         "protected mode (seg=%d)", seg);
2823         }
2824
2825         vmcs_write16(sf->selector, var.selector);
2826         vmcs_writel(sf->base, var.base);
2827         vmcs_write32(sf->limit, var.limit);
2828         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
2829 }
2830
2831 static void enter_rmode(struct kvm_vcpu *vcpu)
2832 {
2833         unsigned long flags;
2834         struct vcpu_vmx *vmx = to_vmx(vcpu);
2835         struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
2836
2837         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2838         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2839         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2840         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2841         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2842         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2843         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2844
2845         vmx->rmode.vm86_active = 1;
2846
2847         /*
2848          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2849          * vcpu. Warn the user that an update is overdue.
2850          */
2851         if (!kvm_vmx->tss_addr)
2852                 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
2853                              "called before entering vcpu\n");
2854
2855         vmx_segment_cache_clear(vmx);
2856
2857         vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
2858         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2859         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2860
2861         flags = vmcs_readl(GUEST_RFLAGS);
2862         vmx->rmode.save_rflags = flags;
2863
2864         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2865
2866         vmcs_writel(GUEST_RFLAGS, flags);
2867         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
2868         vmx_update_exception_bitmap(vcpu);
2869
2870         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2871         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2872         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2873         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2874         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2875         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2876 }
2877
2878 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
2879 {
2880         struct vcpu_vmx *vmx = to_vmx(vcpu);
2881         struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER);
2882
2883         /* Nothing to do if hardware doesn't support EFER. */
2884         if (!msr)
2885                 return 0;
2886
2887         vcpu->arch.efer = efer;
2888         if (efer & EFER_LMA) {
2889                 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2890                 msr->data = efer;
2891         } else {
2892                 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2893
2894                 msr->data = efer & ~EFER_LME;
2895         }
2896         vmx_setup_uret_msrs(vmx);
2897         return 0;
2898 }
2899
2900 #ifdef CONFIG_X86_64
2901
2902 static void enter_lmode(struct kvm_vcpu *vcpu)
2903 {
2904         u32 guest_tr_ar;
2905
2906         vmx_segment_cache_clear(to_vmx(vcpu));
2907
2908         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
2909         if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
2910                 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
2911                                      __func__);
2912                 vmcs_write32(GUEST_TR_AR_BYTES,
2913                              (guest_tr_ar & ~VMX_AR_TYPE_MASK)
2914                              | VMX_AR_TYPE_BUSY_64_TSS);
2915         }
2916         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
2917 }
2918
2919 static void exit_lmode(struct kvm_vcpu *vcpu)
2920 {
2921         vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2922         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
2923 }
2924
2925 #endif
2926
2927 static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
2928 {
2929         struct vcpu_vmx *vmx = to_vmx(vcpu);
2930
2931         /*
2932          * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
2933          * the CPU is not required to invalidate guest-physical mappings on
2934          * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
2935          * associated with the root EPT structure and not any particular VPID
2936          * (INVVPID also isn't required to invalidate guest-physical mappings).
2937          */
2938         if (enable_ept) {
2939                 ept_sync_global();
2940         } else if (enable_vpid) {
2941                 if (cpu_has_vmx_invvpid_global()) {
2942                         vpid_sync_vcpu_global();
2943                 } else {
2944                         vpid_sync_vcpu_single(vmx->vpid);
2945                         vpid_sync_vcpu_single(vmx->nested.vpid02);
2946                 }
2947         }
2948 }
2949
2950 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
2951 {
2952         if (is_guest_mode(vcpu))
2953                 return nested_get_vpid02(vcpu);
2954         return to_vmx(vcpu)->vpid;
2955 }
2956
2957 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
2958 {
2959         struct kvm_mmu *mmu = vcpu->arch.mmu;
2960         u64 root_hpa = mmu->root_hpa;
2961
2962         /* No flush required if the current context is invalid. */
2963         if (!VALID_PAGE(root_hpa))
2964                 return;
2965
2966         if (enable_ept)
2967                 ept_sync_context(construct_eptp(vcpu, root_hpa,
2968                                                 mmu->shadow_root_level));
2969         else
2970                 vpid_sync_context(vmx_get_current_vpid(vcpu));
2971 }
2972
2973 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
2974 {
2975         /*
2976          * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
2977          * vmx_flush_tlb_guest() for an explanation of why this is ok.
2978          */
2979         vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
2980 }
2981
/* Flush the guest's linear mappings tagged with the current VPID. */
static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
        int vpid = vmx_get_current_vpid(vcpu);

        /*
         * No explicit INVVPID is needed when vpid==0 (enable_vpid==0, or no
         * vpid could be allocated for this vCPU), and vpid_sync_context() is
         * a nop in that case: with vpid disabled, VM-Enter and VM-Exit are
         * required to flush GVA->{G,H}PA mappings from the TLB, and VM-Enter
         * with vpid enabled and vpid==0 is disallowed.
         */
        vpid_sync_context(vpid);
}
2993
2994 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
2995 {
2996         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
2997
2998         if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
2999                 return;
3000
3001         if (is_pae_paging(vcpu)) {
3002                 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3003                 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3004                 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3005                 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3006         }
3007 }
3008
3009 void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3010 {
3011         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3012
3013         if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3014                 return;
3015
3016         mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3017         mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3018         mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3019         mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3020
3021         kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
3022 }
3023
/* Both primary execution controls that intercept guest CR3 accesses. */
#define CR3_EXITING_BITS \
        (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)
3026
3027 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3028 {
3029         struct vcpu_vmx *vmx = to_vmx(vcpu);
3030         unsigned long hw_cr0, old_cr0_pg;
3031         u32 tmp;
3032
3033         old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
3034
3035         hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3036         if (is_unrestricted_guest(vcpu))
3037                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3038         else {
3039                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3040                 if (!enable_ept)
3041                         hw_cr0 |= X86_CR0_WP;
3042
3043                 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3044                         enter_pmode(vcpu);
3045
3046                 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3047                         enter_rmode(vcpu);
3048         }
3049
3050         vmcs_writel(CR0_READ_SHADOW, cr0);
3051         vmcs_writel(GUEST_CR0, hw_cr0);
3052         vcpu->arch.cr0 = cr0;
3053         kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
3054
3055 #ifdef CONFIG_X86_64
3056         if (vcpu->arch.efer & EFER_LME) {
3057                 if (!old_cr0_pg && (cr0 & X86_CR0_PG))
3058                         enter_lmode(vcpu);
3059                 else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
3060                         exit_lmode(vcpu);
3061         }
3062 #endif
3063
3064         if (enable_ept && !is_unrestricted_guest(vcpu)) {
3065                 /*
3066                  * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
3067                  * the below code _enables_ CR3 exiting, vmx_cache_reg() will
3068                  * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
3069                  * KVM's CR3 is installed.
3070                  */
3071                 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3072                         vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
3073
3074                 /*
3075                  * When running with EPT but not unrestricted guest, KVM must
3076                  * intercept CR3 accesses when paging is _disabled_.  This is
3077                  * necessary because restricted guests can't actually run with
3078                  * paging disabled, and so KVM stuffs its own CR3 in order to
3079                  * run the guest when identity mapped page tables.
3080                  *
3081                  * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
3082                  * update, it may be stale with respect to CR3 interception,
3083                  * e.g. after nested VM-Enter.
3084                  *
3085                  * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
3086                  * stores to forward them to L1, even if KVM does not need to
3087                  * intercept them to preserve its identity mapped page tables.
3088                  */
3089                 if (!(cr0 & X86_CR0_PG)) {
3090                         exec_controls_setbit(vmx, CR3_EXITING_BITS);
3091                 } else if (!is_guest_mode(vcpu)) {
3092                         exec_controls_clearbit(vmx, CR3_EXITING_BITS);
3093                 } else {
3094                         tmp = exec_controls_get(vmx);
3095                         tmp &= ~CR3_EXITING_BITS;
3096                         tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
3097                         exec_controls_set(vmx, tmp);
3098                 }
3099
3100                 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
3101                 if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
3102                         vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3103
3104                 /*
3105                  * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
3106                  * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
3107                  */
3108                 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
3109                         kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
3110         }
3111
3112         /* depends on vcpu->arch.cr0 to be set to a new value */
3113         vmx->emulation_required = vmx_emulation_required(vcpu);
3114 }
3115
/*
 * Return the maximum TDP (EPT) paging depth: 5 when
 * cpu_has_vmx_ept_5levels() reports support, otherwise 4.
 */
static int vmx_get_max_tdp_level(void)
{
	return cpu_has_vmx_ept_5levels() ? 5 : 4;
}
3122
3123 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
3124 {
3125         u64 eptp = VMX_EPTP_MT_WB;
3126
3127         eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3128
3129         if (enable_ept_ad_bits &&
3130             (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
3131                 eptp |= VMX_EPTP_AD_ENABLE_BIT;
3132         eptp |= root_hpa;
3133
3134         return eptp;
3135 }
3136
3137 static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
3138                              int root_level)
3139 {
3140         struct kvm *kvm = vcpu->kvm;
3141         bool update_guest_cr3 = true;
3142         unsigned long guest_cr3;
3143         u64 eptp;
3144
3145         if (enable_ept) {
3146                 eptp = construct_eptp(vcpu, root_hpa, root_level);
3147                 vmcs_write64(EPT_POINTER, eptp);
3148
3149                 hv_track_root_tdp(vcpu, root_hpa);
3150
3151                 if (!enable_unrestricted_guest && !is_paging(vcpu))
3152                         guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3153                 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
3154                         guest_cr3 = vcpu->arch.cr3;
3155                 else /* vmcs.GUEST_CR3 is already up-to-date. */
3156                         update_guest_cr3 = false;
3157                 vmx_ept_load_pdptrs(vcpu);
3158         } else {
3159                 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
3160         }
3161
3162         if (update_guest_cr3)
3163                 vmcs_writel(GUEST_CR3, guest_cr3);
3164 }
3165
3166
3167 static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3168 {
3169         /*
3170          * We operate under the default treatment of SMM, so VMX cannot be
3171          * enabled under SMM.  Note, whether or not VMXE is allowed at all is
3172          * handled by kvm_is_valid_cr4().
3173          */
3174         if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3175                 return false;
3176
3177         if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3178                 return false;
3179
3180         return true;
3181 }
3182
3183 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3184 {
3185         unsigned long old_cr4 = vcpu->arch.cr4;
3186         struct vcpu_vmx *vmx = to_vmx(vcpu);
3187         /*
3188          * Pass through host's Machine Check Enable value to hw_cr4, which
3189          * is in force while we are in guest mode.  Do not let guests control
3190          * this bit, even if host CR4.MCE == 0.
3191          */
3192         unsigned long hw_cr4;
3193
3194         hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3195         if (is_unrestricted_guest(vcpu))
3196                 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3197         else if (vmx->rmode.vm86_active)
3198                 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3199         else
3200                 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
3201
3202         if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
3203                 if (cr4 & X86_CR4_UMIP) {
3204                         secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3205                         hw_cr4 &= ~X86_CR4_UMIP;
3206                 } else if (!is_guest_mode(vcpu) ||
3207                         !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3208                         secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3209                 }
3210         }
3211
3212         vcpu->arch.cr4 = cr4;
3213         kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
3214
3215         if (!is_unrestricted_guest(vcpu)) {
3216                 if (enable_ept) {
3217                         if (!is_paging(vcpu)) {
3218                                 hw_cr4 &= ~X86_CR4_PAE;
3219                                 hw_cr4 |= X86_CR4_PSE;
3220                         } else if (!(cr4 & X86_CR4_PAE)) {
3221                                 hw_cr4 &= ~X86_CR4_PAE;
3222                         }
3223                 }
3224
3225                 /*
3226                  * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3227                  * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
3228                  * to be manually disabled when guest switches to non-paging
3229                  * mode.
3230                  *
3231                  * If !enable_unrestricted_guest, the CPU is always running
3232                  * with CR0.PG=1 and CR4 needs to be modified.
3233                  * If enable_unrestricted_guest, the CPU automatically
3234                  * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3235                  */
3236                 if (!is_paging(vcpu))
3237                         hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3238         }
3239
3240         vmcs_writel(CR4_READ_SHADOW, cr4);
3241         vmcs_writel(GUEST_CR4, hw_cr4);
3242
3243         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
3244                 kvm_update_cpuid_runtime(vcpu);
3245 }
3246
3247 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3248 {
3249         struct vcpu_vmx *vmx = to_vmx(vcpu);
3250         u32 ar;
3251
3252         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3253                 *var = vmx->rmode.segs[seg];
3254                 if (seg == VCPU_SREG_TR
3255                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3256                         return;
3257                 var->base = vmx_read_guest_seg_base(vmx, seg);
3258                 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3259                 return;
3260         }
3261         var->base = vmx_read_guest_seg_base(vmx, seg);
3262         var->limit = vmx_read_guest_seg_limit(vmx, seg);
3263         var->selector = vmx_read_guest_seg_selector(vmx, seg);
3264         ar = vmx_read_guest_seg_ar(vmx, seg);
3265         var->unusable = (ar >> 16) & 1;
3266         var->type = ar & 15;
3267         var->s = (ar >> 4) & 1;
3268         var->dpl = (ar >> 5) & 3;
3269         /*
3270          * Some userspaces do not preserve unusable property. Since usable
3271          * segment has to be present according to VMX spec we can use present
3272          * property to amend userspace bug by making unusable segment always
3273          * nonpresent. vmx_segment_access_rights() already marks nonpresent
3274          * segment as unusable.
3275          */
3276         var->present = !var->unusable;
3277         var->avl = (ar >> 12) & 1;
3278         var->l = (ar >> 13) & 1;
3279         var->db = (ar >> 14) & 1;
3280         var->g = (ar >> 15) & 1;
3281 }
3282
3283 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3284 {
3285         struct kvm_segment s;
3286
3287         if (to_vmx(vcpu)->rmode.vm86_active) {
3288                 vmx_get_segment(vcpu, &s, seg);
3289                 return s.base;
3290         }
3291         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3292 }
3293
3294 int vmx_get_cpl(struct kvm_vcpu *vcpu)
3295 {
3296         struct vcpu_vmx *vmx = to_vmx(vcpu);
3297
3298         if (unlikely(vmx->rmode.vm86_active))
3299                 return 0;
3300         else {
3301                 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3302                 return VMX_AR_DPL(ar);
3303         }
3304 }
3305
3306 static u32 vmx_segment_access_rights(struct kvm_segment *var)
3307 {
3308         u32 ar;
3309
3310         if (var->unusable || !var->present)
3311                 ar = 1 << 16;
3312         else {
3313                 ar = var->type & 15;
3314                 ar |= (var->s & 1) << 4;
3315                 ar |= (var->dpl & 3) << 5;
3316                 ar |= (var->present & 1) << 7;
3317                 ar |= (var->avl & 1) << 12;
3318                 ar |= (var->l & 1) << 13;
3319                 ar |= (var->db & 1) << 14;
3320                 ar |= (var->g & 1) << 15;
3321         }
3322
3323         return ar;
3324 }
3325
3326 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3327 {
3328         struct vcpu_vmx *vmx = to_vmx(vcpu);
3329         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3330
3331         vmx_segment_cache_clear(vmx);
3332
3333         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3334                 vmx->rmode.segs[seg] = *var;
3335                 if (seg == VCPU_SREG_TR)
3336                         vmcs_write16(sf->selector, var->selector);
3337                 else if (var->s)
3338                         fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3339                 return;
3340         }
3341
3342         vmcs_writel(sf->base, var->base);
3343         vmcs_write32(sf->limit, var->limit);
3344         vmcs_write16(sf->selector, var->selector);
3345
3346         /*
3347          *   Fix the "Accessed" bit in AR field of segment registers for older
3348          * qemu binaries.
3349          *   IA32 arch specifies that at the time of processor reset the
3350          * "Accessed" bit in the AR field of segment registers is 1. And qemu
3351          * is setting it to 0 in the userland code. This causes invalid guest
3352          * state vmexit when "unrestricted guest" mode is turned on.
3353          *    Fix for this setup issue in cpu_reset is being pushed in the qemu
3354          * tree. Newer qemu binaries with that qemu fix would not need this
3355          * kvm hack.
3356          */
3357         if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
3358                 var->type |= 0x1; /* Accessed */
3359
3360         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3361 }
3362
3363 static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3364 {
3365         __vmx_set_segment(vcpu, var, seg);
3366
3367         to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
3368 }
3369
3370 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3371 {
3372         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3373
3374         *db = (ar >> 14) & 1;
3375         *l = (ar >> 13) & 1;
3376 }
3377
3378 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3379 {
3380         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3381         dt->address = vmcs_readl(GUEST_IDTR_BASE);
3382 }
3383
3384 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3385 {
3386         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3387         vmcs_writel(GUEST_IDTR_BASE, dt->address);
3388 }
3389
3390 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3391 {
3392         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3393         dt->address = vmcs_readl(GUEST_GDTR_BASE);
3394 }
3395
3396 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3397 {
3398         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3399         vmcs_writel(GUEST_GDTR_BASE, dt->address);
3400 }
3401
3402 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3403 {
3404         struct kvm_segment var;
3405         u32 ar;
3406
3407         vmx_get_segment(vcpu, &var, seg);
3408         var.dpl = 0x3;
3409         if (seg == VCPU_SREG_CS)
3410                 var.type = 0x3;
3411         ar = vmx_segment_access_rights(&var);
3412
3413         if (var.base != (var.selector << 4))
3414                 return false;
3415         if (var.limit != 0xffff)
3416                 return false;
3417         if (ar != 0xf3)
3418                 return false;
3419
3420         return true;
3421 }
3422
3423 static bool code_segment_valid(struct kvm_vcpu *vcpu)
3424 {
3425         struct kvm_segment cs;
3426         unsigned int cs_rpl;
3427
3428         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3429         cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3430
3431         if (cs.unusable)
3432                 return false;
3433         if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3434                 return false;
3435         if (!cs.s)
3436                 return false;
3437         if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3438                 if (cs.dpl > cs_rpl)
3439                         return false;
3440         } else {
3441                 if (cs.dpl != cs_rpl)
3442                         return false;
3443         }
3444         if (!cs.present)
3445                 return false;
3446
3447         /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3448         return true;
3449 }
3450
3451 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3452 {
3453         struct kvm_segment ss;
3454         unsigned int ss_rpl;
3455
3456         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3457         ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3458
3459         if (ss.unusable)
3460                 return true;
3461         if (ss.type != 3 && ss.type != 7)
3462                 return false;
3463         if (!ss.s)
3464                 return false;
3465         if (ss.dpl != ss_rpl) /* DPL != RPL */
3466                 return false;
3467         if (!ss.present)
3468                 return false;
3469
3470         return true;
3471 }
3472
3473 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3474 {
3475         struct kvm_segment var;
3476         unsigned int rpl;
3477
3478         vmx_get_segment(vcpu, &var, seg);
3479         rpl = var.selector & SEGMENT_RPL_MASK;
3480
3481         if (var.unusable)
3482                 return true;
3483         if (!var.s)
3484                 return false;
3485         if (!var.present)
3486                 return false;
3487         if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3488                 if (var.dpl < rpl) /* DPL < RPL */
3489                         return false;
3490         }
3491
3492         /* TODO: Add other members to kvm_segment_field to allow checking for other access
3493          * rights flags
3494          */
3495         return true;
3496 }
3497
3498 static bool tr_valid(struct kvm_vcpu *vcpu)
3499 {
3500         struct kvm_segment tr;
3501
3502         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3503
3504         if (tr.unusable)
3505                 return false;
3506         if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
3507                 return false;
3508         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3509                 return false;
3510         if (!tr.present)
3511                 return false;
3512
3513         return true;
3514 }
3515
3516 static bool ldtr_valid(struct kvm_vcpu *vcpu)
3517 {
3518         struct kvm_segment ldtr;
3519
3520         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3521
3522         if (ldtr.unusable)
3523                 return true;
3524         if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
3525                 return false;
3526         if (ldtr.type != 2)
3527                 return false;
3528         if (!ldtr.present)
3529                 return false;
3530
3531         return true;
3532 }
3533
3534 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3535 {
3536         struct kvm_segment cs, ss;
3537
3538         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3539         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3540
3541         return ((cs.selector & SEGMENT_RPL_MASK) ==
3542                  (ss.selector & SEGMENT_RPL_MASK));
3543 }
3544
3545 /*
3546  * Check if guest state is valid. Returns true if valid, false if
3547  * not.
3548  * We assume that registers are always usable
3549  */
3550 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
3551 {
3552         /* real mode guest state checks */
3553         if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3554                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3555                         return false;
3556                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3557                         return false;
3558                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3559                         return false;
3560                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3561                         return false;
3562                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3563                         return false;
3564                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3565                         return false;
3566         } else {
3567         /* protected mode guest state checks */
3568                 if (!cs_ss_rpl_check(vcpu))
3569                         return false;
3570                 if (!code_segment_valid(vcpu))
3571                         return false;
3572                 if (!stack_segment_valid(vcpu))
3573                         return false;
3574                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3575                         return false;
3576                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3577                         return false;
3578                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3579                         return false;
3580                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3581                         return false;
3582                 if (!tr_valid(vcpu))
3583                         return false;
3584                 if (!ldtr_valid(vcpu))
3585                         return false;
3586         }
3587         /* TODO:
3588          * - Add checks on RIP
3589          * - Add checks on RFLAGS
3590          */
3591
3592         return true;
3593 }
3594
3595 static int init_rmode_tss(struct kvm *kvm, void __user *ua)
3596 {
3597         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3598         u16 data;
3599         int i;
3600
3601         for (i = 0; i < 3; i++) {
3602                 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
3603                         return -EFAULT;
3604         }
3605
3606         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3607         if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
3608                 return -EFAULT;
3609
3610         data = ~0;
3611         if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
3612                 return -EFAULT;
3613
3614         return 0;
3615 }
3616
3617 static int init_rmode_identity_map(struct kvm *kvm)
3618 {
3619         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
3620         int i, r = 0;
3621         void __user *uaddr;
3622         u32 tmp;
3623
3624         /* Protect kvm_vmx->ept_identity_pagetable_done. */
3625         mutex_lock(&kvm->slots_lock);
3626
3627         if (likely(kvm_vmx->ept_identity_pagetable_done))
3628                 goto out;
3629
3630         if (!kvm_vmx->ept_identity_map_addr)
3631                 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3632
3633         uaddr = __x86_set_memory_region(kvm,
3634                                         IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3635                                         kvm_vmx->ept_identity_map_addr,
3636                                         PAGE_SIZE);
3637         if (IS_ERR(uaddr)) {
3638                 r = PTR_ERR(uaddr);
3639                 goto out;
3640         }
3641
3642         /* Set up identity-mapping pagetable for EPT in real mode */
3643         for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
3644                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3645                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3646                 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
3647                         r = -EFAULT;
3648                         goto out;
3649                 }
3650         }
3651         kvm_vmx->ept_identity_pagetable_done = true;
3652
3653 out:
3654         mutex_unlock(&kvm->slots_lock);
3655         return r;
3656 }
3657
3658 static void seg_setup(int seg)
3659 {
3660         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3661         unsigned int ar;
3662
3663         vmcs_write16(sf->selector, 0);
3664         vmcs_writel(sf->base, 0);
3665         vmcs_write32(sf->limit, 0xffff);
3666         ar = 0x93;
3667         if (seg == VCPU_SREG_CS)
3668                 ar |= 0x08; /* code segment */
3669
3670         vmcs_write32(sf->ar_bytes, ar);
3671 }
3672
3673 static int alloc_apic_access_page(struct kvm *kvm)
3674 {
3675         struct page *page;
3676         void __user *hva;
3677         int ret = 0;
3678
3679         mutex_lock(&kvm->slots_lock);
3680         if (kvm->arch.apic_access_memslot_enabled)
3681                 goto out;
3682         hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
3683                                       APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
3684         if (IS_ERR(hva)) {
3685                 ret = PTR_ERR(hva);
3686                 goto out;
3687         }
3688
3689         page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
3690         if (is_error_page(page)) {
3691                 ret = -EFAULT;
3692                 goto out;
3693         }
3694
3695         /*
3696          * Do not pin the page in memory, so that memory hot-unplug
3697          * is able to migrate it.
3698          */
3699         put_page(page);
3700         kvm->arch.apic_access_memslot_enabled = true;
3701 out:
3702         mutex_unlock(&kvm->slots_lock);
3703         return ret;
3704 }
3705
3706 int allocate_vpid(void)
3707 {
3708         int vpid;
3709
3710         if (!enable_vpid)
3711                 return 0;
3712         spin_lock(&vmx_vpid_lock);
3713         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
3714         if (vpid < VMX_NR_VPIDS)
3715                 __set_bit(vpid, vmx_vpid_bitmap);
3716         else
3717                 vpid = 0;
3718         spin_unlock(&vmx_vpid_lock);
3719         return vpid;
3720 }
3721
3722 void free_vpid(int vpid)
3723 {
3724         if (!enable_vpid || vpid == 0)
3725                 return;
3726         spin_lock(&vmx_vpid_lock);
3727         __clear_bit(vpid, vmx_vpid_bitmap);
3728         spin_unlock(&vmx_vpid_lock);
3729 }
3730
3731 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
3732 {
3733         /*
3734          * When KVM is a nested hypervisor on top of Hyper-V and uses
3735          * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
3736          * bitmap has changed.
3737          */
3738         if (static_branch_unlikely(&enable_evmcs))
3739                 evmcs_touch_msr_bitmap();
3740
3741         vmx->nested.force_msr_bitmap_recalc = true;
3742 }
3743
3744 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
3745 {
3746         struct vcpu_vmx *vmx = to_vmx(vcpu);
3747         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3748
3749         if (!cpu_has_vmx_msr_bitmap())
3750                 return;
3751
3752         vmx_msr_bitmap_l01_changed(vmx);
3753
3754         /*
3755          * Mark the desired intercept state in shadow bitmap, this is needed
3756          * for resync when the MSR filters change.
3757         */
3758         if (is_valid_passthrough_msr(msr)) {
3759                 int idx = possible_passthrough_msr_slot(msr);
3760
3761                 if (idx != -ENOENT) {
3762                         if (type & MSR_TYPE_R)
3763                                 clear_bit(idx, vmx->shadow_msr_intercept.read);
3764                         if (type & MSR_TYPE_W)
3765                                 clear_bit(idx, vmx->shadow_msr_intercept.write);
3766                 }
3767         }
3768
3769         if ((type & MSR_TYPE_R) &&
3770             !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
3771                 vmx_set_msr_bitmap_read(msr_bitmap, msr);
3772                 type &= ~MSR_TYPE_R;
3773         }
3774
3775         if ((type & MSR_TYPE_W) &&
3776             !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
3777                 vmx_set_msr_bitmap_write(msr_bitmap, msr);
3778                 type &= ~MSR_TYPE_W;
3779         }
3780
3781         if (type & MSR_TYPE_R)
3782                 vmx_clear_msr_bitmap_read(msr_bitmap, msr);
3783
3784         if (type & MSR_TYPE_W)
3785                 vmx_clear_msr_bitmap_write(msr_bitmap, msr);
3786 }
3787
3788 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
3789 {
3790         struct vcpu_vmx *vmx = to_vmx(vcpu);
3791         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3792
3793         if (!cpu_has_vmx_msr_bitmap())
3794                 return;
3795
3796         vmx_msr_bitmap_l01_changed(vmx);
3797
3798         /*
3799          * Mark the desired intercept state in shadow bitmap, this is needed
3800          * for resync when the MSR filter changes.
3801         */
3802         if (is_valid_passthrough_msr(msr)) {
3803                 int idx = possible_passthrough_msr_slot(msr);
3804
3805                 if (idx != -ENOENT) {
3806                         if (type & MSR_TYPE_R)
3807                                 set_bit(idx, vmx->shadow_msr_intercept.read);
3808                         if (type & MSR_TYPE_W)
3809                                 set_bit(idx, vmx->shadow_msr_intercept.write);
3810                 }
3811         }
3812
3813         if (type & MSR_TYPE_R)
3814                 vmx_set_msr_bitmap_read(msr_bitmap, msr);
3815
3816         if (type & MSR_TYPE_W)
3817                 vmx_set_msr_bitmap_write(msr_bitmap, msr);
3818 }
3819
3820 static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
3821 {
3822         unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
3823         unsigned long read_intercept;
3824         int msr;
3825
3826         read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
3827
3828         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
3829                 unsigned int read_idx = msr / BITS_PER_LONG;
3830                 unsigned int write_idx = read_idx + (0x800 / sizeof(long));
3831
3832                 msr_bitmap[read_idx] = read_intercept;
3833                 msr_bitmap[write_idx] = ~0ul;
3834         }
3835 }
3836
3837 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
3838 {
3839         struct vcpu_vmx *vmx = to_vmx(vcpu);
3840         u8 mode;
3841
3842         if (!cpu_has_vmx_msr_bitmap())
3843                 return;
3844
3845         if (cpu_has_secondary_exec_ctrls() &&
3846             (secondary_exec_controls_get(vmx) &
3847              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
3848                 mode = MSR_BITMAP_MODE_X2APIC;
3849                 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
3850                         mode |= MSR_BITMAP_MODE_X2APIC_APICV;
3851         } else {
3852                 mode = 0;
3853         }
3854
3855         if (mode == vmx->x2apic_msr_bitmap_mode)
3856                 return;
3857
3858         vmx->x2apic_msr_bitmap_mode = mode;
3859
3860         vmx_reset_x2apic_msrs(vcpu, mode);
3861
3862         /*
3863          * TPR reads and writes can be virtualized even if virtual interrupt
3864          * delivery is not in use.
3865          */
3866         vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
3867                                   !(mode & MSR_BITMAP_MODE_X2APIC));
3868
3869         if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
3870                 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
3871                 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
3872                 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
3873         }
3874 }
3875
3876 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
3877 {
3878         struct vcpu_vmx *vmx = to_vmx(vcpu);
3879         bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
3880         u32 i;
3881
3882         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
3883         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
3884         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
3885         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
3886         for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
3887                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
3888                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
3889         }
3890 }
3891
3892 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
3893 {
3894         struct vcpu_vmx *vmx = to_vmx(vcpu);
3895         void *vapic_page;
3896         u32 vppr;
3897         int rvi;
3898
3899         if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
3900                 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
3901                 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
3902                 return false;
3903
3904         rvi = vmx_get_rvi();
3905
3906         vapic_page = vmx->nested.virtual_apic_map.hva;
3907         vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
3908
3909         return ((rvi & 0xf0) > (vppr & 0xf0));
3910 }
3911
3912 static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
3913 {
3914         struct vcpu_vmx *vmx = to_vmx(vcpu);
3915         u32 i;
3916
3917         /*
3918          * Set intercept permissions for all potentially passed through MSRs
3919          * again. They will automatically get filtered through the MSR filter,
3920          * so we are back in sync after this.
3921          */
3922         for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
3923                 u32 msr = vmx_possible_passthrough_msrs[i];
3924                 bool read = test_bit(i, vmx->shadow_msr_intercept.read);
3925                 bool write = test_bit(i, vmx->shadow_msr_intercept.write);
3926
3927                 vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);
3928                 vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);
3929         }
3930
3931         pt_update_intercept_for_msr(vcpu);
3932 }
3933
3934 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
3935                                                      bool nested)
3936 {
3937 #ifdef CONFIG_SMP
3938         int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
3939
3940         if (vcpu->mode == IN_GUEST_MODE) {
3941                 /*
3942                  * The vector of interrupt to be delivered to vcpu had
3943                  * been set in PIR before this function.
3944                  *
3945                  * Following cases will be reached in this block, and
3946                  * we always send a notification event in all cases as
3947                  * explained below.
3948                  *
3949                  * Case 1: vcpu keeps in non-root mode. Sending a
3950                  * notification event posts the interrupt to vcpu.
3951                  *
3952                  * Case 2: vcpu exits to root mode and is still
3953                  * runnable. PIR will be synced to vIRR before the
3954                  * next vcpu entry. Sending a notification event in
3955                  * this case has no effect, as vcpu is not in root
3956                  * mode.
3957                  *
3958                  * Case 3: vcpu exits to root mode and is blocked.
3959                  * vcpu_block() has already synced PIR to vIRR and
3960                  * never blocks vcpu if vIRR is not cleared. Therefore,
3961                  * a blocked vcpu here does not wait for any requested
3962                  * interrupts in PIR, and sending a notification event
3963                  * which has no effect is safe here.
3964                  */
3965
3966                 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
3967                 return true;
3968         }
3969 #endif
3970         return false;
3971 }
3972
3973 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
3974                                                 int vector)
3975 {
3976         struct vcpu_vmx *vmx = to_vmx(vcpu);
3977
3978         if (is_guest_mode(vcpu) &&
3979             vector == vmx->nested.posted_intr_nv) {
3980                 /*
3981                  * If a posted intr is not recognized by hardware,
3982                  * we will accomplish it in the next vmentry.
3983                  */
3984                 vmx->nested.pi_pending = true;
3985                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3986
3987                 /*
3988                  * This pairs with the smp_mb_*() after setting vcpu->mode in
3989                  * vcpu_enter_guest() to guarantee the vCPU sees the event
3990                  * request if triggering a posted interrupt "fails" because
3991                  * vcpu->mode != IN_GUEST_MODE.  The extra barrier is needed as
3992                  * the smb_wmb() in kvm_make_request() only ensures everything
3993                  * done before making the request is visible when the request
3994                  * is visible, it doesn't ensure ordering between the store to
3995                  * vcpu->requests and the load from vcpu->mode.
3996                  */
3997                 smp_mb__after_atomic();
3998
3999                 /* the PIR and ON have been set by L1. */
4000                 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
4001                         kvm_vcpu_kick(vcpu);
4002                 return 0;
4003         }
4004         return -1;
4005 }
4006 /*
4007  * Send interrupt to vcpu via posted interrupt way.
4008  * 1. If target vcpu is running(non-root mode), send posted interrupt
4009  * notification to vcpu and hardware will sync PIR to vIRR atomically.
4010  * 2. If target vcpu isn't running(root mode), kick it to pick up the
4011  * interrupt from PIR in next vmentry.
4012  */
4013 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4014 {
4015         struct vcpu_vmx *vmx = to_vmx(vcpu);
4016         int r;
4017
4018         r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4019         if (!r)
4020                 return 0;
4021
4022         if (!vcpu->arch.apicv_active)
4023                 return -1;
4024
4025         if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4026                 return 0;
4027
4028         /* If a previous notification has sent the IPI, nothing to do.  */
4029         if (pi_test_and_set_on(&vmx->pi_desc))
4030                 return 0;
4031
4032         /*
4033          * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
4034          * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
4035          * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
4036          * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
4037          */
4038         if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
4039                 kvm_vcpu_kick(vcpu);
4040
4041         return 0;
4042 }
4043
/*
 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
 * will not change in the lifetime of the guest.
 * Note that host-state that does change is set elsewhere. E.g., host-state
 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
 *
 * The CR3 and CR4 values written to the VMCS are also cached in
 * vmx->loaded_vmcs->host_state.
 */
void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
{
        u32 low32, high32;
        unsigned long tmpl;
        unsigned long cr0, cr3, cr4;

        /* Host CR0 is written as read; CR0.TS being set here is unexpected. */
        cr0 = read_cr0();
        WARN_ON(cr0 & X86_CR0_TS);
        vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */

        /*
         * Save the most likely value for this task's CR3 in the VMCS.
         * We can't use __get_current_cr3_fast() because we're not atomic.
         */
        cr3 = __read_cr3();
        vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
        vmx->loaded_vmcs->host_state.cr3 = cr3;

        /* Save the most likely value for this task's CR4 in the VMCS. */
        cr4 = cr4_read_shadow();
        vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
        vmx->loaded_vmcs->host_state.cr4 = cr4;

        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
#ifdef CONFIG_X86_64
        /*
         * Load null selectors, so we can avoid reloading them in
         * vmx_prepare_switch_to_host(), in case userspace uses
         * the null selectors too (the expected case).
         */
        vmcs_write16(HOST_DS_SELECTOR, 0);
        vmcs_write16(HOST_ES_SELECTOR, 0);
#else
        vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
#endif
        vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */

        vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */

        /* Host execution resumes at vmx_vmexit after a VM-exit. */
        vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */

        /* Only the low half of the MSR is written to the 32-bit field. */
        rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
        vmcs_write32(HOST_IA32_SYSENTER_CS, low32);

        /*
         * If 32-bit syscall is enabled, vmx_vcpu_load_vmcs() rewrites
         * HOST_IA32_SYSENTER_ESP.
         */
        vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
        rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
        vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */

        /* Host PAT is only programmed if the VM-exit control to load it is set. */
        if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
                rdmsr(MSR_IA32_CR_PAT, low32, high32);
                vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
        }

        if (cpu_has_load_ia32_efer())
                vmcs_write64(HOST_IA32_EFER, host_efer);
}
4112
4113 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4114 {
4115         struct kvm_vcpu *vcpu = &vmx->vcpu;
4116
4117         vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
4118                                           ~vcpu->arch.cr4_guest_rsvd_bits;
4119         if (!enable_ept) {
4120                 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
4121                 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
4122         }
4123         if (is_guest_mode(&vmx->vcpu))
4124                 vcpu->arch.cr4_guest_owned_bits &=
4125                         ~get_vmcs12(vcpu)->cr4_guest_host_mask;
4126         vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
4127 }
4128
4129 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4130 {
4131         u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4132
4133         if (!kvm_vcpu_apicv_active(&vmx->vcpu))
4134                 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4135
4136         if (!enable_vnmi)
4137                 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4138
4139         if (!enable_preemption_timer)
4140                 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4141
4142         return pin_based_exec_ctrl;
4143 }
4144
4145 static u32 vmx_vmentry_ctrl(void)
4146 {
4147         u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
4148
4149         if (vmx_pt_mode_is_system())
4150                 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
4151                                   VM_ENTRY_LOAD_IA32_RTIT_CTL);
4152         /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
4153         return vmentry_ctrl &
4154                 ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
4155 }
4156
4157 static u32 vmx_vmexit_ctrl(void)
4158 {
4159         u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
4160
4161         if (vmx_pt_mode_is_system())
4162                 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
4163                                  VM_EXIT_CLEAR_IA32_RTIT_CTL);
4164         /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
4165         return vmexit_ctrl &
4166                 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
4167 }
4168
4169 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4170 {
4171         struct vcpu_vmx *vmx = to_vmx(vcpu);
4172
4173         pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4174         if (cpu_has_secondary_exec_ctrls()) {
4175                 if (kvm_vcpu_apicv_active(vcpu))
4176                         secondary_exec_controls_setbit(vmx,
4177                                       SECONDARY_EXEC_APIC_REGISTER_VIRT |
4178                                       SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4179                 else
4180                         secondary_exec_controls_clearbit(vmx,
4181                                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
4182                                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4183         }
4184
4185         vmx_update_msr_bitmap_x2apic(vcpu);
4186 }
4187
4188 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4189 {
4190         u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4191
4192         if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4193                 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4194
4195         if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
4196                 exec_control &= ~CPU_BASED_TPR_SHADOW;
4197 #ifdef CONFIG_X86_64
4198                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
4199                                 CPU_BASED_CR8_LOAD_EXITING;
4200 #endif
4201         }
4202         if (!enable_ept)
4203                 exec_control |= CPU_BASED_CR3_STORE_EXITING |
4204                                 CPU_BASED_CR3_LOAD_EXITING  |
4205                                 CPU_BASED_INVLPG_EXITING;
4206         if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4207                 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4208                                 CPU_BASED_MONITOR_EXITING);
4209         if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4210                 exec_control &= ~CPU_BASED_HLT_EXITING;
4211         return exec_control;
4212 }
4213
4214 /*
4215  * Adjust a single secondary execution control bit to intercept/allow an
4216  * instruction in the guest.  This is usually done based on whether or not a
4217  * feature has been exposed to the guest in order to correctly emulate faults.
4218  */
4219 static inline void
4220 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
4221                                   u32 control, bool enabled, bool exiting)
4222 {
4223         /*
4224          * If the control is for an opt-in feature, clear the control if the
4225          * feature is not exposed to the guest, i.e. not enabled.  If the
4226          * control is opt-out, i.e. an exiting control, clear the control if
4227          * the feature _is_ exposed to the guest, i.e. exiting/interception is
4228          * disabled for the associated instruction.  Note, the caller is
4229          * responsible presetting exec_control to set all supported bits.
4230          */
4231         if (enabled == exiting)
4232                 *exec_control &= ~control;
4233
4234         /*
4235          * Update the nested MSR settings so that a nested VMM can/can't set
4236          * controls for features that are/aren't exposed to the guest.
4237          */
4238         if (nested) {
4239                 if (enabled)
4240                         vmx->nested.msrs.secondary_ctls_high |= control;
4241                 else
4242                         vmx->nested.msrs.secondary_ctls_high &= ~control;
4243         }
4244 }
4245
/*
 * Wrapper macro for the common case of adjusting a secondary execution control
 * based on a single guest CPUID bit, with a dedicated feature bit.  This also
 * verifies that the control is actually supported by KVM and hardware.
 *
 * @vmx:          vCPU whose guest CPUID is consulted.
 * @exec_control: pointer to the control word, passed through unchanged to
 *                vmx_adjust_secondary_exec_control().
 * @name:         suffix of the cpu_has_vmx_<name>() check; nothing is adjusted
 *                when that check is false.
 * @feat_name:    suffix of the X86_FEATURE_<feat_name> bit queried with
 *                guest_cpuid_has().
 * @ctrl_name:    suffix of the SECONDARY_EXEC_<ctrl_name> control.
 * @exiting:      true for an opt-out (_EXITING) control, false for opt-in.
 */
#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
({                                                                       \
        bool __enabled;                                                  \
                                                                         \
        if (cpu_has_vmx_##name()) {                                      \
                __enabled = guest_cpuid_has(&(vmx)->vcpu,                \
                                            X86_FEATURE_##feat_name);    \
                vmx_adjust_secondary_exec_control(vmx, exec_control,     \
                        SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
        }                                                                \
})
4262
/*
 * More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls.
 *
 * vmx_adjust_sec_exec_feature() handles SECONDARY_EXEC_ENABLE_<uname>
 * (exiting == false); vmx_adjust_sec_exec_exiting() handles
 * SECONDARY_EXEC_<uname>_EXITING (exiting == true).  In both, @lname is the
 * cpu_has_vmx_<lname>() suffix and @uname the X86_FEATURE_<uname> suffix.
 */
#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
        vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)

#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
        vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
4269
/*
 * Compute the secondary processor-based execution controls for @vmx.
 *
 * Starts from vmcs_config.cpu_based_2nd_exec_ctrl and clears every control
 * that does not apply to this vCPU/VM.  Besides returning the control word,
 * this has side effects: it records vcpu->arch.xsaves_enabled, may clear
 * enable_unrestricted_guest, and (through
 * vmx_adjust_secondary_exec_control()) updates
 * vmx->nested.msrs.secondary_ctls_high when nested is enabled.
 */
static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
        struct kvm_vcpu *vcpu = &vmx->vcpu;

        u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;

        if (vmx_pt_mode_is_system())
                exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
        if (!cpu_need_virtualize_apic_accesses(vcpu))
                exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
        if (vmx->vpid == 0)
                exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
        if (!enable_ept) {
                exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
                /*
                 * NOTE(review): enable_unrestricted_guest is not a local, so
                 * this write outlives the call - presumably it is the
                 * module-wide setting; confirm.
                 */
                enable_unrestricted_guest = 0;
        }
        if (!enable_unrestricted_guest)
                exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
        if (kvm_pause_in_guest(vmx->vcpu.kvm))
                exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
        if (!kvm_vcpu_apicv_active(vcpu))
                exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
        /* x2APIC mode virtualization is never enabled by this function. */
        exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;

        /*
         * SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
         * in vmx_set_cr4.
         */
        exec_control &= ~SECONDARY_EXEC_DESC;

        /*
         * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
         * (handle_vmptrld).  We can NOT enable shadow_vmcs here because we
         * don't yet have a current VMCS12.
         */
        exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

        /*
         * PML is enabled/disabled when dirty logging of memslots changes, but
         * it needs to be set here when dirty logging is already active, e.g.
         * if this vCPU was created after dirty logging was enabled.
         */
        if (!vcpu->kvm->arch.cpu_dirty_logging_count)
                exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

        if (cpu_has_vmx_xsaves()) {
                /* Exposing XSAVES only when XSAVE is exposed */
                bool xsaves_enabled =
                        boot_cpu_has(X86_FEATURE_XSAVE) &&
                        guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
                        guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);

                vcpu->arch.xsaves_enabled = xsaves_enabled;

                vmx_adjust_secondary_exec_control(vmx, &exec_control,
                                                  SECONDARY_EXEC_XSAVES,
                                                  xsaves_enabled, false);
        }

        /*
         * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
         * feature is exposed to the guest.  This creates a virtualization hole
         * if both are supported in hardware but only one is exposed to the
         * guest, but letting the guest execute RDTSCP or RDPID when either one
         * is advertised is preferable to emulating the advertised instruction
         * in KVM on #UD, and obviously better than incorrectly injecting #UD.
         */
        if (cpu_has_vmx_rdtscp()) {
                bool rdpid_or_rdtscp_enabled =
                        guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
                        guest_cpuid_has(vcpu, X86_FEATURE_RDPID);

                vmx_adjust_secondary_exec_control(vmx, &exec_control,
                                                  SECONDARY_EXEC_ENABLE_RDTSCP,
                                                  rdpid_or_rdtscp_enabled, false);
        }
        /* Opt-in control: kept only if the guest has INVPCID. */
        vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);

        /* Opt-out controls: exiting is dropped if the guest has the feature. */
        vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
        vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);

        vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
                                    ENABLE_USR_WAIT_PAUSE, false);

        if (!vcpu->kvm->arch.bus_lock_detection_enabled)
                exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;

        return exec_control;
}
4358
/* Value written to the XSS_EXIT_BITMAP VMCS field by init_vmcs(). */
#define VMX_XSS_EXIT_BITMAP 0
4360
/*
 * Write the initial values of the VMCS fields for @vmx: the pin-based,
 * CPU-based, secondary, VM-exit and VM-entry controls, the constant host
 * state, the MSR autoload areas, the CR0/CR4 guest/host masks, and a handful
 * of guest-state fields.  Called from __vmx_vcpu_reset().
 */
static void init_vmcs(struct vcpu_vmx *vmx)
{
        if (nested)
                nested_vmx_set_vmcs_shadowing_bitmap();

        if (cpu_has_vmx_msr_bitmap())
                vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));

        vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */

        /* Control */
        pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));

        exec_controls_set(vmx, vmx_exec_control(vmx));

        if (cpu_has_secondary_exec_ctrls())
                secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));

        /* APICv state is only programmed when APICv is active for this vCPU. */
        if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
                vmcs_write64(EOI_EXIT_BITMAP0, 0);
                vmcs_write64(EOI_EXIT_BITMAP1, 0);
                vmcs_write64(EOI_EXIT_BITMAP2, 0);
                vmcs_write64(EOI_EXIT_BITMAP3, 0);

                vmcs_write16(GUEST_INTR_STATUS, 0);

                vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
                vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
        }

        /*
         * Pause-loop exiting: seed the gap and window from ple_gap and
         * ple_window.  The window is only cached and marked dirty here, not
         * written to the VMCS.
         */
        if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
                vmcs_write32(PLE_GAP, ple_gap);
                vmx->ple_window = ple_window;
                vmx->ple_window_dirty = true;
        }

        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

        /* Host FS/GS selectors and bases start out as zero. */
        vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
        vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
        vmx_set_constant_host_state(vmx);
        vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
        vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */

        if (cpu_has_vmx_vmfunc())
                vmcs_write64(VM_FUNCTION_CONTROL, 0);

        /*
         * The MSR store/load lists start out empty (count 0); the load
         * addresses point at the msr_autoload host/guest arrays.
         */
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

        if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);

        vm_exit_controls_set(vmx, vmx_vmexit_ctrl());

        /* 22.2.1, 20.8.1 */
        vm_entry_controls_set(vmx, vmx_vmentry_ctrl());

        /* The guest/host mask is the complement of the guest-owned bits. */
        vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
        vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);

        set_cr4_guest_host_mask(vmx);

        if (vmx->vpid != 0)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);

        if (cpu_has_vmx_xsaves())
                vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);

        if (enable_pml) {
                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }

        vmx_write_encls_bitmap(&vmx->vcpu, NULL);

        if (vmx_pt_mode_is_host_guest()) {
                memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
                /* Bit[6~0] are forced to 1, writes are ignored. */
                vmx->pt_desc.guest.output_mask = 0x7F;
                vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
        }

        vmcs_write32(GUEST_SYSENTER_CS, 0);
        vmcs_writel(GUEST_SYSENTER_ESP, 0);
        vmcs_writel(GUEST_SYSENTER_EIP, 0);
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

        /*
         * The virtual-APIC page address is zeroed first and then pointed at
         * the local APIC register page only if this vCPU needs a TPR shadow.
         */
        if (cpu_has_vmx_tpr_shadow()) {
                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
                if (cpu_need_tpr_shadow(&vmx->vcpu))
                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
                                     __pa(vmx->vcpu.arch.apic->regs));
                vmcs_write32(TPR_THRESHOLD, 0);
        }

        vmx_setup_uret_msrs(vmx);
}
4464
/*
 * Reset work that vmx_vcpu_reset() performs only when it is not handling an
 * INIT event: initialize the VMCS and reset the nested-VMX, SGX, microcode
 * version, feature-control and posted-interrupt descriptor state of @vcpu.
 */
static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        init_vmcs(vmx);

        /*
         * NOTE(review): init_vmcs() can adjust
         * vmx->nested.msrs.secondary_ctls_high (through
         * vmx_secondary_exec_control()); the copy below overwrites those
         * adjustments with the vmcs_config.nested defaults.  Presumably they
         * are recomputed later - confirm before reordering.
         */
        if (nested)
                memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));

        vcpu_setup_sgx_lepubkeyhash(vcpu);

        /* Nested state starts out with no notification vector and no pointers. */
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.vmxon_ptr = INVALID_GPA;
        vmx->nested.current_vmptr = INVALID_GPA;
        vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;

        vcpu->arch.microcode_version = 0x100000000ULL;
        vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;

        /*
         * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
         * or POSTED_INTR_WAKEUP_VECTOR.
         */
        vmx->pi_desc.nv = POSTED_INTR_VECTOR;
        vmx->pi_desc.sn = 1;
}
4491
/*
 * Reset the VMX-specific state of @vcpu.
 *
 * @init_event: when false, __vmx_vcpu_reset() is run first; when true that
 *              step is skipped and only the state below is reset.
 */
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!init_event)
                __vmx_vcpu_reset(vcpu);

        vmx->rmode.vm86_active = 0;
        vmx->spec_ctrl = 0;

        vmx->msr_ia32_umwait_control = 0;

        vmx->hv_deadline_tsc = -1;
        kvm_set_cr8(vcpu, 0);

        /* Drop cached segment state before the segment fields are rewritten. */
        vmx_segment_cache_clear(vmx);
        kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);

        /* CS: selector 0xf000 with base 0xffff0000. */
        seg_setup(VCPU_SREG_CS);
        vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
        vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);

        seg_setup(VCPU_SREG_DS);
        seg_setup(VCPU_SREG_ES);
        seg_setup(VCPU_SREG_FS);
        seg_setup(VCPU_SREG_GS);
        seg_setup(VCPU_SREG_SS);

        /* TR and LDTR: null selector, base 0, limit 0xffff. */
        vmcs_write16(GUEST_TR_SELECTOR, 0);
        vmcs_writel(GUEST_TR_BASE, 0);
        vmcs_write32(GUEST_TR_LIMIT, 0xffff);
        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

        vmcs_write16(GUEST_LDTR_SELECTOR, 0);
        vmcs_writel(GUEST_LDTR_BASE, 0);
        vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
        vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

        /* GDTR and IDTR: base 0, limit 0xffff. */
        vmcs_writel(GUEST_GDTR_BASE, 0);
        vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

        vmcs_writel(GUEST_IDTR_BASE, 0);
        vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

        /* Active, with no interrupt blocking and no pending debug exceptions. */
        vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
        if (kvm_mpx_supported())
                vmcs_write64(GUEST_BNDCFGS, 0);

        /* No event is queued for injection on the next VM-entry. */
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */

        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

        vpid_sync_context(vmx->vpid);
}
4548
4549 static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
4550 {
4551         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
4552 }
4553
4554 static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
4555 {
4556         if (!enable_vnmi ||
4557             vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4558                 vmx_enable_irq_window(vcpu);
4559                 return;
4560         }
4561
4562         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
4563 }
4564
4565 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
4566 {
4567         struct vcpu_vmx *vmx = to_vmx(vcpu);
4568         uint32_t intr;
4569         int irq = vcpu->arch.interrupt.nr;
4570
4571         trace_kvm_inj_virq(irq);
4572
4573         ++vcpu->stat.irq_injections;
4574         if (vmx->rmode.vm86_active) {
4575                 int inc_eip = 0;
4576                 if (vcpu->arch.interrupt.soft)
4577                         inc_eip = vcpu->arch.event_exit_inst_len;
4578                 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
4579                 return;
4580         }
4581         intr = irq | INTR_INFO_VALID_MASK;
4582         if (vcpu->arch.interrupt.soft) {
4583                 intr |= INTR_TYPE_SOFT_INTR;
4584                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4585                              vmx->vcpu.arch.event_exit_inst_len);
4586         } else
4587                 intr |= INTR_TYPE_EXT_INTR;
4588         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4589
4590         vmx_clear_hlt(vcpu);
4591 }
4592
4593 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4594 {
4595         struct vcpu_vmx *vmx = to_vmx(vcpu);
4596
4597         if (!enable_vnmi) {
4598                 /*
4599                  * Tracking the NMI-blocked state in software is built upon
4600                  * finding the next open IRQ window. This, in turn, depends on
4601                  * well-behaving guests: They have to keep IRQs disabled at
4602                  * least as long as the NMI handler runs. Otherwise we may
4603                  * cause NMI nesting, maybe breaking the guest. But as this is
4604                  * highly unlikely, we can live with the residual risk.
4605                  */
4606                 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
4607                 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4608         }
4609
4610         ++vcpu->stat.nmi_injections;
4611         vmx->loaded_vmcs->nmi_known_unmasked = false;
4612
4613         if (vmx->rmode.vm86_active) {
4614                 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
4615                 return;
4616         }
4617
4618         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4619                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4620
4621         vmx_clear_hlt(vcpu);
4622 }
4623
4624 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4625 {
4626         struct vcpu_vmx *vmx = to_vmx(vcpu);
4627         bool masked;
4628
4629         if (!enable_vnmi)
4630                 return vmx->loaded_vmcs->soft_vnmi_blocked;
4631         if (vmx->loaded_vmcs->nmi_known_unmasked)
4632                 return false;
4633         masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4634         vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4635         return masked;
4636 }
4637
4638 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4639 {
4640         struct vcpu_vmx *vmx = to_vmx(vcpu);
4641
4642         if (!enable_vnmi) {
4643                 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
4644                         vmx->loaded_vmcs->soft_vnmi_blocked = masked;
4645                         vmx->loaded_vmcs->vnmi_blocked_time = 0;
4646                 }
4647         } else {
4648                 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4649                 if (masked)
4650                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
4651                                       GUEST_INTR_STATE_NMI);
4652                 else
4653                         vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
4654                                         GUEST_INTR_STATE_NMI);
4655         }
4656 }
4657
4658 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
4659 {
4660         if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
4661                 return false;
4662
4663         if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
4664                 return true;
4665
4666         return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4667                 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
4668                  GUEST_INTR_STATE_NMI));
4669 }
4670
4671 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4672 {
4673         if (to_vmx(vcpu)->nested.nested_run_pending)
4674                 return -EBUSY;
4675
4676         /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
4677         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
4678                 return -EBUSY;
4679
4680         return !vmx_nmi_blocked(vcpu);
4681 }
4682
4683 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
4684 {
4685         if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
4686                 return false;
4687
4688         return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
4689                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4690                 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
4691 }
4692
4693 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4694 {
4695         if (to_vmx(vcpu)->nested.nested_run_pending)
4696                 return -EBUSY;
4697
4698        /*
4699         * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
4700         * e.g. if the IRQ arrived asynchronously after checking nested events.
4701         */
4702         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
4703                 return -EBUSY;
4704
4705         return !vmx_interrupt_blocked(vcpu);
4706 }
4707
4708 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4709 {
4710         void __user *ret;
4711
4712         if (enable_unrestricted_guest)
4713                 return 0;
4714
4715         mutex_lock(&kvm->slots_lock);
4716         ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
4717                                       PAGE_SIZE * 3);
4718         mutex_unlock(&kvm->slots_lock);
4719
4720         if (IS_ERR(ret))
4721                 return PTR_ERR(ret);
4722
4723         to_kvm_vmx(kvm)->tss_addr = addr;
4724
4725         return init_rmode_tss(kvm, ret);
4726 }
4727
4728 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
4729 {
4730         to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
4731         return 0;
4732 }
4733
4734 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
4735 {
4736         switch (vec) {
4737         case BP_VECTOR:
4738                 /*
4739                  * Update instruction length as we may reinject the exception
4740                  * from user space while in guest debugging mode.
4741                  */
4742                 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
4743                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4744                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
4745                         return false;
4746                 fallthrough;
4747         case DB_VECTOR:
4748                 return !(vcpu->guest_debug &
4749                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
4750         case DE_VECTOR:
4751         case OF_VECTOR:
4752         case BR_VECTOR:
4753         case UD_VECTOR:
4754         case DF_VECTOR:
4755         case SS_VECTOR:
4756         case GP_VECTOR:
4757         case MF_VECTOR:
4758                 return true;
4759         }
4760         return false;
4761 }
4762
4763 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4764                                   int vec, u32 err_code)
4765 {
4766         /*
4767          * Instruction with address size override prefix opcode 0x67
4768          * Cause the #SS fault with 0 error code in VM86 mode.
4769          */
4770         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
4771                 if (kvm_emulate_instruction(vcpu, 0)) {
4772                         if (vcpu->arch.halt_request) {
4773                                 vcpu->arch.halt_request = 0;
4774                                 return kvm_emulate_halt_noskip(vcpu);
4775                         }
4776                         return 1;
4777                 }
4778                 return 0;
4779         }
4780
4781         /*
4782          * Forward all other exceptions that are valid in real mode.
4783          * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4784          *        the required debugging infrastructure rework.
4785          */
4786         kvm_queue_exception(vcpu, vec);
4787         return 1;
4788 }
4789
/*
 * Exit handler for machine checks.  Nothing is done here because the event
 * is handled by vmx_vcpu_run(); @vcpu is not touched.  Always returns 1.
 */
static int handle_machine_check(struct kvm_vcpu *vcpu)
{
	const int handled = 1;

	return handled;
}
4795
4796 /*
4797  * If the host has split lock detection disabled, then #AC is
4798  * unconditionally injected into the guest, which is the pre split lock
4799  * detection behaviour.
4800  *
4801  * If the host has split lock detection enabled then #AC is
4802  * only injected into the guest when:
4803  *  - Guest CPL == 3 (user mode)
4804  *  - Guest has #AC detection enabled in CR0
4805  *  - Guest EFLAGS has AC bit set
4806  */
4807 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
4808 {
4809         if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
4810                 return true;
4811
4812         return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
4813                (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
4814 }
4815
4816 static int handle_exception_nmi(struct kvm_vcpu *vcpu)
4817 {
4818         struct vcpu_vmx *vmx = to_vmx(vcpu);
4819         struct kvm_run *kvm_run = vcpu->run;
4820         u32 intr_info, ex_no, error_code;
4821         unsigned long cr2, dr6;
4822         u32 vect_info;
4823
4824         vect_info = vmx->idt_vectoring_info;
4825         intr_info = vmx_get_intr_info(vcpu);
4826
4827         if (is_machine_check(intr_info) || is_nmi(intr_info))
4828                 return 1; /* handled by handle_exception_nmi_irqoff() */
4829
4830         /*
4831          * Queue the exception here instead of in handle_nm_fault_irqoff().
4832          * This ensures the nested_vmx check is not skipped so vmexit can
4833          * be reflected to L1 (when it intercepts #NM) before reaching this
4834          * point.
4835          */
4836         if (is_nm_fault(intr_info)) {
4837                 kvm_queue_exception(vcpu, NM_VECTOR);
4838                 return 1;
4839         }
4840
4841         if (is_invalid_opcode(intr_info))
4842                 return handle_ud(vcpu);
4843
4844         error_code = 0;
4845         if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
4846                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
4847
4848         if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
4849                 WARN_ON_ONCE(!enable_vmware_backdoor);
4850
4851                 /*
4852                  * VMware backdoor emulation on #GP interception only handles
4853                  * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
4854                  * error code on #GP.
4855                  */
4856                 if (error_code) {
4857                         kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
4858                         return 1;
4859                 }
4860                 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
4861         }
4862
4863         /*
4864          * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
4865          * MMIO, it is better to report an internal error.
4866          * See the comments in vmx_handle_exit.
4867          */
4868         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4869             !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
4870                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4871                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4872                 vcpu->run->internal.ndata = 4;
4873                 vcpu->run->internal.data[0] = vect_info;
4874                 vcpu->run->internal.data[1] = intr_info;
4875                 vcpu->run->internal.data[2] = error_code;
4876                 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
4877                 return 0;
4878         }
4879
4880         if (is_page_fault(intr_info)) {
4881                 cr2 = vmx_get_exit_qual(vcpu);
4882                 if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
4883                         /*
4884                          * EPT will cause page fault only if we need to
4885                          * detect illegal GPAs.
4886                          */
4887                         WARN_ON_ONCE(!allow_smaller_maxphyaddr);
4888                         kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
4889                         return 1;
4890                 } else
4891                         return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
4892         }
4893
4894         ex_no = intr_info & INTR_INFO_VECTOR_MASK;
4895
4896         if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
4897                 return handle_rmode_exception(vcpu, ex_no, error_code);
4898
4899         switch (ex_no) {
4900         case DB_VECTOR:
4901                 dr6 = vmx_get_exit_qual(vcpu);
4902                 if (!(vcpu->guest_debug &
4903                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
4904                         if (is_icebp(intr_info))
4905                                 WARN_ON(!skip_emulated_instruction(vcpu));
4906
4907                         kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
4908                         return 1;
4909                 }
4910                 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
4911                 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
4912                 fallthrough;
4913         case BP_VECTOR:
4914                 /*
4915                  * Update instruction length as we may reinject #BP from
4916                  * user space while in guest debugging mode. Reading it for
4917                  * #DB as well causes no harm, it is not used in that case.
4918                  */
4919                 vmx->vcpu.arch.event_exit_inst_len =
4920                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4921                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
4922                 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
4923                 kvm_run->debug.arch.exception = ex_no;
4924                 break;
4925         case AC_VECTOR:
4926                 if (vmx_guest_inject_ac(vcpu)) {
4927                         kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
4928                         return 1;
4929                 }
4930
4931                 /*
4932                  * Handle split lock. Depending on detection mode this will
4933                  * either warn and disable split lock detection for this
4934                  * task or force SIGBUS on it.
4935                  */
4936                 if (handle_guest_split_lock(kvm_rip_read(vcpu)))
4937                         return 1;
4938                 fallthrough;
4939         default:
4940                 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
4941                 kvm_run->ex.exception = ex_no;
4942                 kvm_run->ex.error_code = error_code;
4943                 break;
4944         }
4945         return 0;
4946 }
4947
4948 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
4949 {
4950         ++vcpu->stat.irq_exits;
4951         return 1;
4952 }
4953
4954 static int handle_triple_fault(struct kvm_vcpu *vcpu)
4955 {
4956         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
4957         vcpu->mmio_needed = 0;
4958         return 0;
4959 }
4960
4961 static int handle_io(struct kvm_vcpu *vcpu)
4962 {
4963         unsigned long exit_qualification;
4964         int size, in, string;
4965         unsigned port;
4966
4967         exit_qualification = vmx_get_exit_qual(vcpu);
4968         string = (exit_qualification & 16) != 0;
4969
4970         ++vcpu->stat.io_exits;
4971
4972         if (string)
4973                 return kvm_emulate_instruction(vcpu, 0);
4974
4975         port = exit_qualification >> 16;
4976         size = (exit_qualification & 7) + 1;
4977         in = (exit_qualification & 8) != 0;
4978
4979         return kvm_fast_pio(vcpu, size, port, in);
4980 }
4981
/*
 * Write the three-byte VMCALL instruction (0f 01 c1) to @hypercall.
 * @hypercall must have room for three bytes; @vcpu is not used.
 */
static void
vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	static const unsigned char vmcall_insn[] = { 0x0f, 0x01, 0xc1 };
	unsigned int i;

	/* Patch in the VMCALL instruction, byte by byte. */
	for (i = 0; i < sizeof(vmcall_insn); i++)
		hypercall[i] = vmcall_insn[i];
}
4992
4993 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4994 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4995 {
4996         if (is_guest_mode(vcpu)) {
4997                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4998                 unsigned long orig_val = val;
4999
5000                 /*
5001                  * We get here when L2 changed cr0 in a way that did not change
5002                  * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5003                  * but did change L0 shadowed bits. So we first calculate the
5004                  * effective cr0 value that L1 would like to write into the
5005                  * hardware. It consists of the L2-owned bits from the new
5006                  * value combined with the L1-owned bits from L1's guest_cr0.
5007                  */
5008                 val = (val & ~vmcs12->cr0_guest_host_mask) |
5009                         (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5010
5011                 if (!nested_guest_cr0_valid(vcpu, val))
5012                         return 1;
5013
5014                 if (kvm_set_cr0(vcpu, val))
5015                         return 1;
5016                 vmcs_writel(CR0_READ_SHADOW, orig_val);
5017                 return 0;
5018         } else {
5019                 if (to_vmx(vcpu)->nested.vmxon &&
5020                     !nested_host_cr0_valid(vcpu, val))
5021                         return 1;
5022
5023                 return kvm_set_cr0(vcpu, val);
5024         }
5025 }
5026
5027 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5028 {
5029         if (is_guest_mode(vcpu)) {
5030                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5031                 unsigned long orig_val = val;
5032
5033                 /* analogously to handle_set_cr0 */
5034                 val = (val & ~vmcs12->cr4_guest_host_mask) |
5035                         (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5036                 if (kvm_set_cr4(vcpu, val))
5037                         return 1;
5038                 vmcs_writel(CR4_READ_SHADOW, orig_val);
5039                 return 0;
5040         } else
5041                 return kvm_set_cr4(vcpu, val);
5042 }
5043
5044 static int handle_desc(struct kvm_vcpu *vcpu)
5045 {
5046         WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
5047         return kvm_emulate_instruction(vcpu, 0);
5048 }
5049
5050 static int handle_cr(struct kvm_vcpu *vcpu)
5051 {
5052         unsigned long exit_qualification, val;
5053         int cr;
5054         int reg;
5055         int err;
5056         int ret;
5057
5058         exit_qualification = vmx_get_exit_qual(vcpu);
5059         cr = exit_qualification & 15;
5060         reg = (exit_qualification >> 8) & 15;
5061         switch ((exit_qualification >> 4) & 3) {
5062         case 0: /* mov to cr */
5063                 val = kvm_register_read(vcpu, reg);
5064                 trace_kvm_cr_write(cr, val);
5065                 switch (cr) {
5066                 case 0:
5067                         err = handle_set_cr0(vcpu, val);
5068                         return kvm_complete_insn_gp(vcpu, err);
5069                 case 3:
5070                         WARN_ON_ONCE(enable_unrestricted_guest);
5071
5072                         err = kvm_set_cr3(vcpu, val);
5073                         return kvm_complete_insn_gp(vcpu, err);
5074                 case 4:
5075                         err = handle_set_cr4(vcpu, val);
5076                         return kvm_complete_insn_gp(vcpu, err);
5077                 case 8: {
5078                                 u8 cr8_prev = kvm_get_cr8(vcpu);
5079                                 u8 cr8 = (u8)val;
5080                                 err = kvm_set_cr8(vcpu, cr8);
5081                                 ret = kvm_complete_insn_gp(vcpu, err);
5082                                 if (lapic_in_kernel(vcpu))
5083                                         return ret;
5084                                 if (cr8_prev <= cr8)
5085                                         return ret;
5086                                 /*
5087                                  * TODO: we might be squashing a
5088                                  * KVM_GUESTDBG_SINGLESTEP-triggered
5089                                  * KVM_EXIT_DEBUG here.
5090                                  */
5091                                 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5092                                 return 0;
5093                         }
5094                 }
5095                 break;
5096         case 2: /* clts */
5097                 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
5098                 return -EIO;
5099         case 1: /*mov from cr*/
5100                 switch (cr) {
5101                 case 3:
5102                         WARN_ON_ONCE(enable_unrestricted_guest);
5103
5104                         val = kvm_read_cr3(vcpu);
5105                         kvm_register_write(vcpu, reg, val);
5106                         trace_kvm_cr_read(cr, val);
5107                         return kvm_skip_emulated_instruction(vcpu);
5108                 case 8:
5109                         val = kvm_get_cr8(vcpu);
5110                         kvm_register_write(vcpu, reg, val);
5111                         trace_kvm_cr_read(cr, val);
5112                         return kvm_skip_emulated_instruction(vcpu);
5113                 }
5114                 break;
5115         case 3: /* lmsw */
5116                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5117                 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
5118                 kvm_lmsw(vcpu, val);
5119
5120                 return kvm_skip_emulated_instruction(vcpu);
5121         default:
5122                 break;
5123         }
5124         vcpu->run->exit_reason = 0;
5125         vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5126                (int)(exit_qualification >> 4) & 3, cr);
5127         return 0;
5128 }
5129
5130 static int handle_dr(struct kvm_vcpu *vcpu)
5131 {
5132         unsigned long exit_qualification;
5133         int dr, dr7, reg;
5134         int err = 1;
5135
5136         exit_qualification = vmx_get_exit_qual(vcpu);
5137         dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5138
5139         /* First, if DR does not exist, trigger UD */
5140         if (!kvm_require_dr(vcpu, dr))
5141                 return 1;
5142
5143         if (kvm_x86_ops.get_cpl(vcpu) > 0)
5144                 goto out;
5145
5146         dr7 = vmcs_readl(GUEST_DR7);
5147         if (dr7 & DR7_GD) {
5148                 /*
5149                  * As the vm-exit takes precedence over the debug trap, we
5150                  * need to emulate the latter, either for the host or the
5151                  * guest debugging itself.
5152                  */
5153                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5154                         vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
5155                         vcpu->run->debug.arch.dr7 = dr7;
5156                         vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5157                         vcpu->run->debug.arch.exception = DB_VECTOR;
5158                         vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5159                         return 0;
5160                 } else {
5161                         kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
5162                         return 1;
5163                 }
5164         }
5165
5166         if (vcpu->guest_debug == 0) {
5167                 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5168
5169                 /*
5170                  * No more DR vmexits; force a reload of the debug registers
5171                  * and reenter on this instruction.  The next vmexit will
5172                  * retrieve the full state of the debug registers.
5173                  */
5174                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5175                 return 1;
5176         }
5177
5178         reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5179         if (exit_qualification & TYPE_MOV_FROM_DR) {
5180                 unsigned long val;
5181
5182                 kvm_get_dr(vcpu, dr, &val);
5183                 kvm_register_write(vcpu, reg, val);
5184                 err = 0;
5185         } else {
5186                 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
5187         }
5188
5189 out:
5190         return kvm_complete_insn_gp(vcpu, err);
5191 }
5192
5193 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5194 {
5195         get_debugreg(vcpu->arch.db[0], 0);
5196         get_debugreg(vcpu->arch.db[1], 1);
5197         get_debugreg(vcpu->arch.db[2], 2);
5198         get_debugreg(vcpu->arch.db[3], 3);
5199         get_debugreg(vcpu->arch.dr6, 6);
5200         vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5201
5202         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5203         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5204
5205         /*
5206          * exc_debug expects dr6 to be cleared after it runs, avoid that it sees
5207          * a stale dr6 from the guest.
5208          */
5209         set_debugreg(DR6_RESERVED, 6);
5210 }
5211
5212 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5213 {
5214         vmcs_writel(GUEST_DR7, val);
5215 }
5216
/*
 * Exit handler for TPR-below-threshold VM-exits: calls
 * kvm_apic_update_ppr() and returns 1.
 */
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
	kvm_apic_update_ppr(vcpu);

	return 1;
}
5222
5223 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5224 {
5225         exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5226
5227         kvm_make_request(KVM_REQ_EVENT, vcpu);
5228
5229         ++vcpu->stat.irq_window_exits;
5230         return 1;
5231 }
5232
/*
 * Exit handler for INVLPG VM-exits: passes the exit qualification to
 * kvm_mmu_invlpg() and then skips the instruction.  Returns the result of
 * kvm_skip_emulated_instruction().
 */
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
	unsigned long target = vmx_get_exit_qual(vcpu);

	kvm_mmu_invlpg(vcpu, target);

	return kvm_skip_emulated_instruction(vcpu);
}
5240
5241 static int handle_apic_access(struct kvm_vcpu *vcpu)
5242 {
5243         if (likely(fasteoi)) {
5244                 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5245                 int access_type, offset;
5246
5247                 access_type = exit_qualification & APIC_ACCESS_TYPE;
5248                 offset = exit_qualification & APIC_ACCESS_OFFSET;
5249                 /*
5250                  * Sane guest uses MOV to write EOI, with written value
5251                  * not cared. So make a short-circuit here by avoiding
5252                  * heavy instruction emulation.
5253                  */
5254                 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5255                     (offset == APIC_EOI)) {
5256                         kvm_lapic_set_eoi(vcpu);
5257                         return kvm_skip_emulated_instruction(vcpu);
5258                 }
5259         }
5260         return kvm_emulate_instruction(vcpu, 0);
5261 }
5262
/*
 * Exit handler for EOI-induced VM-exits.  The vector is taken from the low
 * eight bits of the exit qualification and passed to
 * kvm_apic_set_eoi_accelerated().  Always returns 1.
 */
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
	int vector = vmx_get_exit_qual(vcpu) & 0xff;

	/* The exit is trap-like, so the IP does not need to be adjusted. */
	kvm_apic_set_eoi_accelerated(vcpu, vector);

	return 1;
}
5272
5273 static int handle_apic_write(struct kvm_vcpu *vcpu)
5274 {
5275         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5276         u32 offset = exit_qualification & 0xfff;
5277
5278         /* APIC-write VM exit is trap-like and thus no need to adjust IP */
5279         kvm_apic_write_nodecode(vcpu, offset);
5280         return 1;
5281 }
5282
5283 static int handle_task_switch(struct kvm_vcpu *vcpu)
5284 {
5285         struct vcpu_vmx *vmx = to_vmx(vcpu);
5286         unsigned long exit_qualification;
5287         bool has_error_code = false;
5288         u32 error_code = 0;
5289         u16 tss_selector;
5290         int reason, type, idt_v, idt_index;
5291
5292         idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5293         idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5294         type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5295
5296         exit_qualification = vmx_get_exit_qual(vcpu);
5297
5298         reason = (u32)exit_qualification >> 30;
5299         if (reason == TASK_SWITCH_GATE && idt_v) {
5300                 switch (type) {
5301                 case INTR_TYPE_NMI_INTR:
5302                         vcpu->arch.nmi_injected = false;
5303                         vmx_set_nmi_mask(vcpu, true);
5304                         break;
5305                 case INTR_TYPE_EXT_INTR:
5306                 case INTR_TYPE_SOFT_INTR:
5307                         kvm_clear_interrupt_queue(vcpu);
5308                         break;
5309                 case INTR_TYPE_HARD_EXCEPTION:
5310                         if (vmx->idt_vectoring_info &
5311                             VECTORING_INFO_DELIVER_CODE_MASK) {
5312                                 has_error_code = true;
5313                                 error_code =
5314                                         vmcs_read32(IDT_VECTORING_ERROR_CODE);
5315                         }
5316                         fallthrough;
5317                 case INTR_TYPE_SOFT_EXCEPTION:
5318                         kvm_clear_exception_queue(vcpu);
5319                         break;
5320                 default:
5321                         break;
5322                 }
5323         }
5324         tss_selector = exit_qualification;
5325
5326         if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5327                        type != INTR_TYPE_EXT_INTR &&
5328                        type != INTR_TYPE_NMI_INTR))
5329                 WARN_ON(!skip_emulated_instruction(vcpu));
5330
5331         /*
5332          * TODO: What about debug traps on tss switch?
5333          *       Are we supposed to inject them and update dr6?
5334          */
5335         return kvm_task_switch(vcpu, tss_selector,
5336                                type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
5337                                reason, has_error_code, error_code);
5338 }
5339
5340 static int handle_ept_violation(struct kvm_vcpu *vcpu)
5341 {
5342         unsigned long exit_qualification;
5343         gpa_t gpa;
5344         u64 error_code;
5345
5346         exit_qualification = vmx_get_exit_qual(vcpu);
5347
5348         /*
5349          * EPT violation happened while executing iret from NMI,
5350          * "blocked by NMI" bit has to be set before next VM entry.
5351          * There are errata that may cause this bit to not be set:
5352          * AAK134, BY25.
5353          */
5354         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5355                         enable_vnmi &&
5356                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5357                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5358
5359         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5360         trace_kvm_page_fault(gpa, exit_qualification);
5361
5362         /* Is it a read fault? */
5363         error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
5364                      ? PFERR_USER_MASK : 0;
5365         /* Is it a write fault? */
5366         error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
5367                       ? PFERR_WRITE_MASK : 0;
5368         /* Is it a fetch fault? */
5369         error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
5370                       ? PFERR_FETCH_MASK : 0;
5371         /* ept page table entry is present? */
5372         error_code |= (exit_qualification &
5373                        (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
5374                         EPT_VIOLATION_EXECUTABLE))
5375                       ? PFERR_PRESENT_MASK : 0;
5376
5377         error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
5378                PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
5379
5380         vcpu->arch.exit_qualification = exit_qualification;
5381
5382         /*
5383          * Check that the GPA doesn't exceed physical memory limits, as that is
5384          * a guest page fault.  We have to emulate the instruction here, because
5385          * if the illegal address is that of a paging structure, then
5386          * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
5387          * would also use advanced VM-exit information for EPT violations to
5388          * reconstruct the page fault error code.
5389          */
5390         if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
5391                 return kvm_emulate_instruction(vcpu, 0);
5392
5393         return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5394 }
5395
5396 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5397 {
5398         gpa_t gpa;
5399
5400         if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
5401                 return 1;
5402
5403         /*
5404          * A nested guest cannot optimize MMIO vmexits, because we have an
5405          * nGPA here instead of the required GPA.
5406          */
5407         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5408         if (!is_guest_mode(vcpu) &&
5409             !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5410                 trace_kvm_fast_mmio(gpa);
5411                 return kvm_skip_emulated_instruction(vcpu);
5412         }
5413
5414         return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
5415 }
5416
5417 static int handle_nmi_window(struct kvm_vcpu *vcpu)
5418 {
5419         if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
5420                 return -EIO;
5421
5422         exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5423         ++vcpu->stat.nmi_window_exits;
5424         kvm_make_request(KVM_REQ_EVENT, vcpu);
5425
5426         return 1;
5427 }
5428
5429 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5430 {
5431         struct vcpu_vmx *vmx = to_vmx(vcpu);
5432         bool intr_window_requested;
5433         unsigned count = 130;
5434
5435         intr_window_requested = exec_controls_get(vmx) &
5436                                 CPU_BASED_INTR_WINDOW_EXITING;
5437
5438         while (vmx->emulation_required && count-- != 0) {
5439                 if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
5440                         return handle_interrupt_window(&vmx->vcpu);
5441
5442                 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
5443                         return 1;
5444
5445                 if (!kvm_emulate_instruction(vcpu, 0))
5446                         return 0;
5447
5448                 if (vmx->emulation_required && !vmx->rmode.vm86_active &&
5449                     vcpu->arch.exception.pending) {
5450                         kvm_prepare_emulation_failure_exit(vcpu);
5451                         return 0;
5452                 }
5453
5454                 if (vcpu->arch.halt_request) {
5455                         vcpu->arch.halt_request = 0;
5456                         return kvm_emulate_halt_noskip(vcpu);
5457                 }
5458
5459                 /*
5460                  * Note, return 1 and not 0, vcpu_run() will invoke
5461                  * xfer_to_guest_mode() which will create a proper return
5462                  * code.
5463                  */
5464                 if (__xfer_to_guest_mode_work_pending())
5465                         return 1;
5466         }
5467
5468         return 1;
5469 }
5470
5471 static void grow_ple_window(struct kvm_vcpu *vcpu)
5472 {
5473         struct vcpu_vmx *vmx = to_vmx(vcpu);
5474         unsigned int old = vmx->ple_window;
5475
5476         vmx->ple_window = __grow_ple_window(old, ple_window,
5477                                             ple_window_grow,
5478                                             ple_window_max);
5479
5480         if (vmx->ple_window != old) {
5481                 vmx->ple_window_dirty = true;
5482                 trace_kvm_ple_window_update(vcpu->vcpu_id,
5483                                             vmx->ple_window, old);
5484         }
5485 }
5486
5487 static void shrink_ple_window(struct kvm_vcpu *vcpu)
5488 {
5489         struct vcpu_vmx *vmx = to_vmx(vcpu);
5490         unsigned int old = vmx->ple_window;
5491
5492         vmx->ple_window = __shrink_ple_window(old, ple_window,
5493                                               ple_window_shrink,
5494                                               ple_window);
5495
5496         if (vmx->ple_window != old) {
5497                 vmx->ple_window_dirty = true;
5498                 trace_kvm_ple_window_update(vcpu->vcpu_id,
5499                                             vmx->ple_window, old);
5500         }
5501 }
5502
5503 /*
5504  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
5505  * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
5506  */
5507 static int handle_pause(struct kvm_vcpu *vcpu)
5508 {
5509         if (!kvm_pause_in_guest(vcpu->kvm))
5510                 grow_ple_window(vcpu);
5511
5512         /*
5513          * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
5514          * VM-execution control is ignored if CPL > 0. OTOH, KVM
5515          * never set PAUSE_EXITING and just set PLE if supported,
5516          * so the vcpu must be CPL=0 if it gets a PAUSE exit.
5517          */
5518         kvm_vcpu_on_spin(vcpu, true);
5519         return kvm_skip_emulated_instruction(vcpu);
5520 }
5521
/*
 * Exit handler installed for EXIT_REASON_MONITOR_TRAP_FLAG.  There is no
 * work to do here; returning 1 reports the exit as handled.
 */
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
	return 1;
}
5526
5527 static int handle_invpcid(struct kvm_vcpu *vcpu)
5528 {
5529         u32 vmx_instruction_info;
5530         unsigned long type;
5531         gva_t gva;
5532         struct {
5533                 u64 pcid;
5534                 u64 gla;
5535         } operand;
5536         int gpr_index;
5537
5538         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
5539                 kvm_queue_exception(vcpu, UD_VECTOR);
5540                 return 1;
5541         }
5542
5543         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5544         gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5545         type = kvm_register_read(vcpu, gpr_index);
5546
5547         /* According to the Intel instruction reference, the memory operand
5548          * is read even if it isn't needed (e.g., for type==all)
5549          */
5550         if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5551                                 vmx_instruction_info, false,
5552                                 sizeof(operand), &gva))
5553                 return 1;
5554
5555         return kvm_handle_invpcid(vcpu, type, gva);
5556 }
5557
5558 static int handle_pml_full(struct kvm_vcpu *vcpu)
5559 {
5560         unsigned long exit_qualification;
5561
5562         trace_kvm_pml_full(vcpu->vcpu_id);
5563
5564         exit_qualification = vmx_get_exit_qual(vcpu);
5565
5566         /*
5567          * PML buffer FULL happened while executing iret from NMI,
5568          * "blocked by NMI" bit has to be set before next VM entry.
5569          */
5570         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5571                         enable_vnmi &&
5572                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5573                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5574                                 GUEST_INTR_STATE_NMI);
5575
5576         /*
5577          * PML buffer already flushed at beginning of VMEXIT. Nothing to do
5578          * here.., and there's no userspace involvement needed for PML.
5579          */
5580         return 1;
5581 }
5582
5583 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
5584 {
5585         struct vcpu_vmx *vmx = to_vmx(vcpu);
5586
5587         if (!vmx->req_immediate_exit &&
5588             !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
5589                 kvm_lapic_expired_hv_timer(vcpu);
5590                 return EXIT_FASTPATH_REENTER_GUEST;
5591         }
5592
5593         return EXIT_FASTPATH_NONE;
5594 }
5595
/*
 * Slow-path preemption-timer exit: reuse the fastpath logic, ignore its
 * verdict, and report the exit as handled.
 */
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
	(void)handle_fastpath_preemption_timer(vcpu);

	return 1;
}
5601
5602 /*
5603  * When nested=0, all VMX instruction VM Exits filter here.  The handlers
5604  * are overwritten by nested_vmx_setup() when nested=1.
5605  */
5606 static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
5607 {
5608         kvm_queue_exception(vcpu, UD_VECTOR);
5609         return 1;
5610 }
5611
5612 #ifndef CONFIG_X86_SGX_KVM
5613 static int handle_encls(struct kvm_vcpu *vcpu)
5614 {
5615         /*
5616          * SGX virtualization is disabled.  There is no software enable bit for
5617          * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
5618          * the guest from executing ENCLS (when SGX is supported by hardware).
5619          */
5620         kvm_queue_exception(vcpu, UD_VECTOR);
5621         return 1;
5622 }
5623 #endif /* CONFIG_X86_SGX_KVM */
5624
5625 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
5626 {
5627         /*
5628          * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
5629          * VM-Exits. Unconditionally set the flag here and leave the handling to
5630          * vmx_handle_exit().
5631          */
5632         to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
5633         return 1;
5634 }
5635
5636 /*
5637  * The exit handlers return 1 if the exit was handled fully and guest execution
5638  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
5639  * to be done to userspace and return 0.
5640  */
5641 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5642         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
5643         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
5644         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
5645         [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
5646         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
5647         [EXIT_REASON_CR_ACCESS]               = handle_cr,
5648         [EXIT_REASON_DR_ACCESS]               = handle_dr,
5649         [EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
5650         [EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
5651         [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
5652         [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
5653         [EXIT_REASON_HLT]                     = kvm_emulate_halt,
5654         [EXIT_REASON_INVD]                    = kvm_emulate_invd,
5655         [EXIT_REASON_INVLPG]                  = handle_invlpg,
5656         [EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
5657         [EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
5658         [EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
5659         [EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
5660         [EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
5661         [EXIT_REASON_VMPTRST]                 = handle_vmx_instruction,
5662         [EXIT_REASON_VMREAD]                  = handle_vmx_instruction,
5663         [EXIT_REASON_VMRESUME]                = handle_vmx_instruction,
5664         [EXIT_REASON_VMWRITE]                 = handle_vmx_instruction,
5665         [EXIT_REASON_VMOFF]                   = handle_vmx_instruction,
5666         [EXIT_REASON_VMON]                    = handle_vmx_instruction,
5667         [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
5668         [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
5669         [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
5670         [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
5671         [EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
5672         [EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
5673         [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
5674         [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
5675         [EXIT_REASON_GDTR_IDTR]               = handle_desc,
5676         [EXIT_REASON_LDTR_TR]                 = handle_desc,
5677         [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
5678         [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
5679         [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
5680         [EXIT_REASON_MWAIT_INSTRUCTION]       = kvm_emulate_mwait,
5681         [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
5682         [EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
5683         [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
5684         [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
5685         [EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
5686         [EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
5687         [EXIT_REASON_PML_FULL]                = handle_pml_full,
5688         [EXIT_REASON_INVPCID]                 = handle_invpcid,
5689         [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
5690         [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
5691         [EXIT_REASON_ENCLS]                   = handle_encls,
5692         [EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
5693 };
5694
5695 static const int kvm_vmx_max_exit_handlers =
5696         ARRAY_SIZE(kvm_vmx_exit_handlers);
5697
5698 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
5699                               u64 *info1, u64 *info2,
5700                               u32 *intr_info, u32 *error_code)
5701 {
5702         struct vcpu_vmx *vmx = to_vmx(vcpu);
5703
5704         *reason = vmx->exit_reason.full;
5705         *info1 = vmx_get_exit_qual(vcpu);
5706         if (!(vmx->exit_reason.failed_vmentry)) {
5707                 *info2 = vmx->idt_vectoring_info;
5708                 *intr_info = vmx_get_intr_info(vcpu);
5709                 if (is_exception_with_error_code(*intr_info))
5710                         *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5711                 else
5712                         *error_code = 0;
5713         } else {
5714                 *info2 = 0;
5715                 *intr_info = 0;
5716                 *error_code = 0;
5717         }
5718 }
5719
5720 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
5721 {
5722         if (vmx->pml_pg) {
5723                 __free_page(vmx->pml_pg);
5724                 vmx->pml_pg = NULL;
5725         }
5726 }
5727
5728 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
5729 {
5730         struct vcpu_vmx *vmx = to_vmx(vcpu);
5731         u64 *pml_buf;
5732         u16 pml_idx;
5733
5734         pml_idx = vmcs_read16(GUEST_PML_INDEX);
5735
5736         /* Do nothing if PML buffer is empty */
5737         if (pml_idx == (PML_ENTITY_NUM - 1))
5738                 return;
5739
5740         /* PML index always points to next available PML buffer entity */
5741         if (pml_idx >= PML_ENTITY_NUM)
5742                 pml_idx = 0;
5743         else
5744                 pml_idx++;
5745
5746         pml_buf = page_address(vmx->pml_pg);
5747         for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
5748                 u64 gpa;
5749
5750                 gpa = pml_buf[pml_idx];
5751                 WARN_ON(gpa & (PAGE_SIZE - 1));
5752                 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
5753         }
5754
5755         /* reset PML index */
5756         vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
5757 }
5758
5759 static void vmx_dump_sel(char *name, uint32_t sel)
5760 {
5761         pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
5762                name, vmcs_read16(sel),
5763                vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
5764                vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
5765                vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
5766 }
5767
5768 static void vmx_dump_dtsel(char *name, uint32_t limit)
5769 {
5770         pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
5771                name, vmcs_read32(limit),
5772                vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
5773 }
5774
5775 static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
5776 {
5777         unsigned int i;
5778         struct vmx_msr_entry *e;
5779
5780         pr_err("MSR %s:\n", name);
5781         for (i = 0, e = m->val; i < m->nr; ++i, ++e)
5782                 pr_err("  %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
5783 }
5784
5785 void dump_vmcs(struct kvm_vcpu *vcpu)
5786 {
5787         struct vcpu_vmx *vmx = to_vmx(vcpu);
5788         u32 vmentry_ctl, vmexit_ctl;
5789         u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
5790         unsigned long cr4;
5791         int efer_slot;
5792
5793         if (!dump_invalid_vmcs) {
5794                 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
5795                 return;
5796         }
5797
5798         vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
5799         vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
5800         cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5801         pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
5802         cr4 = vmcs_readl(GUEST_CR4);
5803         secondary_exec_control = 0;
5804         if (cpu_has_secondary_exec_ctrls())
5805                 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
5806
5807         pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
5808                vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
5809         pr_err("*** Guest State ***\n");
5810         pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
5811                vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
5812                vmcs_readl(CR0_GUEST_HOST_MASK));
5813         pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
5814                cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
5815         pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
5816         if (cpu_has_vmx_ept()) {
5817                 pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
5818                        vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
5819                 pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
5820                        vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
5821         }
5822         pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
5823                vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
5824         pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
5825                vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
5826         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
5827                vmcs_readl(GUEST_SYSENTER_ESP),
5828                vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
5829         vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
5830         vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
5831         vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
5832         vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
5833         vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
5834         vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
5835         vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
5836         vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
5837         vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
5838         vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
5839         efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
5840         if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
5841                 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
5842         else if (efer_slot >= 0)
5843                 pr_err("EFER= 0x%016llx (autoload)\n",
5844                        vmx->msr_autoload.guest.val[efer_slot].value);
5845         else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
5846                 pr_err("EFER= 0x%016llx (effective)\n",
5847                        vcpu->arch.efer | (EFER_LMA | EFER_LME));
5848         else
5849                 pr_err("EFER= 0x%016llx (effective)\n",
5850                        vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
5851         if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
5852                 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
5853         pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
5854                vmcs_read64(GUEST_IA32_DEBUGCTL),
5855                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
5856         if (cpu_has_load_perf_global_ctrl() &&
5857             vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
5858                 pr_err("PerfGlobCtl = 0x%016llx\n",
5859                        vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
5860         if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
5861                 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
5862         pr_err("Interruptibility = %08x  ActivityState = %08x\n",
5863                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
5864                vmcs_read32(GUEST_ACTIVITY_STATE));
5865         if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
5866                 pr_err("InterruptStatus = %04x\n",
5867                        vmcs_read16(GUEST_INTR_STATUS));
5868         if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
5869                 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
5870         if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
5871                 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
5872
5873         pr_err("*** Host State ***\n");
5874         pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
5875                vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
5876         pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
5877                vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
5878                vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
5879                vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
5880                vmcs_read16(HOST_TR_SELECTOR));
5881         pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
5882                vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
5883                vmcs_readl(HOST_TR_BASE));
5884         pr_err("GDTBase=%016lx IDTBase=%016lx\n",
5885                vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
5886         pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
5887                vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
5888                vmcs_readl(HOST_CR4));
5889         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
5890                vmcs_readl(HOST_IA32_SYSENTER_ESP),
5891                vmcs_read32(HOST_IA32_SYSENTER_CS),
5892                vmcs_readl(HOST_IA32_SYSENTER_EIP));
5893         if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
5894                 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
5895         if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
5896                 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
5897         if (cpu_has_load_perf_global_ctrl() &&
5898             vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
5899                 pr_err("PerfGlobCtl = 0x%016llx\n",
5900                        vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
5901         if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
5902                 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
5903
5904         pr_err("*** Control State ***\n");
5905         pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
5906                pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
5907         pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
5908         pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
5909                vmcs_read32(EXCEPTION_BITMAP),
5910                vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
5911                vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
5912         pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
5913                vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
5914                vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
5915                vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
5916         pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
5917                vmcs_read32(VM_EXIT_INTR_INFO),
5918                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5919                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
5920         pr_err("        reason=%08x qualification=%016lx\n",
5921                vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
5922         pr_err("IDTVectoring: info=%08x errcode=%08x\n",
5923                vmcs_read32(IDT_VECTORING_INFO_FIELD),
5924                vmcs_read32(IDT_VECTORING_ERROR_CODE));
5925         pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
5926         if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
5927                 pr_err("TSC Multiplier = 0x%016llx\n",
5928                        vmcs_read64(TSC_MULTIPLIER));
5929         if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
5930                 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
5931                         u16 status = vmcs_read16(GUEST_INTR_STATUS);
5932                         pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
5933                 }
5934                 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
5935                 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
5936                         pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
5937                 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
5938         }
5939         if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
5940                 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
5941         if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
5942                 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
5943         if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
5944                 pr_err("PLE Gap=%08x Window=%08x\n",
5945                        vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
5946         if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
5947                 pr_err("Virtual processor ID = 0x%04x\n",
5948                        vmcs_read16(VIRTUAL_PROCESSOR_ID));
5949 }
5950
5951 /*
5952  * The guest has exited.  See if we can fix it or if we need userspace
5953  * assistance.
5954  */
5955 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
5956 {
5957         struct vcpu_vmx *vmx = to_vmx(vcpu);
5958         union vmx_exit_reason exit_reason = vmx->exit_reason;
5959         u32 vectoring_info = vmx->idt_vectoring_info;
5960         u16 exit_handler_index;
5961
5962         /*
5963          * Flush logged GPAs PML buffer, this will make dirty_bitmap more
5964          * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
5965          * querying dirty_bitmap, we only need to kick all vcpus out of guest
5966          * mode as if vcpus is in root mode, the PML buffer must has been
5967          * flushed already.  Note, PML is never enabled in hardware while
5968          * running L2.
5969          */
5970         if (enable_pml && !is_guest_mode(vcpu))
5971                 vmx_flush_pml_buffer(vcpu);
5972
5973         /*
5974          * KVM should never reach this point with a pending nested VM-Enter.
5975          * More specifically, short-circuiting VM-Entry to emulate L2 due to
5976          * invalid guest state should never happen as that means KVM knowingly
5977          * allowed a nested VM-Enter with an invalid vmcs12.  More below.
5978          */
5979         if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
5980                 return -EIO;
5981
5982         if (is_guest_mode(vcpu)) {
5983                 /*
5984                  * PML is never enabled when running L2, bail immediately if a
5985                  * PML full exit occurs as something is horribly wrong.
5986                  */
5987                 if (exit_reason.basic == EXIT_REASON_PML_FULL)
5988                         goto unexpected_vmexit;
5989
5990                 /*
5991                  * The host physical addresses of some pages of guest memory
5992                  * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
5993                  * Page). The CPU may write to these pages via their host
5994                  * physical address while L2 is running, bypassing any
5995                  * address-translation-based dirty tracking (e.g. EPT write
5996                  * protection).
5997                  *
5998                  * Mark them dirty on every exit from L2 to prevent them from
5999                  * getting out of sync with dirty tracking.
6000                  */
6001                 nested_mark_vmcs12_pages_dirty(vcpu);
6002
6003                 /*
6004                  * Synthesize a triple fault if L2 state is invalid.  In normal
6005                  * operation, nested VM-Enter rejects any attempt to enter L2
6006                  * with invalid state.  However, those checks are skipped if
6007                  * state is being stuffed via RSM or KVM_SET_NESTED_STATE.  If
6008                  * L2 state is invalid, it means either L1 modified SMRAM state
6009                  * or userspace provided bad state.  Synthesize TRIPLE_FAULT as
6010                  * doing so is architecturally allowed in the RSM case, and is
6011                  * the least awful solution for the userspace case without
6012                  * risking false positives.
6013                  */
6014                 if (vmx->emulation_required) {
6015                         nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
6016                         return 1;
6017                 }
6018
6019                 if (nested_vmx_reflect_vmexit(vcpu))
6020                         return 1;
6021         }
6022
6023         /* If guest state is invalid, start emulating.  L2 is handled above. */
6024         if (vmx->emulation_required)
6025                 return handle_invalid_guest_state(vcpu);
6026
6027         if (exit_reason.failed_vmentry) {
6028                 dump_vmcs(vcpu);
6029                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6030                 vcpu->run->fail_entry.hardware_entry_failure_reason
6031                         = exit_reason.full;
6032                 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6033                 return 0;
6034         }
6035
6036         if (unlikely(vmx->fail)) {
6037                 dump_vmcs(vcpu);
6038                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6039                 vcpu->run->fail_entry.hardware_entry_failure_reason
6040                         = vmcs_read32(VM_INSTRUCTION_ERROR);
6041                 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6042                 return 0;
6043         }
6044
6045         /*
6046          * Note:
6047          * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
6048          * delivery event since it indicates guest is accessing MMIO.
6049          * The vm-exit can be triggered again after return to guest that
6050          * will cause infinite loop.
6051          */
6052         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6053             (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6054              exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6055              exit_reason.basic != EXIT_REASON_PML_FULL &&
6056              exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6057              exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
6058                 int ndata = 3;
6059
6060                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6061                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6062                 vcpu->run->internal.data[0] = vectoring_info;
6063                 vcpu->run->internal.data[1] = exit_reason.full;
6064                 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
6065                 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
6066                         vcpu->run->internal.data[ndata++] =
6067                                 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6068                 }
6069                 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6070                 vcpu->run->internal.ndata = ndata;
6071                 return 0;
6072         }
6073
6074         if (unlikely(!enable_vnmi &&
6075                      vmx->loaded_vmcs->soft_vnmi_blocked)) {
6076                 if (!vmx_interrupt_blocked(vcpu)) {
6077                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6078                 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6079                            vcpu->arch.nmi_pending) {
6080                         /*
6081                          * This CPU don't support us in finding the end of an
6082                          * NMI-blocked window if the guest runs with IRQs
6083                          * disabled. So we pull the trigger after 1 s of
6084                          * futile waiting, but inform the user about this.
6085                          */
6086                         printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6087                                "state on VCPU %d after 1 s timeout\n",
6088                                __func__, vcpu->vcpu_id);
6089                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6090                 }
6091         }
6092
6093         if (exit_fastpath != EXIT_FASTPATH_NONE)
6094                 return 1;
6095
6096         if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
6097                 goto unexpected_vmexit;
6098 #ifdef CONFIG_RETPOLINE
6099         if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6100                 return kvm_emulate_wrmsr(vcpu);
6101         else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
6102                 return handle_preemption_timer(vcpu);
6103         else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
6104                 return handle_interrupt_window(vcpu);
6105         else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6106                 return handle_external_interrupt(vcpu);
6107         else if (exit_reason.basic == EXIT_REASON_HLT)
6108                 return kvm_emulate_halt(vcpu);
6109         else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
6110                 return handle_ept_misconfig(vcpu);
6111 #endif
6112
6113         exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6114                                                 kvm_vmx_max_exit_handlers);
6115         if (!kvm_vmx_exit_handlers[exit_handler_index])
6116                 goto unexpected_vmexit;
6117
6118         return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
6119
6120 unexpected_vmexit:
6121         vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6122                     exit_reason.full);
6123         dump_vmcs(vcpu);
6124         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6125         vcpu->run->internal.suberror =
6126                         KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
6127         vcpu->run->internal.ndata = 2;
6128         vcpu->run->internal.data[0] = exit_reason.full;
6129         vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
6130         return 0;
6131 }
6132
6133 static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6134 {
6135         int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6136
6137         /*
6138          * Exit to user space when bus lock detected to inform that there is
6139          * a bus lock in guest.
6140          */
6141         if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
6142                 if (ret > 0)
6143                         vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
6144
6145                 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
6146                 return 0;
6147         }
6148         return ret;
6149 }
6150
6151 /*
6152  * Software based L1D cache flush which is used when microcode providing
6153  * the cache control MSR is not loaded.
6154  *
6155  * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
6156  * flush it is required to read in 64 KiB because the replacement algorithm
6157  * is not exactly LRU. This could be sized at runtime via topology
6158  * information but as all relevant affected CPUs have 32KiB L1D cache size
6159  * there is no point in doing so.
6160  */
6161 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
6162 {
6163         int size = PAGE_SIZE << L1D_CACHE_ORDER;
6164
6165         /*
6166          * This code is only executed when the the flush mode is 'cond' or
6167          * 'always'
6168          */
6169         if (static_branch_likely(&vmx_l1d_flush_cond)) {
6170                 bool flush_l1d;
6171
6172                 /*
6173                  * Clear the per-vcpu flush bit, it gets set again
6174                  * either from vcpu_run() or from one of the unsafe
6175                  * VMEXIT handlers.
6176                  */
6177                 flush_l1d = vcpu->arch.l1tf_flush_l1d;
6178                 vcpu->arch.l1tf_flush_l1d = false;
6179
6180                 /*
6181                  * Clear the per-cpu flush bit, it gets set again from
6182                  * the interrupt handlers.
6183                  */
6184                 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
6185                 kvm_clear_cpu_l1tf_flush_l1d();
6186
6187                 if (!flush_l1d)
6188                         return;
6189         }
6190
6191         vcpu->stat.l1d_flush++;
6192
6193         if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
6194                 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
6195                 return;
6196         }
6197
6198         asm volatile(
6199                 /* First ensure the pages are in the TLB */
6200                 "xorl   %%eax, %%eax\n"
6201                 ".Lpopulate_tlb:\n\t"
6202                 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6203                 "addl   $4096, %%eax\n\t"
6204                 "cmpl   %%eax, %[size]\n\t"
6205                 "jne    .Lpopulate_tlb\n\t"
6206                 "xorl   %%eax, %%eax\n\t"
6207                 "cpuid\n\t"
6208                 /* Now fill the cache */
6209                 "xorl   %%eax, %%eax\n"
6210                 ".Lfill_cache:\n"
6211                 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6212                 "addl   $64, %%eax\n\t"
6213                 "cmpl   %%eax, %[size]\n\t"
6214                 "jne    .Lfill_cache\n\t"
6215                 "lfence\n"
6216                 :: [flush_pages] "r" (vmx_l1d_flush_pages),
6217                     [size] "r" (size)
6218                 : "eax", "ebx", "ecx", "edx");
6219 }
6220
6221 static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6222 {
6223         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6224         int tpr_threshold;
6225
6226         if (is_guest_mode(vcpu) &&
6227                 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6228                 return;
6229
6230         tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6231         if (is_guest_mode(vcpu))
6232                 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6233         else
6234                 vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6235 }
6236
6237 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6238 {
6239         struct vcpu_vmx *vmx = to_vmx(vcpu);
6240         u32 sec_exec_control;
6241
6242         if (!lapic_in_kernel(vcpu))
6243                 return;
6244
6245         if (!flexpriority_enabled &&
6246             !cpu_has_vmx_virtualize_x2apic_mode())
6247                 return;
6248
6249         /* Postpone execution until vmcs01 is the current VMCS. */
6250         if (is_guest_mode(vcpu)) {
6251                 vmx->nested.change_vmcs01_virtual_apic_mode = true;
6252                 return;
6253         }
6254
6255         sec_exec_control = secondary_exec_controls_get(vmx);
6256         sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6257                               SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6258
6259         switch (kvm_get_apic_mode(vcpu)) {
6260         case LAPIC_MODE_INVALID:
6261                 WARN_ONCE(true, "Invalid local APIC state");
6262                 break;
6263         case LAPIC_MODE_DISABLED:
6264                 break;
6265         case LAPIC_MODE_XAPIC:
6266                 if (flexpriority_enabled) {
6267                         sec_exec_control |=
6268                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6269                         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6270
6271                         /*
6272                          * Flush the TLB, reloading the APIC access page will
6273                          * only do so if its physical address has changed, but
6274                          * the guest may have inserted a non-APIC mapping into
6275                          * the TLB while the APIC access page was disabled.
6276                          */
6277                         kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
6278                 }
6279                 break;
6280         case LAPIC_MODE_X2APIC:
6281                 if (cpu_has_vmx_virtualize_x2apic_mode())
6282                         sec_exec_control |=
6283                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6284                 break;
6285         }
6286         secondary_exec_controls_set(vmx, sec_exec_control);
6287
6288         vmx_update_msr_bitmap_x2apic(vcpu);
6289 }
6290
6291 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
6292 {
6293         struct page *page;
6294
6295         /* Defer reload until vmcs01 is the current VMCS. */
6296         if (is_guest_mode(vcpu)) {
6297                 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
6298                 return;
6299         }
6300
6301         if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
6302             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6303                 return;
6304
6305         page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
6306         if (is_error_page(page))
6307                 return;
6308
6309         vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
6310         vmx_flush_tlb_current(vcpu);
6311
6312         /*
6313          * Do not pin apic access page in memory, the MMU notifier
6314          * will call us again if it is migrated or swapped out.
6315          */
6316         put_page(page);
6317 }
6318
6319 static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
6320 {
6321         u16 status;
6322         u8 old;
6323
6324         if (max_isr == -1)
6325                 max_isr = 0;
6326
6327         status = vmcs_read16(GUEST_INTR_STATUS);
6328         old = status >> 8;
6329         if (max_isr != old) {
6330                 status &= 0xff;
6331                 status |= max_isr << 8;
6332                 vmcs_write16(GUEST_INTR_STATUS, status);
6333         }
6334 }
6335
6336 static void vmx_set_rvi(int vector)
6337 {
6338         u16 status;
6339         u8 old;
6340
6341         if (vector == -1)
6342                 vector = 0;
6343
6344         status = vmcs_read16(GUEST_INTR_STATUS);
6345         old = (u8)status & 0xff;
6346         if ((u8)vector != old) {
6347                 status &= ~0xff;
6348                 status |= (u8)vector;
6349                 vmcs_write16(GUEST_INTR_STATUS, status);
6350         }
6351 }
6352
/* Propagate the highest pending interrupt to RVI, unless L2 is running. */
static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
	/*
	 * While L2 runs, RVI only matters if vmcs12 enables virtual-interrupt
	 * delivery.  That in turn is only possible when L1 also intercepts
	 * external interrupts, in which case the interrupt must be
	 * intercepted rather than written to vmcs02's RVI.  So there is
	 * nothing to do in guest mode.
	 */
	if (is_guest_mode(vcpu))
		return;

	vmx_set_rvi(max_irr);
}
6366
6367 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6368 {
6369         struct vcpu_vmx *vmx = to_vmx(vcpu);
6370         int max_irr;
6371         bool got_posted_interrupt;
6372
6373         if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
6374                 return -EIO;
6375
6376         if (pi_test_on(&vmx->pi_desc)) {
6377                 pi_clear_on(&vmx->pi_desc);
6378                 /*
6379                  * IOMMU can write to PID.ON, so the barrier matters even on UP.
6380                  * But on x86 this is just a compiler barrier anyway.
6381                  */
6382                 smp_mb__after_atomic();
6383                 got_posted_interrupt =
6384                         kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
6385         } else {
6386                 max_irr = kvm_lapic_find_highest_irr(vcpu);
6387                 got_posted_interrupt = false;
6388         }
6389
6390         /*
6391          * Newly recognized interrupts are injected via either virtual interrupt
6392          * delivery (RVI) or KVM_REQ_EVENT.  Virtual interrupt delivery is
6393          * disabled in two cases:
6394          *
6395          * 1) If L2 is running and the vCPU has a new pending interrupt.  If L1
6396          * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
6397          * VM-Exit to L1.  If L1 doesn't want to exit, the interrupt is injected
6398          * into L2, but KVM doesn't use virtual interrupt delivery to inject
6399          * interrupts into L2, and so KVM_REQ_EVENT is again needed.
6400          *
6401          * 2) If APICv is disabled for this vCPU, assigned devices may still
6402          * attempt to post interrupts.  The posted interrupt vector will cause
6403          * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
6404          */
6405         if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
6406                 vmx_set_rvi(max_irr);
6407         else if (got_posted_interrupt)
6408                 kvm_make_request(KVM_REQ_EVENT, vcpu);
6409
6410         return max_irr;
6411 }
6412
6413 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6414 {
6415         if (!kvm_vcpu_apicv_active(vcpu))
6416                 return;
6417
6418         vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6419         vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6420         vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6421         vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6422 }
6423
6424 static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
6425 {
6426         struct vcpu_vmx *vmx = to_vmx(vcpu);
6427
6428         pi_clear_on(&vmx->pi_desc);
6429         memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
6430 }
6431
6432 void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
6433
6434 static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
6435                                         unsigned long entry)
6436 {
6437         bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
6438
6439         kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
6440         vmx_do_interrupt_nmi_irqoff(entry);
6441         kvm_after_interrupt(vcpu);
6442 }
6443
6444 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
6445 {
6446         /*
6447          * Save xfd_err to guest_fpu before interrupt is enabled, so the
6448          * MSR value is not clobbered by the host activity before the guest
6449          * has chance to consume it.
6450          *
6451          * Do not blindly read xfd_err here, since this exception might
6452          * be caused by L1 interception on a platform which doesn't
6453          * support xfd at all.
6454          *
6455          * Do it conditionally upon guest_fpu::xfd. xfd_err matters
6456          * only when xfd contains a non-zero value.
6457          *
6458          * Queuing exception is done in vmx_handle_exit. See comment there.
6459          */
6460         if (vcpu->arch.guest_fpu.fpstate->xfd)
6461                 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
6462 }
6463
6464 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
6465 {
6466         const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
6467         u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
6468
6469         /* if exit due to PF check for async PF */
6470         if (is_page_fault(intr_info))
6471                 vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
6472         /* if exit due to NM, handle before interrupts are enabled */
6473         else if (is_nm_fault(intr_info))
6474                 handle_nm_fault_irqoff(&vmx->vcpu);
6475         /* Handle machine checks before interrupts are enabled */
6476         else if (is_machine_check(intr_info))
6477                 kvm_machine_check();
6478         /* We need to handle NMIs before interrupts are enabled */
6479         else if (is_nmi(intr_info))
6480                 handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
6481 }
6482
6483 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
6484 {
6485         u32 intr_info = vmx_get_intr_info(vcpu);
6486         unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
6487         gate_desc *desc = (gate_desc *)host_idt_base + vector;
6488
6489         if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
6490             "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
6491                 return;
6492
6493         handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
6494 }
6495
6496 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
6497 {
6498         struct vcpu_vmx *vmx = to_vmx(vcpu);
6499
6500         if (vmx->emulation_required)
6501                 return;
6502
6503         if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6504                 handle_external_interrupt_irqoff(vcpu);
6505         else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
6506                 handle_exception_nmi_irqoff(vmx);
6507 }
6508
6509 /*
6510  * The kvm parameter can be NULL (module initialization, or invocation before
6511  * VM creation). Be sure to check the kvm parameter before using it.
6512  */
6513 static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
6514 {
6515         switch (index) {
6516         case MSR_IA32_SMBASE:
6517                 /*
6518                  * We cannot do SMM unless we can run the guest in big
6519                  * real mode.
6520                  */
6521                 return enable_unrestricted_guest || emulate_invalid_guest_state;
6522         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
6523                 return nested;
6524         case MSR_AMD64_VIRT_SPEC_CTRL:
6525         case MSR_AMD64_TSC_RATIO:
6526                 /* This is AMD only.  */
6527                 return false;
6528         default:
6529                 return true;
6530         }
6531 }
6532
6533 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6534 {
6535         u32 exit_intr_info;
6536         bool unblock_nmi;
6537         u8 vector;
6538         bool idtv_info_valid;
6539
6540         idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6541
6542         if (enable_vnmi) {
6543                 if (vmx->loaded_vmcs->nmi_known_unmasked)
6544                         return;
6545
6546                 exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
6547                 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
6548                 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
6549                 /*
6550                  * SDM 3: 27.7.1.2 (September 2008)
6551                  * Re-set bit "block by NMI" before VM entry if vmexit caused by
6552                  * a guest IRET fault.
6553                  * SDM 3: 23.2.2 (September 2008)
6554                  * Bit 12 is undefined in any of the following cases:
6555                  *  If the VM exit sets the valid bit in the IDT-vectoring
6556                  *   information field.
6557                  *  If the VM exit is due to a double fault.
6558                  */
6559                 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
6560                     vector != DF_VECTOR && !idtv_info_valid)
6561                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6562                                       GUEST_INTR_STATE_NMI);
6563                 else
6564                         vmx->loaded_vmcs->nmi_known_unmasked =
6565                                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
6566                                   & GUEST_INTR_STATE_NMI);
6567         } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
6568                 vmx->loaded_vmcs->vnmi_blocked_time +=
6569                         ktime_to_ns(ktime_sub(ktime_get(),
6570                                               vmx->loaded_vmcs->entry_time));
6571 }
6572
6573 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
6574                                       u32 idt_vectoring_info,
6575                                       int instr_len_field,
6576                                       int error_code_field)
6577 {
6578         u8 vector;
6579         int type;
6580         bool idtv_info_valid;
6581
6582         idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6583
6584         vcpu->arch.nmi_injected = false;
6585         kvm_clear_exception_queue(vcpu);
6586         kvm_clear_interrupt_queue(vcpu);
6587
6588         if (!idtv_info_valid)
6589                 return;
6590
6591         kvm_make_request(KVM_REQ_EVENT, vcpu);
6592
6593         vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
6594         type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
6595
6596         switch (type) {
6597         case INTR_TYPE_NMI_INTR:
6598                 vcpu->arch.nmi_injected = true;
6599                 /*
6600                  * SDM 3: 27.7.1.2 (September 2008)
6601                  * Clear bit "block by NMI" before VM entry if a NMI
6602                  * delivery faulted.
6603                  */
6604                 vmx_set_nmi_mask(vcpu, false);
6605                 break;
6606         case INTR_TYPE_SOFT_EXCEPTION:
6607                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6608                 fallthrough;
6609         case INTR_TYPE_HARD_EXCEPTION:
6610                 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
6611                         u32 err = vmcs_read32(error_code_field);
6612                         kvm_requeue_exception_e(vcpu, vector, err);
6613                 } else
6614                         kvm_requeue_exception(vcpu, vector);
6615                 break;
6616         case INTR_TYPE_SOFT_INTR:
6617                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6618                 fallthrough;
6619         case INTR_TYPE_EXT_INTR:
6620                 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
6621                 break;
6622         default:
6623                 break;
6624         }
6625 }
6626
6627 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
6628 {
6629         __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
6630                                   VM_EXIT_INSTRUCTION_LEN,
6631                                   IDT_VECTORING_ERROR_CODE);
6632 }
6633
6634 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
6635 {
6636         __vmx_complete_interrupts(vcpu,
6637                                   vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6638                                   VM_ENTRY_INSTRUCTION_LEN,
6639                                   VM_ENTRY_EXCEPTION_ERROR_CODE);
6640
6641         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
6642 }
6643
6644 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
6645 {
6646         int i, nr_msrs;
6647         struct perf_guest_switch_msr *msrs;
6648
6649         /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
6650         msrs = perf_guest_get_msrs(&nr_msrs);
6651         if (!msrs)
6652                 return;
6653
6654         for (i = 0; i < nr_msrs; i++)
6655                 if (msrs[i].host == msrs[i].guest)
6656                         clear_atomic_switch_msr(vmx, msrs[i].msr);
6657                 else
6658                         add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
6659                                         msrs[i].host, false);
6660 }
6661
6662 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
6663 {
6664         struct vcpu_vmx *vmx = to_vmx(vcpu);
6665         u64 tscl;
6666         u32 delta_tsc;
6667
6668         if (vmx->req_immediate_exit) {
6669                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
6670                 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
6671         } else if (vmx->hv_deadline_tsc != -1) {
6672                 tscl = rdtsc();
6673                 if (vmx->hv_deadline_tsc > tscl)
6674                         /* set_hv_timer ensures the delta fits in 32-bits */
6675                         delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
6676                                 cpu_preemption_timer_multi);
6677                 else
6678                         delta_tsc = 0;
6679
6680                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
6681                 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
6682         } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
6683                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
6684                 vmx->loaded_vmcs->hv_timer_soft_disabled = true;
6685         }
6686 }
6687
6688 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
6689 {
6690         if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
6691                 vmx->loaded_vmcs->host_state.rsp = host_rsp;
6692                 vmcs_writel(HOST_RSP, host_rsp);
6693         }
6694 }
6695
6696 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
6697 {
6698         switch (to_vmx(vcpu)->exit_reason.basic) {
6699         case EXIT_REASON_MSR_WRITE:
6700                 return handle_fastpath_set_msr_irqoff(vcpu);
6701         case EXIT_REASON_PREEMPTION_TIMER:
6702                 return handle_fastpath_preemption_timer(vcpu);
6703         default:
6704                 return EXIT_FASTPATH_NONE;
6705         }
6706 }
6707
6708 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
6709                                         struct vcpu_vmx *vmx)
6710 {
6711         kvm_guest_enter_irqoff();
6712
6713         /* L1D Flush includes CPU buffer clear to mitigate MDS */
6714         if (static_branch_unlikely(&vmx_l1d_should_flush))
6715                 vmx_l1d_flush(vcpu);
6716         else if (static_branch_unlikely(&mds_user_clear))
6717                 mds_clear_cpu_buffers();
6718
6719         if (vcpu->arch.cr2 != native_read_cr2())
6720                 native_write_cr2(vcpu->arch.cr2);
6721
6722         vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
6723                                    vmx->loaded_vmcs->launched);
6724
6725         vcpu->arch.cr2 = native_read_cr2();
6726
6727         kvm_guest_exit_irqoff();
6728 }
6729
6730 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
6731 {
6732         struct vcpu_vmx *vmx = to_vmx(vcpu);
6733         unsigned long cr4;
6734
6735         /* Record the guest's net vcpu time for enforced NMI injections. */
6736         if (unlikely(!enable_vnmi &&
6737                      vmx->loaded_vmcs->soft_vnmi_blocked))
6738                 vmx->loaded_vmcs->entry_time = ktime_get();
6739
6740         /*
6741          * Don't enter VMX if guest state is invalid, let the exit handler
6742          * start emulation until we arrive back to a valid state.  Synthesize a
6743          * consistency check VM-Exit due to invalid guest state and bail.
6744          */
6745         if (unlikely(vmx->emulation_required)) {
6746                 vmx->fail = 0;
6747
6748                 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
6749                 vmx->exit_reason.failed_vmentry = 1;
6750                 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
6751                 vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
6752                 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
6753                 vmx->exit_intr_info = 0;
6754                 return EXIT_FASTPATH_NONE;
6755         }
6756
6757         trace_kvm_entry(vcpu);
6758
6759         if (vmx->ple_window_dirty) {
6760                 vmx->ple_window_dirty = false;
6761                 vmcs_write32(PLE_WINDOW, vmx->ple_window);
6762         }
6763
6764         /*
6765          * We did this in prepare_switch_to_guest, because it needs to
6766          * be within srcu_read_lock.
6767          */
6768         WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
6769
6770         if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
6771                 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
6772         if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
6773                 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
6774         vcpu->arch.regs_dirty = 0;
6775
6776         cr4 = cr4_read_shadow();
6777         if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
6778                 vmcs_writel(HOST_CR4, cr4);
6779                 vmx->loaded_vmcs->host_state.cr4 = cr4;
6780         }
6781
6782         /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
6783         if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
6784                 set_debugreg(vcpu->arch.dr6, 6);
6785
6786         /* When single-stepping over STI and MOV SS, we must clear the
6787          * corresponding interruptibility bits in the guest state. Otherwise
6788          * vmentry fails as it then expects bit 14 (BS) in pending debug
6789          * exceptions being set, but that's not correct for the guest debugging
6790          * case. */
6791         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6792                 vmx_set_interrupt_shadow(vcpu, 0);
6793
6794         kvm_load_guest_xsave_state(vcpu);
6795
6796         pt_guest_enter(vmx);
6797
6798         atomic_switch_perf_msrs(vmx);
6799         if (intel_pmu_lbr_is_enabled(vcpu))
6800                 vmx_passthrough_lbr_msrs(vcpu);
6801
6802         if (enable_preemption_timer)
6803                 vmx_update_hv_timer(vcpu);
6804
6805         kvm_wait_lapic_expire(vcpu);
6806
6807         /*
6808          * If this vCPU has touched SPEC_CTRL, restore the guest's value if
6809          * it's non-zero. Since vmentry is serialising on affected CPUs, there
6810          * is no need to worry about the conditional branch over the wrmsr
6811          * being speculatively taken.
6812          */
6813         x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
6814
6815         /* The actual VMENTER/EXIT is in the .noinstr.text section. */
6816         vmx_vcpu_enter_exit(vcpu, vmx);
6817
6818         /*
6819          * We do not use IBRS in the kernel. If this vCPU has used the
6820          * SPEC_CTRL MSR it may have left it on; save the value and
6821          * turn it off. This is much more efficient than blindly adding
6822          * it to the atomic save/restore list. Especially as the former
6823          * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
6824          *
6825          * For non-nested case:
6826          * If the L01 MSR bitmap does not intercept the MSR, then we need to
6827          * save it.
6828          *
6829          * For nested case:
6830          * If the L02 MSR bitmap does not intercept the MSR, then we need to
6831          * save it.
6832          */
6833         if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
6834                 vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
6835
6836         x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
6837
6838         /* All fields are clean at this point */
6839         if (static_branch_unlikely(&enable_evmcs)) {
6840                 current_evmcs->hv_clean_fields |=
6841                         HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
6842
6843                 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
6844         }
6845
6846         /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
6847         if (vmx->host_debugctlmsr)
6848                 update_debugctlmsr(vmx->host_debugctlmsr);
6849
6850 #ifndef CONFIG_X86_64
6851         /*
6852          * The sysexit path does not restore ds/es, so we must set them to
6853          * a reasonable value ourselves.
6854          *
6855          * We can't defer this to vmx_prepare_switch_to_host() since that
6856          * function may be executed in interrupt context, which saves and
6857          * restore segments around it, nullifying its effect.
6858          */
6859         loadsegment(ds, __USER_DS);
6860         loadsegment(es, __USER_DS);
6861 #endif
6862
6863         vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
6864
6865         pt_guest_exit(vmx);
6866
6867         kvm_load_host_xsave_state(vcpu);
6868
6869         if (is_guest_mode(vcpu)) {
6870                 /*
6871                  * Track VMLAUNCH/VMRESUME that have made past guest state
6872                  * checking.
6873                  */
6874                 if (vmx->nested.nested_run_pending &&
6875                     !vmx->exit_reason.failed_vmentry)
6876                         ++vcpu->stat.nested_run;
6877
6878                 vmx->nested.nested_run_pending = 0;
6879         }
6880
6881         vmx->idt_vectoring_info = 0;
6882
6883         if (unlikely(vmx->fail)) {
6884                 vmx->exit_reason.full = 0xdead;
6885                 return EXIT_FASTPATH_NONE;
6886         }
6887
6888         vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
6889         if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
6890                 kvm_machine_check();
6891
6892         if (likely(!vmx->exit_reason.failed_vmentry))
6893                 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
6894
6895         trace_kvm_exit(vcpu, KVM_ISA_VMX);
6896
6897         if (unlikely(vmx->exit_reason.failed_vmentry))
6898                 return EXIT_FASTPATH_NONE;
6899
6900         vmx->loaded_vmcs->launched = 1;
6901
6902         vmx_recover_nmi_blocking(vmx);
6903         vmx_complete_interrupts(vmx);
6904
6905         if (is_guest_mode(vcpu))
6906                 return EXIT_FASTPATH_NONE;
6907
6908         return vmx_exit_handlers_fastpath(vcpu);
6909 }
6910
6911 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
6912 {
6913         struct vcpu_vmx *vmx = to_vmx(vcpu);
6914
6915         if (enable_pml)
6916                 vmx_destroy_pml_buffer(vmx);
6917         free_vpid(vmx->vpid);
6918         nested_vmx_free_vcpu(vcpu);
6919         free_loaded_vmcs(vmx->loaded_vmcs);
6920 }
6921
6922 static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
6923 {
6924         struct vmx_uret_msr *tsx_ctrl;
6925         struct vcpu_vmx *vmx;
6926         int i, err;
6927
6928         BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
6929         vmx = to_vmx(vcpu);
6930
6931         err = -ENOMEM;
6932
6933         vmx->vpid = allocate_vpid();
6934
6935         /*
6936          * If PML is turned on, failure on enabling PML just results in failure
6937          * of creating the vcpu, therefore we can simplify PML logic (by
6938          * avoiding dealing with cases, such as enabling PML partially on vcpus
6939          * for the guest), etc.
6940          */
6941         if (enable_pml) {
6942                 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
6943                 if (!vmx->pml_pg)
6944                         goto free_vpid;
6945         }
6946
6947         for (i = 0; i < kvm_nr_uret_msrs; ++i)
6948                 vmx->guest_uret_msrs[i].mask = -1ull;
6949         if (boot_cpu_has(X86_FEATURE_RTM)) {
6950                 /*
6951                  * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
6952                  * Keep the host value unchanged to avoid changing CPUID bits
6953                  * under the host kernel's feet.
6954                  */
6955                 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
6956                 if (tsx_ctrl)
6957                         tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
6958         }
6959
6960         err = alloc_loaded_vmcs(&vmx->vmcs01);
6961         if (err < 0)
6962                 goto free_pml;
6963
6964         /*
6965          * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
6966          * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
6967          * feature only for vmcs01, KVM currently isn't equipped to realize any
6968          * performance benefits from enabling it for vmcs02.
6969          */
6970         if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
6971             (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
6972                 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
6973
6974                 evmcs->hv_enlightenments_control.msr_bitmap = 1;
6975         }
6976
6977         /* The MSR bitmap starts with all ones */
6978         bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
6979         bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
6980
6981         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
6982 #ifdef CONFIG_X86_64
6983         vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
6984         vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
6985         vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
6986 #endif
6987         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
6988         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
6989         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
6990         if (kvm_cstate_in_guest(vcpu->kvm)) {
6991                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
6992                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
6993                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
6994                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
6995         }
6996
6997         vmx->loaded_vmcs = &vmx->vmcs01;
6998
6999         if (cpu_need_virtualize_apic_accesses(vcpu)) {
7000                 err = alloc_apic_access_page(vcpu->kvm);
7001                 if (err)
7002                         goto free_vmcs;
7003         }
7004
7005         if (enable_ept && !enable_unrestricted_guest) {
7006                 err = init_rmode_identity_map(vcpu->kvm);
7007                 if (err)
7008                         goto free_vmcs;
7009         }
7010
7011         return 0;
7012
7013 free_vmcs:
7014         free_loaded_vmcs(vmx->loaded_vmcs);
7015 free_pml:
7016         vmx_destroy_pml_buffer(vmx);
7017 free_vpid:
7018         free_vpid(vmx->vpid);
7019         return err;
7020 }
7021
7022 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7023 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7024
7025 static int vmx_vm_init(struct kvm *kvm)
7026 {
7027         if (!ple_gap)
7028                 kvm->arch.pause_in_guest = true;
7029
7030         if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7031                 switch (l1tf_mitigation) {
7032                 case L1TF_MITIGATION_OFF:
7033                 case L1TF_MITIGATION_FLUSH_NOWARN:
7034                         /* 'I explicitly don't care' is set */
7035                         break;
7036                 case L1TF_MITIGATION_FLUSH:
7037                 case L1TF_MITIGATION_FLUSH_NOSMT:
7038                 case L1TF_MITIGATION_FULL:
7039                         /*
7040                          * Warn upon starting the first VM in a potentially
7041                          * insecure environment.
7042                          */
7043                         if (sched_smt_active())
7044                                 pr_warn_once(L1TF_MSG_SMT);
7045                         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7046                                 pr_warn_once(L1TF_MSG_L1D);
7047                         break;
7048                 case L1TF_MITIGATION_FULL_FORCE:
7049                         /* Flush is enforced */
7050                         break;
7051                 }
7052         }
7053         return 0;
7054 }
7055
7056 static int __init vmx_check_processor_compat(void)
7057 {
7058         struct vmcs_config vmcs_conf;
7059         struct vmx_capability vmx_cap;
7060
7061         if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
7062             !this_cpu_has(X86_FEATURE_VMX)) {
7063                 pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
7064                 return -EIO;
7065         }
7066
7067         if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
7068                 return -EIO;
7069         if (nested)
7070                 nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
7071         if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
7072                 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
7073                                 smp_processor_id());
7074                 return -EIO;
7075         }
7076         return 0;
7077 }
7078
7079 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7080 {
7081         u8 cache;
7082
7083         /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
7084          * memory aliases with conflicting memory types and sometimes MCEs.
7085          * We have to be careful as to what are honored and when.
7086          *
7087          * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
7088          * UC.  The effective memory type is UC or WC depending on guest PAT.
7089          * This was historically the source of MCEs and we want to be
7090          * conservative.
7091          *
7092          * When there is no need to deal with noncoherent DMA (e.g., no VT-d
7093          * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
7094          * EPT memory type is set to WB.  The effective memory type is forced
7095          * WB.
7096          *
7097          * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
7098          * EPT memory type is used to emulate guest CD/MTRR.
7099          */
7100
7101         if (is_mmio)
7102                 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
7103
7104         if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
7105                 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7106
7107         if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
7108                 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
7109                         cache = MTRR_TYPE_WRBACK;
7110                 else
7111                         cache = MTRR_TYPE_UNCACHABLE;
7112
7113                 return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7114         }
7115
7116         return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
7117 }
7118
7119 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
7120 {
7121         /*
7122          * These bits in the secondary execution controls field
7123          * are dynamic, the others are mostly based on the hypervisor
7124          * architecture and the guest's CPUID.  Do not touch the
7125          * dynamic bits.
7126          */
7127         u32 mask =
7128                 SECONDARY_EXEC_SHADOW_VMCS |
7129                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7130                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7131                 SECONDARY_EXEC_DESC;
7132
7133         u32 cur_ctl = secondary_exec_controls_get(vmx);
7134
7135         secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
7136 }
7137
7138 /*
7139  * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7140  * (indicating "allowed-1") if they are supported in the guest's CPUID.
7141  */
7142 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7143 {
7144         struct vcpu_vmx *vmx = to_vmx(vcpu);
7145         struct kvm_cpuid_entry2 *entry;
7146
7147         vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7148         vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
7149
7150 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {            \
7151         if (entry && (entry->_reg & (_cpuid_mask)))                     \
7152                 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);     \
7153 } while (0)
7154
7155         entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
7156         cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
7157         cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
7158         cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
7159         cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
7160         cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
7161         cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
7162         cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
7163         cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
7164         cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
7165         cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7166         cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
7167         cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
7168         cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
7169         cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));
7170
7171         entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
7172         cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
7173         cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
7174         cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
7175         cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
7176         cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
7177         cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
7178
7179 #undef cr4_fixed1_update
7180 }
7181
7182 static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
7183 {
7184         struct vcpu_vmx *vmx = to_vmx(vcpu);
7185
7186         if (kvm_mpx_supported()) {
7187                 bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
7188
7189                 if (mpx_enabled) {
7190                         vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
7191                         vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
7192                 } else {
7193                         vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
7194                         vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
7195                 }
7196         }
7197 }
7198
7199 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7200 {
7201         struct vcpu_vmx *vmx = to_vmx(vcpu);
7202         struct kvm_cpuid_entry2 *best = NULL;
7203         int i;
7204
7205         for (i = 0; i < PT_CPUID_LEAVES; i++) {
7206                 best = kvm_find_cpuid_entry(vcpu, 0x14, i);
7207                 if (!best)
7208                         return;
7209                 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7210                 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7211                 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7212                 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7213         }
7214
7215         /* Get the number of configurable Address Ranges for filtering */
7216         vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
7217                                                 PT_CAP_num_address_ranges);
7218
7219         /* Initialize and clear the no dependency bits */
7220         vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7221                         RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
7222                         RTIT_CTL_BRANCH_EN);
7223
7224         /*
7225          * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
7226          * will inject an #GP
7227          */
7228         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7229                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7230
7231         /*
7232          * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7233          * PSBFreq can be set
7234          */
7235         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7236                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7237                                 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7238
7239         /*
7240          * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
7241          */
7242         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7243                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7244                                               RTIT_CTL_MTC_RANGE);
7245
7246         /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7247         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7248                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7249                                                         RTIT_CTL_PTW_EN);
7250
7251         /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7252         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7253                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7254
7255         /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7256         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7257                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7258
7259         /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
7260         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7261                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7262
7263         /* unmask address range configure area */
7264         for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
7265                 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
7266 }
7267
7268 static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
7269 {
7270         struct vcpu_vmx *vmx = to_vmx(vcpu);
7271
7272         /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
7273         vcpu->arch.xsaves_enabled = false;
7274
7275         vmx_setup_uret_msrs(vmx);
7276
7277         if (cpu_has_secondary_exec_ctrls())
7278                 vmcs_set_secondary_exec_control(vmx,
7279                                                 vmx_secondary_exec_control(vmx));
7280
7281         if (nested_vmx_allowed(vcpu))
7282                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
7283                         FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7284                         FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
7285         else
7286                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
7287                         ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7288                           FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
7289
7290         if (nested_vmx_allowed(vcpu)) {
7291                 nested_vmx_cr_fixed1_bits_update(vcpu);
7292                 nested_vmx_entry_exit_ctls_update(vcpu);
7293         }
7294
7295         if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7296                         guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7297                 update_intel_pt_cfg(vcpu);
7298
7299         if (boot_cpu_has(X86_FEATURE_RTM)) {
7300                 struct vmx_uret_msr *msr;
7301                 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7302                 if (msr) {
7303                         bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7304                         vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
7305                 }
7306         }
7307
7308         if (kvm_cpu_cap_has(X86_FEATURE_XFD))
7309                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
7310                                           !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
7311
7312
7313         set_cr4_guest_host_mask(vmx);
7314
7315         vmx_write_encls_bitmap(vcpu, NULL);
7316         if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
7317                 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
7318         else
7319                 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
7320
7321         if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
7322                 vmx->msr_ia32_feature_control_valid_bits |=
7323                         FEAT_CTL_SGX_LC_ENABLED;
7324         else
7325                 vmx->msr_ia32_feature_control_valid_bits &=
7326                         ~FEAT_CTL_SGX_LC_ENABLED;
7327
7328         /* Refresh #PF interception to account for MAXPHYADDR changes. */
7329         vmx_update_exception_bitmap(vcpu);
7330 }
7331
7332 static __init void vmx_set_cpu_caps(void)
7333 {
7334         kvm_set_cpu_caps();
7335
7336         /* CPUID 0x1 */
7337         if (nested)
7338                 kvm_cpu_cap_set(X86_FEATURE_VMX);
7339
7340         /* CPUID 0x7 */
7341         if (kvm_mpx_supported())
7342                 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
7343         if (!cpu_has_vmx_invpcid())
7344                 kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
7345         if (vmx_pt_mode_is_host_guest())
7346                 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
7347
7348         if (!enable_sgx) {
7349                 kvm_cpu_cap_clear(X86_FEATURE_SGX);
7350                 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
7351                 kvm_cpu_cap_clear(X86_FEATURE_SGX1);
7352                 kvm_cpu_cap_clear(X86_FEATURE_SGX2);
7353         }
7354
7355         if (vmx_umip_emulated())
7356                 kvm_cpu_cap_set(X86_FEATURE_UMIP);
7357
7358         /* CPUID 0xD.1 */
7359         supported_xss = 0;
7360         if (!cpu_has_vmx_xsaves())
7361                 kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
7362
7363         /* CPUID 0x80000001 and 0x7 (RDPID) */
7364         if (!cpu_has_vmx_rdtscp()) {
7365                 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
7366                 kvm_cpu_cap_clear(X86_FEATURE_RDPID);
7367         }
7368
7369         if (cpu_has_vmx_waitpkg())
7370                 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
7371 }
7372
7373 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
7374 {
7375         to_vmx(vcpu)->req_immediate_exit = true;
7376 }
7377
7378 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
7379                                   struct x86_instruction_info *info)
7380 {
7381         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7382         unsigned short port;
7383         bool intercept;
7384         int size;
7385
7386         if (info->intercept == x86_intercept_in ||
7387             info->intercept == x86_intercept_ins) {
7388                 port = info->src_val;
7389                 size = info->dst_bytes;
7390         } else {
7391                 port = info->dst_val;
7392                 size = info->src_bytes;
7393         }
7394
7395         /*
7396          * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
7397          * VM-exits depend on the 'unconditional IO exiting' VM-execution
7398          * control.
7399          *
7400          * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
7401          */
7402         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7403                 intercept = nested_cpu_has(vmcs12,
7404                                            CPU_BASED_UNCOND_IO_EXITING);
7405         else
7406                 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
7407
7408         /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
7409         return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
7410 }
7411
7412 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7413                                struct x86_instruction_info *info,
7414                                enum x86_intercept_stage stage,
7415                                struct x86_exception *exception)
7416 {
7417         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7418
7419         switch (info->intercept) {
7420         /*
7421          * RDPID causes #UD if disabled through secondary execution controls.
7422          * Because it is marked as EmulateOnUD, we need to intercept it here.
7423          * Note, RDPID is hidden behind ENABLE_RDTSCP.
7424          */
7425         case x86_intercept_rdpid:
7426                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
7427                         exception->vector = UD_VECTOR;
7428                         exception->error_code_valid = false;
7429                         return X86EMUL_PROPAGATE_FAULT;
7430                 }
7431                 break;
7432
7433         case x86_intercept_in:
7434         case x86_intercept_ins:
7435         case x86_intercept_out:
7436         case x86_intercept_outs:
7437                 return vmx_check_intercept_io(vcpu, info);
7438
7439         case x86_intercept_lgdt:
7440         case x86_intercept_lidt:
7441         case x86_intercept_lldt:
7442         case x86_intercept_ltr:
7443         case x86_intercept_sgdt:
7444         case x86_intercept_sidt:
7445         case x86_intercept_sldt:
7446         case x86_intercept_str:
7447                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
7448                         return X86EMUL_CONTINUE;
7449
7450                 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
7451                 break;
7452
7453         /* TODO: check more intercepts... */
7454         default:
7455                 break;
7456         }
7457
7458         return X86EMUL_UNHANDLEABLE;
7459 }
7460
7461 #ifdef CONFIG_X86_64
7462 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */
7463 static inline int u64_shl_div_u64(u64 a, unsigned int shift,
7464                                   u64 divisor, u64 *result)
7465 {
7466         u64 low = a << shift, high = a >> (64 - shift);
7467
7468         /* To avoid the overflow on divq */
7469         if (high >= divisor)
7470                 return 1;
7471
7472         /* Low hold the result, high hold rem which is discarded */
7473         asm("divq %2\n\t" : "=a" (low), "=d" (high) :
7474             "rm" (divisor), "0" (low), "1" (high));
7475         *result = low;
7476
7477         return 0;
7478 }
7479
7480 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
7481                             bool *expired)
7482 {
7483         struct vcpu_vmx *vmx;
7484         u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
7485         struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
7486
7487         vmx = to_vmx(vcpu);
7488         tscl = rdtsc();
7489         guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
7490         delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
7491         lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
7492                                                     ktimer->timer_advance_ns);
7493
7494         if (delta_tsc > lapic_timer_advance_cycles)
7495                 delta_tsc -= lapic_timer_advance_cycles;
7496         else
7497                 delta_tsc = 0;
7498
7499         /* Convert to host delta tsc if tsc scaling is enabled */
7500         if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
7501             delta_tsc && u64_shl_div_u64(delta_tsc,
7502                                 kvm_tsc_scaling_ratio_frac_bits,
7503                                 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
7504                 return -ERANGE;
7505
7506         /*
7507          * If the delta tsc can't fit in the 32 bit after the multi shift,
7508          * we can't use the preemption timer.
7509          * It's possible that it fits on later vmentries, but checking
7510          * on every vmentry is costly so we just use an hrtimer.
7511          */
7512         if (delta_tsc >> (cpu_preemption_timer_multi + 32))
7513                 return -ERANGE;
7514
7515         vmx->hv_deadline_tsc = tscl + delta_tsc;
7516         *expired = !delta_tsc;
7517         return 0;
7518 }
7519
7520 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
7521 {
7522         to_vmx(vcpu)->hv_deadline_tsc = -1;
7523 }
7524 #endif
7525
7526 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
7527 {
7528         if (!kvm_pause_in_guest(vcpu->kvm))
7529                 shrink_ple_window(vcpu);
7530 }
7531
7532 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
7533 {
7534         struct vcpu_vmx *vmx = to_vmx(vcpu);
7535
7536         if (is_guest_mode(vcpu)) {
7537                 vmx->nested.update_vmcs01_cpu_dirty_logging = true;
7538                 return;
7539         }
7540
7541         /*
7542          * Note, cpu_dirty_logging_count can be changed concurrent with this
7543          * code, but in that case another update request will be made and so
7544          * the guest will never run with a stale PML value.
7545          */
7546         if (vcpu->kvm->arch.cpu_dirty_logging_count)
7547                 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
7548         else
7549                 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
7550 }
7551
/*
 * Pre-block hook.  Returns 1 if pi_pre_block() reports a non-zero result.
 * Otherwise switches the local APIC timer to the software timer when the hv
 * timer is in use, and returns 0.
 */
static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
	int ret = 0;

	if (pi_pre_block(vcpu)) {
		ret = 1;
	} else if (kvm_lapic_hv_timer_in_use(vcpu)) {
		kvm_lapic_switch_to_sw_timer(vcpu);
	}

	return ret;
}
7562
7563 static void vmx_post_block(struct kvm_vcpu *vcpu)
7564 {
7565         if (kvm_x86_ops.set_hv_timer)
7566                 kvm_lapic_switch_to_hv_timer(vcpu);
7567
7568         pi_post_block(vcpu);
7569 }
7570
7571 static void vmx_setup_mce(struct kvm_vcpu *vcpu)
7572 {
7573         if (vcpu->arch.mcg_cap & MCG_LMCE_P)
7574                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
7575                         FEAT_CTL_LMCE_ENABLED;
7576         else
7577                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
7578                         ~FEAT_CTL_LMCE_ENABLED;
7579 }
7580
7581 static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
7582 {
7583         /* we need a nested vmexit to enter SMM, postpone if run is pending */
7584         if (to_vmx(vcpu)->nested.nested_run_pending)
7585                 return -EBUSY;
7586         return !is_smm(vcpu);
7587 }
7588
7589 static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
7590 {
7591         struct vcpu_vmx *vmx = to_vmx(vcpu);
7592
7593         vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
7594         if (vmx->nested.smm.guest_mode)
7595                 nested_vmx_vmexit(vcpu, -1, 0, 0);
7596
7597         vmx->nested.smm.vmxon = vmx->nested.vmxon;
7598         vmx->nested.vmxon = false;
7599         vmx_clear_hlt(vcpu);
7600         return 0;
7601 }
7602
7603 static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
7604 {
7605         struct vcpu_vmx *vmx = to_vmx(vcpu);
7606         int ret;
7607
7608         if (vmx->nested.smm.vmxon) {
7609                 vmx->nested.vmxon = true;
7610                 vmx->nested.smm.vmxon = false;
7611         }
7612
7613         if (vmx->nested.smm.guest_mode) {
7614                 ret = nested_vmx_enter_non_root_mode(vcpu, false);
7615                 if (ret)
7616                         return ret;
7617
7618                 vmx->nested.smm.guest_mode = false;
7619         }
7620         return 0;
7621 }
7622
/* Intentionally empty: nothing has to be armed for an SMI window on VMX. */
static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
	/* RSM will cause a vmexit anyway.  */
}
7627
7628 static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
7629 {
7630         return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
7631 }
7632
7633 static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
7634 {
7635         if (is_guest_mode(vcpu)) {
7636                 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
7637
7638                 if (hrtimer_try_to_cancel(timer) == 1)
7639                         hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
7640         }
7641 }
7642
7643 static void hardware_unsetup(void)
7644 {
7645         kvm_set_posted_intr_wakeup_handler(NULL);
7646
7647         if (nested)
7648                 nested_vmx_hardware_unsetup();
7649
7650         free_kvm_area();
7651 }
7652
7653 static bool vmx_check_apicv_inhibit_reasons(ulong bit)
7654 {
7655         ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
7656                           BIT(APICV_INHIBIT_REASON_ABSENT) |
7657                           BIT(APICV_INHIBIT_REASON_HYPERV) |
7658                           BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
7659
7660         return supported & BIT(bit);
7661 }
7662
7663 static struct kvm_x86_ops vmx_x86_ops __initdata = {
7664         .name = "kvm_intel",
7665
7666         .hardware_unsetup = hardware_unsetup,
7667
7668         .hardware_enable = hardware_enable,
7669         .hardware_disable = hardware_disable,
7670         .cpu_has_accelerated_tpr = report_flexpriority,
7671         .has_emulated_msr = vmx_has_emulated_msr,
7672
7673         .vm_size = sizeof(struct kvm_vmx),
7674         .vm_init = vmx_vm_init,
7675
7676         .vcpu_create = vmx_create_vcpu,
7677         .vcpu_free = vmx_free_vcpu,
7678         .vcpu_reset = vmx_vcpu_reset,
7679
7680         .prepare_guest_switch = vmx_prepare_switch_to_guest,
7681         .vcpu_load = vmx_vcpu_load,
7682         .vcpu_put = vmx_vcpu_put,
7683
7684         .update_exception_bitmap = vmx_update_exception_bitmap,
7685         .get_msr_feature = vmx_get_msr_feature,
7686         .get_msr = vmx_get_msr,
7687         .set_msr = vmx_set_msr,
7688         .get_segment_base = vmx_get_segment_base,
7689         .get_segment = vmx_get_segment,
7690         .set_segment = vmx_set_segment,
7691         .get_cpl = vmx_get_cpl,
7692         .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
7693         .set_cr0 = vmx_set_cr0,
7694         .is_valid_cr4 = vmx_is_valid_cr4,
7695         .set_cr4 = vmx_set_cr4,
7696         .set_efer = vmx_set_efer,
7697         .get_idt = vmx_get_idt,
7698         .set_idt = vmx_set_idt,
7699         .get_gdt = vmx_get_gdt,
7700         .set_gdt = vmx_set_gdt,
7701         .set_dr7 = vmx_set_dr7,
7702         .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
7703         .cache_reg = vmx_cache_reg,
7704         .get_rflags = vmx_get_rflags,
7705         .set_rflags = vmx_set_rflags,
7706         .get_if_flag = vmx_get_if_flag,
7707
7708         .tlb_flush_all = vmx_flush_tlb_all,
7709         .tlb_flush_current = vmx_flush_tlb_current,
7710         .tlb_flush_gva = vmx_flush_tlb_gva,
7711         .tlb_flush_guest = vmx_flush_tlb_guest,
7712
7713         .run = vmx_vcpu_run,
7714         .handle_exit = vmx_handle_exit,
7715         .skip_emulated_instruction = vmx_skip_emulated_instruction,
7716         .update_emulated_instruction = vmx_update_emulated_instruction,
7717         .set_interrupt_shadow = vmx_set_interrupt_shadow,
7718         .get_interrupt_shadow = vmx_get_interrupt_shadow,
7719         .patch_hypercall = vmx_patch_hypercall,
7720         .set_irq = vmx_inject_irq,
7721         .set_nmi = vmx_inject_nmi,
7722         .queue_exception = vmx_queue_exception,
7723         .cancel_injection = vmx_cancel_injection,
7724         .interrupt_allowed = vmx_interrupt_allowed,
7725         .nmi_allowed = vmx_nmi_allowed,
7726         .get_nmi_mask = vmx_get_nmi_mask,
7727         .set_nmi_mask = vmx_set_nmi_mask,
7728         .enable_nmi_window = vmx_enable_nmi_window,
7729         .enable_irq_window = vmx_enable_irq_window,
7730         .update_cr8_intercept = vmx_update_cr8_intercept,
7731         .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
7732         .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
7733         .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
7734         .load_eoi_exitmap = vmx_load_eoi_exitmap,
7735         .apicv_post_state_restore = vmx_apicv_post_state_restore,
7736         .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
7737         .hwapic_irr_update = vmx_hwapic_irr_update,
7738         .hwapic_isr_update = vmx_hwapic_isr_update,
7739         .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
7740         .sync_pir_to_irr = vmx_sync_pir_to_irr,
7741         .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
7742         .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
7743
7744         .set_tss_addr = vmx_set_tss_addr,
7745         .set_identity_map_addr = vmx_set_identity_map_addr,
7746         .get_mt_mask = vmx_get_mt_mask,
7747
7748         .get_exit_info = vmx_get_exit_info,
7749
7750         .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
7751
7752         .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
7753
7754         .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
7755         .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
7756         .write_tsc_offset = vmx_write_tsc_offset,
7757         .write_tsc_multiplier = vmx_write_tsc_multiplier,
7758
7759         .load_mmu_pgd = vmx_load_mmu_pgd,
7760
7761         .check_intercept = vmx_check_intercept,
7762         .handle_exit_irqoff = vmx_handle_exit_irqoff,
7763
7764         .request_immediate_exit = vmx_request_immediate_exit,
7765
7766         .sched_in = vmx_sched_in,
7767
7768         .cpu_dirty_log_size = PML_ENTITY_NUM,
7769         .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
7770
7771         .pre_block = vmx_pre_block,
7772         .post_block = vmx_post_block,
7773
7774         .pmu_ops = &intel_pmu_ops,
7775         .nested_ops = &vmx_nested_ops,
7776
7777         .update_pi_irte = pi_update_irte,
7778         .start_assignment = vmx_pi_start_assignment,
7779
7780 #ifdef CONFIG_X86_64
7781         .set_hv_timer = vmx_set_hv_timer,
7782         .cancel_hv_timer = vmx_cancel_hv_timer,
7783 #endif
7784
7785         .setup_mce = vmx_setup_mce,
7786
7787         .smi_allowed = vmx_smi_allowed,
7788         .enter_smm = vmx_enter_smm,
7789         .leave_smm = vmx_leave_smm,
7790         .enable_smi_window = vmx_enable_smi_window,
7791
7792         .can_emulate_instruction = vmx_can_emulate_instruction,
7793         .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
7794         .migrate_timers = vmx_migrate_timers,
7795
7796         .msr_filter_changed = vmx_msr_filter_changed,
7797         .complete_emulated_msr = kvm_complete_insn_gp,
7798
7799         .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
7800 };
7801
7802 static unsigned int vmx_handle_intel_pt_intr(void)
7803 {
7804         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
7805
7806         /* '0' on failure so that the !PT case can use a RET0 static call. */
7807         if (!kvm_arch_pmi_in_guest(vcpu))
7808                 return 0;
7809
7810         kvm_make_request(KVM_REQ_PMI, vcpu);
7811         __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
7812                   (unsigned long *)&vcpu->arch.pmu.global_status);
7813         return 1;
7814 }
7815
7816 static __init void vmx_setup_user_return_msrs(void)
7817 {
7818
7819         /*
7820          * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
7821          * will emulate SYSCALL in legacy mode if the vendor string in guest
7822          * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
7823          * support this emulation, MSR_STAR is included in the list for i386,
7824          * but is never loaded into hardware.  MSR_CSTAR is also never loaded
7825          * into hardware and is here purely for emulation purposes.
7826          */
7827         const u32 vmx_uret_msrs_list[] = {
7828         #ifdef CONFIG_X86_64
7829                 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
7830         #endif
7831                 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
7832                 MSR_IA32_TSX_CTRL,
7833         };
7834         int i;
7835
7836         BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
7837
7838         for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
7839                 kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
7840 }
7841
7842 static struct kvm_x86_init_ops vmx_init_ops __initdata;
7843
7844 static __init int hardware_setup(void)
7845 {
7846         unsigned long host_bndcfgs;
7847         struct desc_ptr dt;
7848         int r;
7849
7850         store_idt(&dt);
7851         host_idt_base = dt.address;
7852
7853         vmx_setup_user_return_msrs();
7854
7855         if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
7856                 return -EIO;
7857
7858         if (boot_cpu_has(X86_FEATURE_NX))
7859                 kvm_enable_efer_bits(EFER_NX);
7860
7861         if (boot_cpu_has(X86_FEATURE_MPX)) {
7862                 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
7863                 WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
7864         }
7865
7866         if (!cpu_has_vmx_mpx())
7867                 supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
7868                                     XFEATURE_MASK_BNDCSR);
7869
7870         if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
7871             !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
7872                 enable_vpid = 0;
7873
7874         if (!cpu_has_vmx_ept() ||
7875             !cpu_has_vmx_ept_4levels() ||
7876             !cpu_has_vmx_ept_mt_wb() ||
7877             !cpu_has_vmx_invept_global())
7878                 enable_ept = 0;
7879
7880         /* NX support is required for shadow paging. */
7881         if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
7882                 pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n");
7883                 return -EOPNOTSUPP;
7884         }
7885
7886         if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
7887                 enable_ept_ad_bits = 0;
7888
7889         if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
7890                 enable_unrestricted_guest = 0;
7891
7892         if (!cpu_has_vmx_flexpriority())
7893                 flexpriority_enabled = 0;
7894
7895         if (!cpu_has_virtual_nmis())
7896                 enable_vnmi = 0;
7897
7898         /*
7899          * set_apic_access_page_addr() is used to reload apic access
7900          * page upon invalidation.  No need to do anything if not
7901          * using the APIC_ACCESS_ADDR VMCS field.
7902          */
7903         if (!flexpriority_enabled)
7904                 vmx_x86_ops.set_apic_access_page_addr = NULL;
7905
7906         if (!cpu_has_vmx_tpr_shadow())
7907                 vmx_x86_ops.update_cr8_intercept = NULL;
7908
7909 #if IS_ENABLED(CONFIG_HYPERV)
7910         if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
7911             && enable_ept) {
7912                 vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
7913                 vmx_x86_ops.tlb_remote_flush_with_range =
7914                                 hv_remote_flush_tlb_with_range;
7915         }
7916 #endif
7917
7918         if (!cpu_has_vmx_ple()) {
7919                 ple_gap = 0;
7920                 ple_window = 0;
7921                 ple_window_grow = 0;
7922                 ple_window_max = 0;
7923                 ple_window_shrink = 0;
7924         }
7925
7926         if (!cpu_has_vmx_apicv())
7927                 enable_apicv = 0;
7928         if (!enable_apicv)
7929                 vmx_x86_ops.sync_pir_to_irr = NULL;
7930
7931         if (cpu_has_vmx_tsc_scaling()) {
7932                 kvm_has_tsc_control = true;
7933                 kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
7934                 kvm_tsc_scaling_ratio_frac_bits = 48;
7935         }
7936
7937         kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
7938
7939         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
7940
7941         if (enable_ept)
7942                 kvm_mmu_set_ept_masks(enable_ept_ad_bits,
7943                                       cpu_has_vmx_ept_execute_only());
7944
7945         kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
7946                           ept_caps_to_lpage_level(vmx_capability.ept));
7947
7948         /*
7949          * Only enable PML when hardware supports PML feature, and both EPT
7950          * and EPT A/D bit features are enabled -- PML depends on them to work.
7951          */
7952         if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
7953                 enable_pml = 0;
7954
7955         if (!enable_pml)
7956                 vmx_x86_ops.cpu_dirty_log_size = 0;
7957
7958         if (!cpu_has_vmx_preemption_timer())
7959                 enable_preemption_timer = false;
7960
7961         if (enable_preemption_timer) {
7962                 u64 use_timer_freq = 5000ULL * 1000 * 1000;
7963                 u64 vmx_msr;
7964
7965                 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
7966                 cpu_preemption_timer_multi =
7967                         vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
7968
7969                 if (tsc_khz)
7970                         use_timer_freq = (u64)tsc_khz * 1000;
7971                 use_timer_freq >>= cpu_preemption_timer_multi;
7972
7973                 /*
7974                  * KVM "disables" the preemption timer by setting it to its max
7975                  * value.  Don't use the timer if it might cause spurious exits
7976                  * at a rate faster than 0.1 Hz (of uninterrupted guest time).
7977                  */
7978                 if (use_timer_freq > 0xffffffffu / 10)
7979                         enable_preemption_timer = false;
7980         }
7981
7982         if (!enable_preemption_timer) {
7983                 vmx_x86_ops.set_hv_timer = NULL;
7984                 vmx_x86_ops.cancel_hv_timer = NULL;
7985                 vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
7986         }
7987
7988         kvm_mce_cap_supported |= MCG_LMCE_P;
7989
7990         if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
7991                 return -EINVAL;
7992         if (!enable_ept || !cpu_has_vmx_intel_pt())
7993                 pt_mode = PT_MODE_SYSTEM;
7994         if (pt_mode == PT_MODE_HOST_GUEST)
7995                 vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
7996         else
7997                 vmx_init_ops.handle_intel_pt_intr = NULL;
7998
7999         setup_default_sgx_lepubkeyhash();
8000
8001         if (nested) {
8002                 nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
8003                                            vmx_capability.ept);
8004
8005                 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
8006                 if (r)
8007                         return r;
8008         }
8009
8010         vmx_set_cpu_caps();
8011
8012         r = alloc_kvm_area();
8013         if (r)
8014                 nested_vmx_hardware_unsetup();
8015
8016         kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
8017
8018         return r;
8019 }
8020
8021 static struct kvm_x86_init_ops vmx_init_ops __initdata = {
8022         .cpu_has_kvm_support = cpu_has_kvm_support,
8023         .disabled_by_bios = vmx_disabled_by_bios,
8024         .check_processor_compatibility = vmx_check_processor_compat,
8025         .hardware_setup = hardware_setup,
8026         .handle_intel_pt_intr = NULL,
8027
8028         .runtime_ops = &vmx_x86_ops,
8029 };
8030
8031 static void vmx_cleanup_l1d_flush(void)
8032 {
8033         if (vmx_l1d_flush_pages) {
8034                 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
8035                 vmx_l1d_flush_pages = NULL;
8036         }
8037         /* Restore state so sysfs ignores VMX */
8038         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
8039 }
8040
8041 static void vmx_exit(void)
8042 {
8043 #ifdef CONFIG_KEXEC_CORE
8044         RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
8045         synchronize_rcu();
8046 #endif
8047
8048         kvm_exit();
8049
8050 #if IS_ENABLED(CONFIG_HYPERV)
8051         if (static_branch_unlikely(&enable_evmcs)) {
8052                 int cpu;
8053                 struct hv_vp_assist_page *vp_ap;
8054                 /*
8055                  * Reset everything to support using non-enlightened VMCS
8056                  * access later (e.g. when we reload the module with
8057                  * enlightened_vmcs=0)
8058                  */
8059                 for_each_online_cpu(cpu) {
8060                         vp_ap = hv_get_vp_assist_page(cpu);
8061
8062                         if (!vp_ap)
8063                                 continue;
8064
8065                         vp_ap->nested_control.features.directhypercall = 0;
8066                         vp_ap->current_nested_vmcs = 0;
8067                         vp_ap->enlighten_vmentry = 0;
8068                 }
8069
8070                 static_branch_disable(&enable_evmcs);
8071         }
8072 #endif
8073         vmx_cleanup_l1d_flush();
8074
8075         allow_smaller_maxphyaddr = false;
8076 }
8077 module_exit(vmx_exit);
8078
8079 static int __init vmx_init(void)
8080 {
8081         int r, cpu;
8082
8083 #if IS_ENABLED(CONFIG_HYPERV)
8084         /*
8085          * Enlightened VMCS usage should be recommended and the host needs
8086          * to support eVMCS v1 or above. We can also disable eVMCS support
8087          * with module parameter.
8088          */
8089         if (enlightened_vmcs &&
8090             ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
8091             (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
8092             KVM_EVMCS_VERSION) {
8093                 int cpu;
8094
8095                 /* Check that we have assist pages on all online CPUs */
8096                 for_each_online_cpu(cpu) {
8097                         if (!hv_get_vp_assist_page(cpu)) {
8098                                 enlightened_vmcs = false;
8099                                 break;
8100                         }
8101                 }
8102
8103                 if (enlightened_vmcs) {
8104                         pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
8105                         static_branch_enable(&enable_evmcs);
8106                 }
8107
8108                 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
8109                         vmx_x86_ops.enable_direct_tlbflush
8110                                 = hv_enable_direct_tlbflush;
8111
8112         } else {
8113                 enlightened_vmcs = false;
8114         }
8115 #endif
8116
8117         r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
8118                      __alignof__(struct vcpu_vmx), THIS_MODULE);
8119         if (r)
8120                 return r;
8121
8122         /*
8123          * Must be called after kvm_init() so enable_ept is properly set
8124          * up. Hand the parameter mitigation value in which was stored in
8125          * the pre module init parser. If no parameter was given, it will
8126          * contain 'auto' which will be turned into the default 'cond'
8127          * mitigation mode.
8128          */
8129         r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
8130         if (r) {
8131                 vmx_exit();
8132                 return r;
8133         }
8134
8135         for_each_possible_cpu(cpu) {
8136                 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
8137
8138                 pi_init_cpu(cpu);
8139         }
8140
8141 #ifdef CONFIG_KEXEC_CORE
8142         rcu_assign_pointer(crash_vmclear_loaded_vmcss,
8143                            crash_vmclear_local_loaded_vmcss);
8144 #endif
8145         vmx_check_vmcs12_offsets();
8146
8147         /*
8148          * Shadow paging doesn't have a (further) performance penalty
8149          * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
8150          * by default
8151          */
8152         if (!enable_ept)
8153                 allow_smaller_maxphyaddr = true;
8154
8155         return 0;
8156 }
8157 module_init(vmx_init);