[platform/kernel/linux-rpi.git] arch/x86/kvm/vmx/vmx.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * This module enables machines with Intel VT-x extensions to run virtual
6  * machines without emulation or binary translation.
7  *
8  * Copyright (C) 2006 Qumranet, Inc.
9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10  *
11  * Authors:
12  *   Avi Kivity   <avi@qumranet.com>
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  */
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17 #include <linux/highmem.h>
18 #include <linux/hrtimer.h>
19 #include <linux/kernel.h>
20 #include <linux/kvm_host.h>
21 #include <linux/module.h>
22 #include <linux/moduleparam.h>
23 #include <linux/mod_devicetable.h>
24 #include <linux/mm.h>
25 #include <linux/objtool.h>
26 #include <linux/sched.h>
27 #include <linux/sched/smt.h>
28 #include <linux/slab.h>
29 #include <linux/tboot.h>
30 #include <linux/trace_events.h>
31 #include <linux/entry-kvm.h>
32
33 #include <asm/apic.h>
34 #include <asm/asm.h>
35 #include <asm/cpu.h>
36 #include <asm/cpu_device_id.h>
37 #include <asm/debugreg.h>
38 #include <asm/desc.h>
39 #include <asm/fpu/api.h>
40 #include <asm/fpu/xstate.h>
41 #include <asm/idtentry.h>
42 #include <asm/io.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/kexec.h>
45 #include <asm/perf_event.h>
46 #include <asm/mmu_context.h>
47 #include <asm/mshyperv.h>
48 #include <asm/mwait.h>
49 #include <asm/spec-ctrl.h>
50 #include <asm/virtext.h>
51 #include <asm/vmx.h>
52
53 #include "capabilities.h"
54 #include "cpuid.h"
55 #include "hyperv.h"
56 #include "kvm_onhyperv.h"
57 #include "irq.h"
58 #include "kvm_cache_regs.h"
59 #include "lapic.h"
60 #include "mmu.h"
61 #include "nested.h"
62 #include "pmu.h"
63 #include "sgx.h"
64 #include "trace.h"
65 #include "vmcs.h"
66 #include "vmcs12.h"
67 #include "vmx.h"
68 #include "x86.h"
69 #include "smm.h"
70
71 MODULE_AUTHOR("Qumranet");
72 MODULE_LICENSE("GPL");
73
74 #ifdef MODULE
75 static const struct x86_cpu_id vmx_cpu_id[] = {
76         X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
77         {}
78 };
79 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
80 #endif
81
82 bool __read_mostly enable_vpid = 1;
83 module_param_named(vpid, enable_vpid, bool, 0444);
84
85 static bool __read_mostly enable_vnmi = 1;
86 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
87
88 bool __read_mostly flexpriority_enabled = 1;
89 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
90
91 bool __read_mostly enable_ept = 1;
92 module_param_named(ept, enable_ept, bool, S_IRUGO);
93
94 bool __read_mostly enable_unrestricted_guest = 1;
95 module_param_named(unrestricted_guest,
96                         enable_unrestricted_guest, bool, S_IRUGO);
97
98 bool __read_mostly enable_ept_ad_bits = 1;
99 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
100
101 static bool __read_mostly emulate_invalid_guest_state = true;
102 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
103
104 static bool __read_mostly fasteoi = 1;
105 module_param(fasteoi, bool, S_IRUGO);
106
107 module_param(enable_apicv, bool, S_IRUGO);
108
109 bool __read_mostly enable_ipiv = true;
110 module_param(enable_ipiv, bool, 0444);
111
112 /*
113  * If nested=1, nested virtualization is supported, i.e., guests may use
114  * VMX and be hypervisors for their own guests. If nested=0, guests may not
115  * use VMX instructions.
116  */
117 static bool __read_mostly nested = 1;
118 module_param(nested, bool, S_IRUGO);
119
120 bool __read_mostly enable_pml = 1;
121 module_param_named(pml, enable_pml, bool, S_IRUGO);
122
123 static bool __read_mostly error_on_inconsistent_vmcs_config = true;
124 module_param(error_on_inconsistent_vmcs_config, bool, 0444);
125
126 static bool __read_mostly dump_invalid_vmcs = 0;
127 module_param(dump_invalid_vmcs, bool, 0644);
128
129 #define MSR_BITMAP_MODE_X2APIC          1
130 #define MSR_BITMAP_MODE_X2APIC_APICV    2
131
132 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
133
134 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
135 static int __read_mostly cpu_preemption_timer_multi;
136 static bool __read_mostly enable_preemption_timer = 1;
137 #ifdef CONFIG_X86_64
138 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
139 #endif
140
141 extern bool __read_mostly allow_smaller_maxphyaddr;
142 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
143
144 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
145 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
146 #define KVM_VM_CR0_ALWAYS_ON                            \
147         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
148
149 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
150 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
151 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
152
153 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
154
155 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
156         RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
157         RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
158         RTIT_STATUS_BYTECNT))
159
160 /*
161  * List of MSRs that can be directly passed to the guest.
162  * In addition to these, x2APIC and PT MSRs are handled specially.
163  */
164 static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
165         MSR_IA32_SPEC_CTRL,
166         MSR_IA32_PRED_CMD,
167         MSR_IA32_FLUSH_CMD,
168         MSR_IA32_TSC,
169 #ifdef CONFIG_X86_64
170         MSR_FS_BASE,
171         MSR_GS_BASE,
172         MSR_KERNEL_GS_BASE,
173         MSR_IA32_XFD,
174         MSR_IA32_XFD_ERR,
175 #endif
176         MSR_IA32_SYSENTER_CS,
177         MSR_IA32_SYSENTER_ESP,
178         MSR_IA32_SYSENTER_EIP,
179         MSR_CORE_C1_RES,
180         MSR_CORE_C3_RESIDENCY,
181         MSR_CORE_C6_RESIDENCY,
182         MSR_CORE_C7_RESIDENCY,
183 };
184
185 /*
186  * These two parameters are used to configure the controls for Pause-Loop Exiting:
187  * ple_gap:    upper bound on the amount of time between two successive
188  *             executions of PAUSE in a loop. Also indicates whether PLE is enabled.
189  *             According to testing, this time is usually smaller than 128 cycles.
190  * ple_window: upper bound on the amount of time a guest is allowed to execute
191  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
192  *             less than 2^12 cycles.
193  * Time is measured based on a counter that runs at the same rate as the TSC;
194  * refer to the SDM, volume 3b, sections 21.6.13 & 22.1.3.
195  */
196 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
197 module_param(ple_gap, uint, 0444);
198
199 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
200 module_param(ple_window, uint, 0444);
201
202 /* Default doubles per-vcpu window every exit. */
203 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
204 module_param(ple_window_grow, uint, 0444);
205
206 /* Default resets per-vcpu window every exit to ple_window. */
207 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
208 module_param(ple_window_shrink, uint, 0444);
209
210 /* Default is to compute the maximum so we can never overflow. */
211 static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
212 module_param(ple_window_max, uint, 0444);
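
/*
 * Rough sketch of how these tunables interact (illustrative only; the exact
 * arithmetic lives in the shared __grow_ple_window()/__shrink_ple_window()
 * helpers): with the defaults, a PAUSE-loop exit grows the per-vCPU window
 * multiplicatively,
 *
 *   new_window = min(old_window * ple_window_grow, ple_window_max);
 *
 * while the shrink side moves it back toward ple_window (a shrink value of 0
 * simply resets to ple_window, per the comment above).  A vCPU that keeps
 * spinning therefore sees an exponentially growing window, capped at
 * ple_window_max.
 */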
213
214 /* Default is SYSTEM mode (0); set to 1 for host-guest mode. */
215 int __read_mostly pt_mode = PT_MODE_SYSTEM;
216 module_param(pt_mode, int, S_IRUGO);
217
218 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
219 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
220 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
221
222 /* Storage for pre module init parameter parsing */
223 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
224
225 static const struct {
226         const char *option;
227         bool for_parse;
228 } vmentry_l1d_param[] = {
229         [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
230         [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
231         [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
232         [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
233         [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
234         [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
235 };
236
237 #define L1D_CACHE_ORDER 4
238 static void *vmx_l1d_flush_pages;
239
240 /* Control for disabling CPU Fill buffer clear */
241 static bool __read_mostly vmx_fb_clear_ctrl_available;
242
243 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
244 {
245         struct page *page;
246         unsigned int i;
247
248         if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
249                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
250                 return 0;
251         }
252
253         if (!enable_ept) {
254                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
255                 return 0;
256         }
257
258         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
259                 u64 msr;
260
261                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
262                 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
263                         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
264                         return 0;
265                 }
266         }
267
268         /* If set to auto, use the default L1TF mitigation method. */
269         if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
270                 switch (l1tf_mitigation) {
271                 case L1TF_MITIGATION_OFF:
272                         l1tf = VMENTER_L1D_FLUSH_NEVER;
273                         break;
274                 case L1TF_MITIGATION_FLUSH_NOWARN:
275                 case L1TF_MITIGATION_FLUSH:
276                 case L1TF_MITIGATION_FLUSH_NOSMT:
277                         l1tf = VMENTER_L1D_FLUSH_COND;
278                         break;
279                 case L1TF_MITIGATION_FULL:
280                 case L1TF_MITIGATION_FULL_FORCE:
281                         l1tf = VMENTER_L1D_FLUSH_ALWAYS;
282                         break;
283                 }
284         } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
285                 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
286         }
287
288         if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
289             !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
290                 /*
291                  * This allocation for vmx_l1d_flush_pages is not tied to a VM
292                  * lifetime and so should not be charged to a memcg.
293                  */
294                 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
295                 if (!page)
296                         return -ENOMEM;
297                 vmx_l1d_flush_pages = page_address(page);
298
299                 /*
300                  * Initialize each page with a different pattern in
301                  * order to protect against KSM in the nested
302                  * virtualization case.
303                  */
304                 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
305                         memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
306                                PAGE_SIZE);
307                 }
308         }
309
310         l1tf_vmx_mitigation = l1tf;
311
312         if (l1tf != VMENTER_L1D_FLUSH_NEVER)
313                 static_branch_enable(&vmx_l1d_should_flush);
314         else
315                 static_branch_disable(&vmx_l1d_should_flush);
316
317         if (l1tf == VMENTER_L1D_FLUSH_COND)
318                 static_branch_enable(&vmx_l1d_flush_cond);
319         else
320                 static_branch_disable(&vmx_l1d_flush_cond);
321         return 0;
322 }
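
/*
 * Descriptive summary of the setup above: the mitigation is marked "not
 * required" when the CPU is not affected by L1TF or advertises
 * ARCH_CAP_SKIP_VMENTRY_L1DFLUSH, "EPT disabled" when !enable_ept, and
 * otherwise the module parameter (or, for "auto", the global l1tf= policy)
 * selects never/cond/always.  The software flush pages are only allocated
 * when flushing is enabled and the CPU lacks X86_FEATURE_FLUSH_L1D, i.e.
 * when the fill-pattern fallback in vmx_l1d_flush() is needed.
 */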
323
324 static int vmentry_l1d_flush_parse(const char *s)
325 {
326         unsigned int i;
327
328         if (s) {
329                 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
330                         if (vmentry_l1d_param[i].for_parse &&
331                             sysfs_streq(s, vmentry_l1d_param[i].option))
332                                 return i;
333                 }
334         }
335         return -EINVAL;
336 }
337
338 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
339 {
340         int l1tf, ret;
341
342         l1tf = vmentry_l1d_flush_parse(s);
343         if (l1tf < 0)
344                 return l1tf;
345
346         if (!boot_cpu_has(X86_BUG_L1TF))
347                 return 0;
348
349         /*
350          * Has vmx_init() run already? If not then this is the pre init
351          * parameter parsing. In that case just store the value and let
352          * vmx_init() do the proper setup after enable_ept has been
353          * established.
354          */
355         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
356                 vmentry_l1d_flush_param = l1tf;
357                 return 0;
358         }
359
360         mutex_lock(&vmx_l1d_flush_mutex);
361         ret = vmx_setup_l1d_flush(l1tf);
362         mutex_unlock(&vmx_l1d_flush_mutex);
363         return ret;
364 }
365
366 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
367 {
368         if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
369                 return sprintf(s, "???\n");
370
371         return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
372 }
373
374 static void vmx_setup_fb_clear_ctrl(void)
375 {
376         u64 msr;
377
378         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
379             !boot_cpu_has_bug(X86_BUG_MDS) &&
380             !boot_cpu_has_bug(X86_BUG_TAA)) {
381                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
382                 if (msr & ARCH_CAP_FB_CLEAR_CTRL)
383                         vmx_fb_clear_ctrl_available = true;
384         }
385 }
386
387 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
388 {
389         u64 msr;
390
391         if (!vmx->disable_fb_clear)
392                 return;
393
394         msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
395         msr |= FB_CLEAR_DIS;
396         native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
397         /* Cache the MSR value to avoid reading it later */
398         vmx->msr_ia32_mcu_opt_ctrl = msr;
399 }
400
401 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
402 {
403         if (!vmx->disable_fb_clear)
404                 return;
405
406         vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
407         native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
408 }
409
410 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
411 {
412         vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
413
414         /*
415          * If the guest will not execute VERW, there is no need to set FB_CLEAR_DIS
416          * at VM-Entry. Skip the MSR read/write when the guest has no use case for
417          * executing VERW.
418          */
419         if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
420            ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
421             (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
422             (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
423             (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
424             (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
425                 vmx->disable_fb_clear = false;
426 }
427
428 static const struct kernel_param_ops vmentry_l1d_flush_ops = {
429         .set = vmentry_l1d_flush_set,
430         .get = vmentry_l1d_flush_get,
431 };
432 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
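
/*
 * Usage note (generic module-parameter behaviour, nothing VMX-specific): the
 * 0644 mode above means the L1D flush mode can be inspected and changed at
 * runtime, e.g.
 *
 *   cat /sys/module/kvm_intel/parameters/vmentry_l1d_flush
 *   echo always > /sys/module/kvm_intel/parameters/vmentry_l1d_flush
 *
 * and it can also be chosen at boot via the kvm-intel.vmentry_l1d_flush=
 * kernel command-line parameter.
 */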
433
434 static u32 vmx_segment_access_rights(struct kvm_segment *var);
435
436 void vmx_vmexit(void);
437
438 #define vmx_insn_failed(fmt...)         \
439 do {                                    \
440         WARN_ONCE(1, fmt);              \
441         pr_warn_ratelimited(fmt);       \
442 } while (0)
443
444 noinline void vmread_error(unsigned long field)
445 {
446         vmx_insn_failed("vmread failed: field=%lx\n", field);
447 }
448
449 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
450 noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
451 {
452         if (fault) {
453                 kvm_spurious_fault();
454         } else {
455                 instrumentation_begin();
456                 vmread_error(field);
457                 instrumentation_end();
458         }
459 }
460 #endif
461
462 noinline void vmwrite_error(unsigned long field, unsigned long value)
463 {
464         vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
465                         field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
466 }
467
468 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
469 {
470         vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
471                         vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
472 }
473
474 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
475 {
476         vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
477                         vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
478 }
479
480 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
481 {
482         vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
483                         ext, vpid, gva);
484 }
485
486 noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
487 {
488         vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
489                         ext, eptp, gpa);
490 }
491
492 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
493 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
494 /*
495  * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
496  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
497  */
498 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
499
500 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
501 static DEFINE_SPINLOCK(vmx_vpid_lock);
502
503 struct vmcs_config vmcs_config __ro_after_init;
504 struct vmx_capability vmx_capability __ro_after_init;
505
506 #define VMX_SEGMENT_FIELD(seg)                                  \
507         [VCPU_SREG_##seg] = {                                   \
508                 .selector = GUEST_##seg##_SELECTOR,             \
509                 .base = GUEST_##seg##_BASE,                     \
510                 .limit = GUEST_##seg##_LIMIT,                   \
511                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
512         }
513
514 static const struct kvm_vmx_segment_field {
515         unsigned selector;
516         unsigned base;
517         unsigned limit;
518         unsigned ar_bytes;
519 } kvm_vmx_segment_fields[] = {
520         VMX_SEGMENT_FIELD(CS),
521         VMX_SEGMENT_FIELD(DS),
522         VMX_SEGMENT_FIELD(ES),
523         VMX_SEGMENT_FIELD(FS),
524         VMX_SEGMENT_FIELD(GS),
525         VMX_SEGMENT_FIELD(SS),
526         VMX_SEGMENT_FIELD(TR),
527         VMX_SEGMENT_FIELD(LDTR),
528 };
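
/*
 * Illustration: this table is indexed by VCPU_SREG_* and resolves to the
 * corresponding VMCS field encodings, e.g.
 *
 *   kvm_vmx_segment_fields[VCPU_SREG_CS].base     == GUEST_CS_BASE
 *   kvm_vmx_segment_fields[VCPU_SREG_TR].ar_bytes == GUEST_TR_AR_BYTES
 *
 * The segment cache readers further down use it to pick the right field for
 * a given segment register.
 */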
529
530 static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
531 {
532         vmx->segment_cache.bitmask = 0;
533 }
534
535 static unsigned long host_idt_base;
536
537 #if IS_ENABLED(CONFIG_HYPERV)
538 static struct kvm_x86_ops vmx_x86_ops __initdata;
539
540 static bool __read_mostly enlightened_vmcs = true;
541 module_param(enlightened_vmcs, bool, 0444);
542
543 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
544 {
545         struct hv_enlightened_vmcs *evmcs;
546         struct hv_partition_assist_pg **p_hv_pa_pg =
547                         &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
548         /*
549          * Synthetic VM-Exit is not enabled in the current code, so all
550          * eVMCSs in a single VM share the same assist page.
551          */
552         if (!*p_hv_pa_pg)
553                 *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
554
555         if (!*p_hv_pa_pg)
556                 return -ENOMEM;
557
558         evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
559
560         evmcs->partition_assist_page =
561                 __pa(*p_hv_pa_pg);
562         evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
563         evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
564
565         return 0;
566 }
567
568 static __init void hv_init_evmcs(void)
569 {
570         int cpu;
571
572         if (!enlightened_vmcs)
573                 return;
574
575         /*
576          * Enlightened VMCS usage must be recommended by Hyper-V, and the host
577          * needs to support eVMCS v1 or above.
578          */
579         if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
580             (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
581              KVM_EVMCS_VERSION) {
582
583                 /* Check that we have assist pages on all online CPUs */
584                 for_each_online_cpu(cpu) {
585                         if (!hv_get_vp_assist_page(cpu)) {
586                                 enlightened_vmcs = false;
587                                 break;
588                         }
589                 }
590
591                 if (enlightened_vmcs) {
592                         pr_info("Using Hyper-V Enlightened VMCS\n");
593                         static_branch_enable(&__kvm_is_using_evmcs);
594                 }
595
596                 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
597                         vmx_x86_ops.enable_l2_tlb_flush
598                                 = hv_enable_l2_tlb_flush;
599
600         } else {
601                 enlightened_vmcs = false;
602         }
603 }
604
605 static void hv_reset_evmcs(void)
606 {
607         struct hv_vp_assist_page *vp_ap;
608
609         if (!kvm_is_using_evmcs())
610                 return;
611
612         /*
613          * KVM should enable eVMCS if and only if all CPUs have a VP assist
614          * page, and should reject CPU onlining if eVMCS is enabled but the CPU
615          * doesn't have a VP assist page allocated.
616          */
617         vp_ap = hv_get_vp_assist_page(smp_processor_id());
618         if (WARN_ON_ONCE(!vp_ap))
619                 return;
620
621         /*
622          * Reset everything to support using non-enlightened VMCS access later
623          * (e.g. when we reload the module with enlightened_vmcs=0)
624          */
625         vp_ap->nested_control.features.directhypercall = 0;
626         vp_ap->current_nested_vmcs = 0;
627         vp_ap->enlighten_vmentry = 0;
628 }
629
630 #else /* IS_ENABLED(CONFIG_HYPERV) */
631 static void hv_init_evmcs(void) {}
632 static void hv_reset_evmcs(void) {}
633 #endif /* IS_ENABLED(CONFIG_HYPERV) */
634
635 /*
636  * Comment format: document - errata name - stepping - processor name.
637  * Taken from
638  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
639  */
640 static u32 vmx_preemption_cpu_tfms[] = {
641 /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
642 0x000206E6,
643 /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
644 /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
645 /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
646 0x00020652,
647 /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
648 0x00020655,
649 /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
650 /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
651 /*
652  * 320767.pdf - AAP86  - B1 -
653  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
654  */
655 0x000106E5,
656 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
657 0x000106A0,
658 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
659 0x000106A1,
660 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
661 0x000106A4,
662  /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
663  /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
664  /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
665 0x000106A5,
666  /* Xeon E3-1220 V2 */
667 0x000306A8,
668 };
669
670 static inline bool cpu_has_broken_vmx_preemption_timer(void)
671 {
672         u32 eax = cpuid_eax(0x00000001), i;
673
674         /* Clear the reserved bits */
675         eax &= ~(0x3U << 14 | 0xfU << 28);
676         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
677                 if (eax == vmx_preemption_cpu_tfms[i])
678                         return true;
679
680         return false;
681 }
682
683 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
684 {
685         return flexpriority_enabled && lapic_in_kernel(vcpu);
686 }
687
688 static int possible_passthrough_msr_slot(u32 msr)
689 {
690         u32 i;
691
692         for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
693                 if (vmx_possible_passthrough_msrs[i] == msr)
694                         return i;
695
696         return -ENOENT;
697 }
698
699 static bool is_valid_passthrough_msr(u32 msr)
700 {
701         bool r;
702
703         switch (msr) {
704         case 0x800 ... 0x8ff:
705                 /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
706                 return true;
707         case MSR_IA32_RTIT_STATUS:
708         case MSR_IA32_RTIT_OUTPUT_BASE:
709         case MSR_IA32_RTIT_OUTPUT_MASK:
710         case MSR_IA32_RTIT_CR3_MATCH:
711         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
712                 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
713         case MSR_LBR_SELECT:
714         case MSR_LBR_TOS:
715         case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
716         case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
717         case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
718         case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
719         case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
720                 /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
721                 return true;
722         }
723
724         r = possible_passthrough_msr_slot(msr) != -ENOENT;
725
726         WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
727
728         return r;
729 }
730
731 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
732 {
733         int i;
734
735         i = kvm_find_user_return_msr(msr);
736         if (i >= 0)
737                 return &vmx->guest_uret_msrs[i];
738         return NULL;
739 }
740
741 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
742                                   struct vmx_uret_msr *msr, u64 data)
743 {
744         unsigned int slot = msr - vmx->guest_uret_msrs;
745         int ret = 0;
746
747         if (msr->load_into_hardware) {
748                 preempt_disable();
749                 ret = kvm_set_user_return_msr(slot, data, msr->mask);
750                 preempt_enable();
751         }
752         if (!ret)
753                 msr->data = data;
754         return ret;
755 }
756
757 #ifdef CONFIG_KEXEC_CORE
758 static void crash_vmclear_local_loaded_vmcss(void)
759 {
760         int cpu = raw_smp_processor_id();
761         struct loaded_vmcs *v;
762
763         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
764                             loaded_vmcss_on_cpu_link)
765                 vmcs_clear(v->vmcs);
766 }
767 #endif /* CONFIG_KEXEC_CORE */
768
769 static void __loaded_vmcs_clear(void *arg)
770 {
771         struct loaded_vmcs *loaded_vmcs = arg;
772         int cpu = raw_smp_processor_id();
773
774         if (loaded_vmcs->cpu != cpu)
775                 return; /* vcpu migration can race with cpu offline */
776         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
777                 per_cpu(current_vmcs, cpu) = NULL;
778
779         vmcs_clear(loaded_vmcs->vmcs);
780         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
781                 vmcs_clear(loaded_vmcs->shadow_vmcs);
782
783         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
784
785         /*
786          * Ensure all writes to loaded_vmcs, including deleting it from its
787          * current percpu list, complete before setting loaded_vmcs->cpu to
788          * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
789          * and add loaded_vmcs to its percpu list before it's deleted from this
790          * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
791          */
792         smp_wmb();
793
794         loaded_vmcs->cpu = -1;
795         loaded_vmcs->launched = 0;
796 }
797
798 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
799 {
800         int cpu = loaded_vmcs->cpu;
801
802         if (cpu != -1)
803                 smp_call_function_single(cpu,
804                          __loaded_vmcs_clear, loaded_vmcs, 1);
805 }
806
807 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
808                                        unsigned field)
809 {
810         bool ret;
811         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
812
813         if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
814                 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
815                 vmx->segment_cache.bitmask = 0;
816         }
817         ret = vmx->segment_cache.bitmask & mask;
818         vmx->segment_cache.bitmask |= mask;
819         return ret;
820 }
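
/*
 * The four readers below share a pattern: vmx_segment_cache_test_set()
 * returns whether the requested field was already cached (and marks it
 * cached either way), so each VMCS field is read via VMREAD at most once
 * until the cache is invalidated again through vmx_segment_cache_clear()
 * and the VCPU_EXREG_SEGMENTS availability bit.
 */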
821
822 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
823 {
824         u16 *p = &vmx->segment_cache.seg[seg].selector;
825
826         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
827                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
828         return *p;
829 }
830
831 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
832 {
833         ulong *p = &vmx->segment_cache.seg[seg].base;
834
835         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
836                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
837         return *p;
838 }
839
840 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
841 {
842         u32 *p = &vmx->segment_cache.seg[seg].limit;
843
844         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
845                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
846         return *p;
847 }
848
849 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
850 {
851         u32 *p = &vmx->segment_cache.seg[seg].ar;
852
853         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
854                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
855         return *p;
856 }
857
858 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
859 {
860         u32 eb;
861
862         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
863              (1u << DB_VECTOR) | (1u << AC_VECTOR);
864         /*
865          * Guest access to VMware backdoor ports could legitimately
866          * trigger #GP because of TSS I/O permission bitmap.
867          * We intercept those #GP and allow access to them anyway
868          * as VMware does.
869          */
870         if (enable_vmware_backdoor)
871                 eb |= (1u << GP_VECTOR);
872         if ((vcpu->guest_debug &
873              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
874             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
875                 eb |= 1u << BP_VECTOR;
876         if (to_vmx(vcpu)->rmode.vm86_active)
877                 eb = ~0;
878         if (!vmx_need_pf_intercept(vcpu))
879                 eb &= ~(1u << PF_VECTOR);
880
881         /* When we are running a nested L2 guest and L1 has specified a certain
882          * exception bitmap for it, we must trap the same exceptions and pass
883          * them to L1. When running L2, we will only handle the exceptions
884          * specified above if L1 did not want them.
885          */
886         if (is_guest_mode(vcpu))
887                 eb |= get_vmcs12(vcpu)->exception_bitmap;
888         else {
889                 int mask = 0, match = 0;
890
891                 if (enable_ept && (eb & (1u << PF_VECTOR))) {
892                         /*
893                          * If EPT is enabled, #PF is currently only intercepted
894                          * if MAXPHYADDR is smaller on the guest than on the
895                          * host.  In that case we only care about present,
896                          * non-reserved faults.  For vmcs02, however, PFEC_MASK
897                          * and PFEC_MATCH are set in prepare_vmcs02_rare.
898                          */
899                         mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
900                         match = PFERR_PRESENT_MASK;
901                 }
902                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
903                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
904         }
905
906         /*
907          * Disabling xfd interception indicates that dynamic xfeatures
908          * might be used in the guest. Always trap #NM in this case
909          * to save guest xfd_err in a timely manner.
910          */
911         if (vcpu->arch.xfd_no_write_intercept)
912                 eb |= (1u << NM_VECTOR);
913
914         vmcs_write32(EXCEPTION_BITMAP, eb);
915 }
916
917 /*
918  * Check if writes to the MSR are intercepted in the currently loaded MSR bitmap.
919  */
920 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
921 {
922         if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
923                 return true;
924
925         return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
926 }
927
928 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
929 {
930         unsigned int flags = 0;
931
932         if (vmx->loaded_vmcs->launched)
933                 flags |= VMX_RUN_VMRESUME;
934
935         /*
936          * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
937          * to change it directly without causing a vmexit.  In that case read
938          * it after vmexit and store it in vmx->spec_ctrl.
939          */
940         if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
941                 flags |= VMX_RUN_SAVE_SPEC_CTRL;
942
943         return flags;
944 }
945
946 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
947                 unsigned long entry, unsigned long exit)
948 {
949         vm_entry_controls_clearbit(vmx, entry);
950         vm_exit_controls_clearbit(vmx, exit);
951 }
952
953 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
954 {
955         unsigned int i;
956
957         for (i = 0; i < m->nr; ++i) {
958                 if (m->val[i].index == msr)
959                         return i;
960         }
961         return -ENOENT;
962 }
963
964 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
965 {
966         int i;
967         struct msr_autoload *m = &vmx->msr_autoload;
968
969         switch (msr) {
970         case MSR_EFER:
971                 if (cpu_has_load_ia32_efer()) {
972                         clear_atomic_switch_msr_special(vmx,
973                                         VM_ENTRY_LOAD_IA32_EFER,
974                                         VM_EXIT_LOAD_IA32_EFER);
975                         return;
976                 }
977                 break;
978         case MSR_CORE_PERF_GLOBAL_CTRL:
979                 if (cpu_has_load_perf_global_ctrl()) {
980                         clear_atomic_switch_msr_special(vmx,
981                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
982                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
983                         return;
984                 }
985                 break;
986         }
987         i = vmx_find_loadstore_msr_slot(&m->guest, msr);
988         if (i < 0)
989                 goto skip_guest;
990         --m->guest.nr;
991         m->guest.val[i] = m->guest.val[m->guest.nr];
992         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
993
994 skip_guest:
995         i = vmx_find_loadstore_msr_slot(&m->host, msr);
996         if (i < 0)
997                 return;
998
999         --m->host.nr;
1000         m->host.val[i] = m->host.val[m->host.nr];
1001         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1002 }
1003
1004 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1005                 unsigned long entry, unsigned long exit,
1006                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1007                 u64 guest_val, u64 host_val)
1008 {
1009         vmcs_write64(guest_val_vmcs, guest_val);
1010         if (host_val_vmcs != HOST_IA32_EFER)
1011                 vmcs_write64(host_val_vmcs, host_val);
1012         vm_entry_controls_setbit(vmx, entry);
1013         vm_exit_controls_setbit(vmx, exit);
1014 }
1015
1016 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1017                                   u64 guest_val, u64 host_val, bool entry_only)
1018 {
1019         int i, j = 0;
1020         struct msr_autoload *m = &vmx->msr_autoload;
1021
1022         switch (msr) {
1023         case MSR_EFER:
1024                 if (cpu_has_load_ia32_efer()) {
1025                         add_atomic_switch_msr_special(vmx,
1026                                         VM_ENTRY_LOAD_IA32_EFER,
1027                                         VM_EXIT_LOAD_IA32_EFER,
1028                                         GUEST_IA32_EFER,
1029                                         HOST_IA32_EFER,
1030                                         guest_val, host_val);
1031                         return;
1032                 }
1033                 break;
1034         case MSR_CORE_PERF_GLOBAL_CTRL:
1035                 if (cpu_has_load_perf_global_ctrl()) {
1036                         add_atomic_switch_msr_special(vmx,
1037                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1038                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1039                                         GUEST_IA32_PERF_GLOBAL_CTRL,
1040                                         HOST_IA32_PERF_GLOBAL_CTRL,
1041                                         guest_val, host_val);
1042                         return;
1043                 }
1044                 break;
1045         case MSR_IA32_PEBS_ENABLE:
1046                 /* PEBS needs a quiescent period after being disabled (to write
1047                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
1048                  * provide that period, so a CPU could write host's record into
1049                  * guest's memory.
1050                  */
1051                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
1052         }
1053
1054         i = vmx_find_loadstore_msr_slot(&m->guest, msr);
1055         if (!entry_only)
1056                 j = vmx_find_loadstore_msr_slot(&m->host, msr);
1057
1058         if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
1059             (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
1060                 printk_once(KERN_WARNING "Not enough msr switch entries. "
1061                                 "Can't add msr %x\n", msr);
1062                 return;
1063         }
1064         if (i < 0) {
1065                 i = m->guest.nr++;
1066                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1067         }
1068         m->guest.val[i].index = msr;
1069         m->guest.val[i].value = guest_val;
1070
1071         if (entry_only)
1072                 return;
1073
1074         if (j < 0) {
1075                 j = m->host.nr++;
1076                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1077         }
1078         m->host.val[j].index = msr;
1079         m->host.val[j].value = host_val;
1080 }
1081
1082 static bool update_transition_efer(struct vcpu_vmx *vmx)
1083 {
1084         u64 guest_efer = vmx->vcpu.arch.efer;
1085         u64 ignore_bits = 0;
1086         int i;
1087
1088         /* Shadow paging assumes NX to be available.  */
1089         if (!enable_ept)
1090                 guest_efer |= EFER_NX;
1091
1092         /*
1093          * LMA and LME handled by hardware; SCE meaningless outside long mode.
1094          */
1095         ignore_bits |= EFER_SCE;
1096 #ifdef CONFIG_X86_64
1097         ignore_bits |= EFER_LMA | EFER_LME;
1098         /* SCE is meaningful only in long mode on Intel */
1099         if (guest_efer & EFER_LMA)
1100                 ignore_bits &= ~(u64)EFER_SCE;
1101 #endif
1102
1103         /*
1104          * On EPT, we can't emulate NX, so we must switch EFER atomically.
1105          * On CPUs that support "load IA32_EFER", always switch EFER
1106          * atomically, since it's faster than switching it manually.
1107          */
1108         if (cpu_has_load_ia32_efer() ||
1109             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
1110                 if (!(guest_efer & EFER_LMA))
1111                         guest_efer &= ~EFER_LME;
1112                 if (guest_efer != host_efer)
1113                         add_atomic_switch_msr(vmx, MSR_EFER,
1114                                               guest_efer, host_efer, false);
1115                 else
1116                         clear_atomic_switch_msr(vmx, MSR_EFER);
1117                 return false;
1118         }
1119
1120         i = kvm_find_user_return_msr(MSR_EFER);
1121         if (i < 0)
1122                 return false;
1123
1124         clear_atomic_switch_msr(vmx, MSR_EFER);
1125
1126         guest_efer &= ~ignore_bits;
1127         guest_efer |= host_efer & ignore_bits;
1128
1129         vmx->guest_uret_msrs[i].data = guest_efer;
1130         vmx->guest_uret_msrs[i].mask = ~ignore_bits;
1131
1132         return true;
1133 }
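
/*
 * Note on the return value: update_transition_efer() returns true when EFER
 * should be handled through the user-return MSR machinery (the guest value
 * is stashed in guest_uret_msrs and loaded lazily), and false when EFER is
 * either switched atomically via the VM-entry/VM-exit MSR-load controls or
 * needs no switching at all.
 */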
1134
1135 #ifdef CONFIG_X86_32
1136 /*
1137  * On 32-bit kernels, VM exits still load the FS and GS bases from the
1138  * VMCS rather than the segment table.  KVM uses this helper to figure
1139  * out the current bases to poke them into the VMCS before entry.
1140  */
1141 static unsigned long segment_base(u16 selector)
1142 {
1143         struct desc_struct *table;
1144         unsigned long v;
1145
1146         if (!(selector & ~SEGMENT_RPL_MASK))
1147                 return 0;
1148
1149         table = get_current_gdt_ro();
1150
1151         if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1152                 u16 ldt_selector = kvm_read_ldt();
1153
1154                 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1155                         return 0;
1156
1157                 table = (struct desc_struct *)segment_base(ldt_selector);
1158         }
1159         v = get_desc_base(&table[selector >> 3]);
1160         return v;
1161 }
1162 #endif
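
/*
 * For reference (plain x86 segmentation, not VMX-specific): selector bits 0-1
 * are the RPL, bit 2 (TI) selects GDT vs. LDT, and bits 3 and up index the
 * descriptor table, hence the SEGMENT_RPL_MASK/SEGMENT_TI_MASK tests and the
 * "selector >> 3" indexing in segment_base() above.
 */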
1163
1164 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1165 {
1166         return vmx_pt_mode_is_host_guest() &&
1167                !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1168 }
1169
1170 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
1171 {
1172         /* The base must be 128-byte aligned and a legal physical address. */
1173         return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
1174 }
1175
1176 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1177 {
1178         u32 i;
1179
1180         wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1181         wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1182         wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1183         wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1184         for (i = 0; i < addr_range; i++) {
1185                 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1186                 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1187         }
1188 }
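
/*
 * Note on the "+ i * 2" stride used here and in pt_save_msr() below: the PT
 * address-range MSRs are laid out as consecutive A/B pairs (ADDR0_A, ADDR0_B,
 * ADDR1_A, ADDR1_B, ...), so range i lives at MSR_IA32_RTIT_ADDR0_A/B + 2 * i.
 */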
1189
1190 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1191 {
1192         u32 i;
1193
1194         rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1195         rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1196         rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1197         rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1198         for (i = 0; i < addr_range; i++) {
1199                 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1200                 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1201         }
1202 }
1203
1204 static void pt_guest_enter(struct vcpu_vmx *vmx)
1205 {
1206         if (vmx_pt_mode_is_system())
1207                 return;
1208
1209         /*
1210          * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1211          * Save host state before VM entry.
1212          */
1213         rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1214         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1215                 wrmsrl(MSR_IA32_RTIT_CTL, 0);
1216                 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1217                 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1218         }
1219 }
1220
1221 static void pt_guest_exit(struct vcpu_vmx *vmx)
1222 {
1223         if (vmx_pt_mode_is_system())
1224                 return;
1225
1226         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1227                 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1228                 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1229         }
1230
1231         /*
1232          * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
1233          * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
1234          */
1235         if (vmx->pt_desc.host.ctl)
1236                 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1237 }
1238
1239 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1240                         unsigned long fs_base, unsigned long gs_base)
1241 {
1242         if (unlikely(fs_sel != host->fs_sel)) {
1243                 if (!(fs_sel & 7))
1244                         vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1245                 else
1246                         vmcs_write16(HOST_FS_SELECTOR, 0);
1247                 host->fs_sel = fs_sel;
1248         }
1249         if (unlikely(gs_sel != host->gs_sel)) {
1250                 if (!(gs_sel & 7))
1251                         vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1252                 else
1253                         vmcs_write16(HOST_GS_SELECTOR, 0);
1254                 host->gs_sel = gs_sel;
1255         }
1256         if (unlikely(fs_base != host->fs_base)) {
1257                 vmcs_writel(HOST_FS_BASE, fs_base);
1258                 host->fs_base = fs_base;
1259         }
1260         if (unlikely(gs_base != host->gs_base)) {
1261                 vmcs_writel(HOST_GS_BASE, gs_base);
1262                 host->gs_base = gs_base;
1263         }
1264 }
1265
1266 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1267 {
1268         struct vcpu_vmx *vmx = to_vmx(vcpu);
1269         struct vmcs_host_state *host_state;
1270 #ifdef CONFIG_X86_64
1271         int cpu = raw_smp_processor_id();
1272 #endif
1273         unsigned long fs_base, gs_base;
1274         u16 fs_sel, gs_sel;
1275         int i;
1276
1277         vmx->req_immediate_exit = false;
1278
1279         /*
1280          * Note that guest MSRs to be saved/restored can also be changed
1281          * when guest state is loaded. This happens when the guest transitions
1282          * to/from long mode by setting MSR_EFER.LMA.
1283          */
1284         if (!vmx->guest_uret_msrs_loaded) {
1285                 vmx->guest_uret_msrs_loaded = true;
1286                 for (i = 0; i < kvm_nr_uret_msrs; ++i) {
1287                         if (!vmx->guest_uret_msrs[i].load_into_hardware)
1288                                 continue;
1289
1290                         kvm_set_user_return_msr(i,
1291                                                 vmx->guest_uret_msrs[i].data,
1292                                                 vmx->guest_uret_msrs[i].mask);
1293                 }
1294         }
1295
1296         if (vmx->nested.need_vmcs12_to_shadow_sync)
1297                 nested_sync_vmcs12_to_shadow(vcpu);
1298
1299         if (vmx->guest_state_loaded)
1300                 return;
1301
1302         host_state = &vmx->loaded_vmcs->host_state;
1303
1304         /*
1305          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1306          * allow segment selectors with cpl > 0 or ti == 1.
1307          */
1308         host_state->ldt_sel = kvm_read_ldt();
1309
1310 #ifdef CONFIG_X86_64
1311         savesegment(ds, host_state->ds_sel);
1312         savesegment(es, host_state->es_sel);
1313
1314         gs_base = cpu_kernelmode_gs_base(cpu);
1315         if (likely(is_64bit_mm(current->mm))) {
1316                 current_save_fsgs();
1317                 fs_sel = current->thread.fsindex;
1318                 gs_sel = current->thread.gsindex;
1319                 fs_base = current->thread.fsbase;
1320                 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
1321         } else {
1322                 savesegment(fs, fs_sel);
1323                 savesegment(gs, gs_sel);
1324                 fs_base = read_msr(MSR_FS_BASE);
1325                 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1326         }
1327
1328         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1329 #else
1330         savesegment(fs, fs_sel);
1331         savesegment(gs, gs_sel);
1332         fs_base = segment_base(fs_sel);
1333         gs_base = segment_base(gs_sel);
1334 #endif
1335
1336         vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
1337         vmx->guest_state_loaded = true;
1338 }
1339
1340 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1341 {
1342         struct vmcs_host_state *host_state;
1343
1344         if (!vmx->guest_state_loaded)
1345                 return;
1346
1347         host_state = &vmx->loaded_vmcs->host_state;
1348
1349         ++vmx->vcpu.stat.host_state_reload;
1350
1351 #ifdef CONFIG_X86_64
1352         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1353 #endif
1354         if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1355                 kvm_load_ldt(host_state->ldt_sel);
1356 #ifdef CONFIG_X86_64
1357                 load_gs_index(host_state->gs_sel);
1358 #else
1359                 loadsegment(gs, host_state->gs_sel);
1360 #endif
1361         }
1362         if (host_state->fs_sel & 7)
1363                 loadsegment(fs, host_state->fs_sel);
1364 #ifdef CONFIG_X86_64
1365         if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1366                 loadsegment(ds, host_state->ds_sel);
1367                 loadsegment(es, host_state->es_sel);
1368         }
1369 #endif
1370         invalidate_tss_limit();
1371 #ifdef CONFIG_X86_64
1372         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1373 #endif
1374         load_fixmap_gdt(raw_smp_processor_id());
1375         vmx->guest_state_loaded = false;
1376         vmx->guest_uret_msrs_loaded = false;
1377 }
1378
1379 #ifdef CONFIG_X86_64
1380 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
1381 {
1382         preempt_disable();
1383         if (vmx->guest_state_loaded)
1384                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1385         preempt_enable();
1386         return vmx->msr_guest_kernel_gs_base;
1387 }
1388
1389 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1390 {
1391         preempt_disable();
1392         if (vmx->guest_state_loaded)
1393                 wrmsrl(MSR_KERNEL_GS_BASE, data);
1394         preempt_enable();
1395         vmx->msr_guest_kernel_gs_base = data;
1396 }
1397 #endif
1398
1399 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
1400                         struct loaded_vmcs *buddy)
1401 {
1402         struct vcpu_vmx *vmx = to_vmx(vcpu);
1403         bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1404         struct vmcs *prev;
1405
1406         if (!already_loaded) {
1407                 loaded_vmcs_clear(vmx->loaded_vmcs);
1408                 local_irq_disable();
1409
1410                 /*
1411                  * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
1412                  * this cpu's percpu list, otherwise it may not yet be deleted
1413                  * from its previous cpu's percpu list.  Pairs with the
1414                  * smp_wmb() in __loaded_vmcs_clear().
1415                  */
1416                 smp_rmb();
1417
1418                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1419                          &per_cpu(loaded_vmcss_on_cpu, cpu));
1420                 local_irq_enable();
1421         }
1422
1423         prev = per_cpu(current_vmcs, cpu);
1424         if (prev != vmx->loaded_vmcs->vmcs) {
1425                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1426                 vmcs_load(vmx->loaded_vmcs->vmcs);
1427
1428                 /*
1429                  * No indirect branch prediction barrier needed when switching
1430                  * the active VMCS within a vCPU, unless IBRS is advertised to
1431                  * the vCPU.  To minimize the number of IBPBs executed, KVM
1432                  * performs IBPB on nested VM-Exit (a single nested transition
1433                  * may switch the active VMCS multiple times).
1434                  */
1435                 if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
1436                         indirect_branch_prediction_barrier();
1437         }
1438
1439         if (!already_loaded) {
1440                 void *gdt = get_current_gdt_ro();
1441
1442                 /*
1443                  * Flush all EPTP/VPID contexts, the new pCPU may have stale
1444                  * TLB entries from its previous association with the vCPU.
1445                  */
1446                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1447
1448                 /*
1449                  * Linux uses per-cpu TSS and GDT, so set these when switching
1450                  * processors.  See 22.2.4.
1451                  */
1452                 vmcs_writel(HOST_TR_BASE,
1453                             (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1454                 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
1455
1456                 if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
1457                         /* 22.2.3 */
1458                         vmcs_writel(HOST_IA32_SYSENTER_ESP,
1459                                     (unsigned long)(cpu_entry_stack(cpu) + 1));
1460                 }
1461
1462                 vmx->loaded_vmcs->cpu = cpu;
1463         }
1464 }
1465
1466 /*
1467  * Switches to the specified vcpu, until a matching vcpu_put(), but assumes
1468  * the vcpu mutex is already taken.
1469  */
1470 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1471 {
1472         struct vcpu_vmx *vmx = to_vmx(vcpu);
1473
1474         vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
1475
1476         vmx_vcpu_pi_load(vcpu, cpu);
1477
1478         vmx->host_debugctlmsr = get_debugctlmsr();
1479 }
1480
1481 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1482 {
1483         vmx_vcpu_pi_put(vcpu);
1484
1485         vmx_prepare_switch_to_host(to_vmx(vcpu));
1486 }
1487
1488 bool vmx_emulation_required(struct kvm_vcpu *vcpu)
1489 {
1490         return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
1491 }
1492
1493 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1494 {
1495         struct vcpu_vmx *vmx = to_vmx(vcpu);
1496         unsigned long rflags, save_rflags;
1497
1498         if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
1499                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1500                 rflags = vmcs_readl(GUEST_RFLAGS);
1501                 if (vmx->rmode.vm86_active) {
1502                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1503                         save_rflags = vmx->rmode.save_rflags;
1504                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1505                 }
1506                 vmx->rflags = rflags;
1507         }
1508         return vmx->rflags;
1509 }
1510
1511 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1512 {
1513         struct vcpu_vmx *vmx = to_vmx(vcpu);
1514         unsigned long old_rflags;
1515
1516         /*
1517          * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
1518          * is an unrestricted guest in order to mark L2 as needing emulation
1519          * if L1 runs L2 as a restricted guest.
1520          */
1521         if (is_unrestricted_guest(vcpu)) {
1522                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1523                 vmx->rflags = rflags;
1524                 vmcs_writel(GUEST_RFLAGS, rflags);
1525                 return;
1526         }
1527
1528         old_rflags = vmx_get_rflags(vcpu);
1529         vmx->rflags = rflags;
1530         if (vmx->rmode.vm86_active) {
1531                 vmx->rmode.save_rflags = rflags;
1532                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1533         }
1534         vmcs_writel(GUEST_RFLAGS, rflags);
1535
1536         if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1537                 vmx->emulation_required = vmx_emulation_required(vcpu);
1538 }
1539
1540 static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
1541 {
1542         return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
1543 }
1544
1545 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1546 {
1547         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1548         int ret = 0;
1549
1550         if (interruptibility & GUEST_INTR_STATE_STI)
1551                 ret |= KVM_X86_SHADOW_INT_STI;
1552         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1553                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1554
1555         return ret;
1556 }
1557
1558 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1559 {
1560         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1561         u32 interruptibility = interruptibility_old;
1562
1563         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1564
1565         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1566                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1567         else if (mask & KVM_X86_SHADOW_INT_STI)
1568                 interruptibility |= GUEST_INTR_STATE_STI;
1569
1570         if (interruptibility != interruptibility_old)
1571                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1572 }
1573
1574 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1575 {
1576         struct vcpu_vmx *vmx = to_vmx(vcpu);
1577         unsigned long value;
1578
1579         /*
1580          * Any MSR write that attempts to change bits marked reserved will
1581          * cause a #GP fault.
1582          */
1583         if (data & vmx->pt_desc.ctl_bitmask)
1584                 return 1;
1585
1586         /*
1587          * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1588          * result in a #GP unless the same write also clears TraceEn.
1589          */
1590         if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1591                 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1592                 return 1;
1593
1594         /*
1595          * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears ToPA
1596          * and FabricEn will cause a #GP if
1597          * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
1598          */
1599         if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1600                 !(data & RTIT_CTL_FABRIC_EN) &&
1601                 !intel_pt_validate_cap(vmx->pt_desc.caps,
1602                                         PT_CAP_single_range_output))
1603                 return 1;
1604
1605         /*
1606          * MTCFreq, CycThresh and PSBFreq encoding checks: any MSR write
1607          * that uses an encoding marked reserved will cause a #GP fault.
1608          */
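        /*
         * For example, a hypothetical PT_CAP_mtc_periods value of 0x249 would
         * let only the MTCFreq encodings 0, 3, 6 and 9 pass the test_bit()
         * check below; all other encodings would be rejected as reserved.
         */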
1609         value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1610         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1611                         !test_bit((data & RTIT_CTL_MTC_RANGE) >>
1612                         RTIT_CTL_MTC_RANGE_OFFSET, &value))
1613                 return 1;
1614         value = intel_pt_validate_cap(vmx->pt_desc.caps,
1615                                                 PT_CAP_cycle_thresholds);
1616         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1617                         !test_bit((data & RTIT_CTL_CYC_THRESH) >>
1618                         RTIT_CTL_CYC_THRESH_OFFSET, &value))
1619                 return 1;
1620         value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1621         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1622                         !test_bit((data & RTIT_CTL_PSB_FREQ) >>
1623                         RTIT_CTL_PSB_FREQ_OFFSET, &value))
1624                 return 1;
1625
1626         /*
1627          * A write that uses a reserved ADDRx_CFG encoding (>2), or targets
1628          * an unsupported address range, will cause a #GP fault.
1629          */
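        /*
         * For reference: ADDRx_CFG encoding 0 disables the range, 1 uses it
         * for filtering and 2 uses it as a TraceStop region; anything above
         * 2 is reserved, hence the "> 2" check.
         */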
1630         value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1631         if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
1632                 return 1;
1633         value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1634         if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
1635                 return 1;
1636         value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1637         if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
1638                 return 1;
1639         value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1640         if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
1641                 return 1;
1642
1643         return 0;
1644 }
1645
1646 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
1647                                         void *insn, int insn_len)
1648 {
1649         /*
1650          * Emulation of instructions in SGX enclaves is impossible as RIP does
1651          * not point at the failing instruction, and even if it did, the code
1652          * stream is inaccessible.  Inject #UD instead of exiting to userspace
1653          * so that guest userspace can't DoS the guest simply by triggering
1654          * emulation (enclaves are CPL3 only).
1655          */
1656         if (to_vmx(vcpu)->exit_reason.enclave_mode) {
1657                 kvm_queue_exception(vcpu, UD_VECTOR);
1658                 return false;
1659         }
1660         return true;
1661 }
1662
1663 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
1664 {
1665         union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
1666         unsigned long rip, orig_rip;
1667         u32 instr_len;
1668
1669         /*
1670          * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1671          * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1672          * set when EPT misconfig occurs.  In practice, real hardware updates
1673          * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1674          * (namely Hyper-V) don't set it due to it being undefined behavior,
1675          * i.e. we end up advancing IP with some random value.
1676          */
1677         if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
1678             exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
1679                 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1680
1681                 /*
1682                  * Emulating an enclave's instructions isn't supported as KVM
1683                  * cannot access the enclave's memory or its true RIP, e.g. the
1684                  * vmcs.GUEST_RIP points at the exit point of the enclave, not
1685                  * the RIP that actually triggered the VM-Exit.  But, because
1686                  * most instructions that cause VM-Exit will #UD in an enclave,
1687                  * most instruction-based VM-Exits simply do not occur.
1688                  *
1689                  * There are a few exceptions, notably the debug instructions
1690                  * INT1ICEBRK and INT3, as they are allowed in debug enclaves
1691                  * and generate #DB/#BP as expected, which KVM might intercept.
1692                  * But again, the CPU does the dirty work and saves an instr
1693                  * length of zero so VMMs don't shoot themselves in the foot.
1694                  * WARN if KVM tries to skip a non-zero length instruction on
1695                  * a VM-Exit from an enclave.
1696                  */
1697                 if (!instr_len)
1698                         goto rip_updated;
1699
1700                 WARN_ONCE(exit_reason.enclave_mode,
1701                           "skipping instruction after SGX enclave VM-Exit");
1702
1703                 orig_rip = kvm_rip_read(vcpu);
1704                 rip = orig_rip + instr_len;
1705 #ifdef CONFIG_X86_64
1706                 /*
1707                  * We need to mask out the high 32 bits of RIP if not in 64-bit
1708                  * mode, but just finding out that we are in 64-bit mode is
1709                  * quite expensive.  Only do it if there was a carry.
1710                  */
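                /*
                 * The check below fires only when bits 31 and 32, and no
                 * higher bits, differ between orig_rip and rip, i.e. when the
                 * addition carried out of bit 31 of a 32-bit RIP.  E.g. a
                 * hypothetical orig_rip of 0xffffffff with instr_len 2 yields
                 * rip = 0x100000001 and (rip ^ orig_rip) >> 31 == 3.
                 */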
1711                 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1712                         rip = (u32)rip;
1713 #endif
1714                 kvm_rip_write(vcpu, rip);
1715         } else {
1716                 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1717                         return 0;
1718         }
1719
1720 rip_updated:
1721         /* skipping an emulated instruction also counts */
1722         vmx_set_interrupt_shadow(vcpu, 0);
1723
1724         return 1;
1725 }
1726
1727 /*
1728  * Recognizes a pending MTF VM-exit and records the nested state for later
1729  * delivery.
1730  */
1731 static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1732 {
1733         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1734         struct vcpu_vmx *vmx = to_vmx(vcpu);
1735
1736         if (!is_guest_mode(vcpu))
1737                 return;
1738
1739         /*
1740          * Per the SDM, MTF takes priority over debug-trap exceptions besides
1741          * TSS T-bit traps and ICEBP (INT1).  KVM doesn't emulate T-bit traps
1742          * or ICEBP (in the emulator proper), and skipping of ICEBP after an
1743          * intercepted #DB deliberately avoids single-step #DB and MTF updates
1744          * as ICEBP is higher priority than both.  As instruction emulation is
1745          * completed at this point (i.e. KVM is at the instruction boundary),
1746          * any #DB exception pending delivery must be a debug-trap of lower
1747          * priority than MTF.  Record the pending MTF state to be delivered in
1748          * vmx_check_nested_events().
1749          */
1750         if (nested_cpu_has_mtf(vmcs12) &&
1751             (!vcpu->arch.exception.pending ||
1752              vcpu->arch.exception.vector == DB_VECTOR) &&
1753             (!vcpu->arch.exception_vmexit.pending ||
1754              vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
1755                 vmx->nested.mtf_pending = true;
1756                 kvm_make_request(KVM_REQ_EVENT, vcpu);
1757         } else {
1758                 vmx->nested.mtf_pending = false;
1759         }
1760 }
1761
1762 static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
1763 {
1764         vmx_update_emulated_instruction(vcpu);
1765         return skip_emulated_instruction(vcpu);
1766 }
1767
1768 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1769 {
1770         /*
1771          * Ensure that we clear the HLT state in the VMCS.  We don't need to
1772          * explicitly skip the instruction because if the HLT state is set,
1773          * then the instruction is already executing and RIP has already been
1774          * advanced.
1775          */
1776         if (kvm_hlt_in_guest(vcpu->kvm) &&
1777                         vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1778                 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1779 }
1780
1781 static void vmx_inject_exception(struct kvm_vcpu *vcpu)
1782 {
1783         struct kvm_queued_exception *ex = &vcpu->arch.exception;
1784         u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
1785         struct vcpu_vmx *vmx = to_vmx(vcpu);
1786
1787         kvm_deliver_exception_payload(vcpu, ex);
1788
1789         if (ex->has_error_code) {
1790                 /*
1791                  * Despite the error code being architecturally defined as 32
1792                  * bits, and the VMCS field being 32 bits, Intel CPUs and thus
1793                  * VMX don't actually support setting bits 31:16.  Hardware
1794                  * will (should) never provide a bogus error code, but AMD CPUs
1795                  * do generate error codes with bits 31:16 set, and so KVM's
1796                  * ABI lets userspace shove in arbitrary 32-bit values.  Drop
1797                  * the upper bits to avoid VM-Fail, losing information that
1798                  * doesn't really exist is preferable to killing the VM.
1799                  */
1800                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
1801                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1802         }
1803
1804         if (vmx->rmode.vm86_active) {
1805                 int inc_eip = 0;
1806                 if (kvm_exception_is_soft(ex->vector))
1807                         inc_eip = vcpu->arch.event_exit_inst_len;
1808                 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
1809                 return;
1810         }
1811
1812         WARN_ON_ONCE(vmx->emulation_required);
1813
1814         if (kvm_exception_is_soft(ex->vector)) {
1815                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1816                              vmx->vcpu.arch.event_exit_inst_len);
1817                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1818         } else
1819                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1820
1821         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1822
1823         vmx_clear_hlt(vcpu);
1824 }
1825
1826 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
1827                                bool load_into_hardware)
1828 {
1829         struct vmx_uret_msr *uret_msr;
1830
1831         uret_msr = vmx_find_uret_msr(vmx, msr);
1832         if (!uret_msr)
1833                 return;
1834
1835         uret_msr->load_into_hardware = load_into_hardware;
1836 }
1837
1838 /*
1839  * Configure user return MSRs to automatically save, load, and restore MSRs
1840  * that need to be shoved into hardware when running the guest.  Note, omitting
1841  * an MSR here does _NOT_ mean it's not emulated, only that it will not be
1842  * loaded into hardware when running the guest.
1843  */
1844 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
1845 {
1846 #ifdef CONFIG_X86_64
1847         bool load_syscall_msrs;
1848
1849         /*
1850          * The SYSCALL MSRs are only needed on long mode guests, and only
1851          * when EFER.SCE is set.
1852          */
1853         load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
1854                             (vmx->vcpu.arch.efer & EFER_SCE);
1855
1856         vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
1857         vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
1858         vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
1859 #endif
1860         vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
1861
1862         vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
1863                            guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
1864                            guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
1865
1866         /*
1867          * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
1868          * kernel and old userspace.  If those guests run on a tsx=off host, do
1869          * allow guests to use TSX_CTRL, but don't change the value in hardware
1870          * so that TSX remains always disabled.
1871          */
1872         vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
1873
1874         /*
1875          * The set of MSRs to load may have changed, reload MSRs before the
1876          * next VM-Enter.
1877          */
1878         vmx->guest_uret_msrs_loaded = false;
1879 }
1880
1881 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1882 {
1883         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1884
1885         if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
1886                 return vmcs12->tsc_offset;
1887
1888         return 0;
1889 }
1890
1891 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1892 {
1893         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1894
1895         if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
1896             nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
1897                 return vmcs12->tsc_multiplier;
1898
1899         return kvm_caps.default_tsc_scaling_ratio;
1900 }
1901
1902 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1903 {
1904         vmcs_write64(TSC_OFFSET, offset);
1905 }
1906
1907 static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1908 {
1909         vmcs_write64(TSC_MULTIPLIER, multiplier);
1910 }
1911
1912 /*
1913  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1914  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1915  * all guests if the "nested" module option is off, and can also be disabled
1916  * for a single guest by disabling its VMX cpuid bit.
1917  */
1918 bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1919 {
1920         return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
1921 }
1922
1923 /*
1924  * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
1925  * guest CPUID.  Note, KVM allows userspace to set "VMX in SMX" to maintain
1926  * backwards compatibility even though KVM doesn't support emulating SMX.  And
1927  * because userspace can set "VMX in SMX", the guest must also be allowed to set it,
1928  * e.g. if the MSR is left unlocked and the guest does a RMW operation.
1929  */
1930 #define KVM_SUPPORTED_FEATURE_CONTROL  (FEAT_CTL_LOCKED                  | \
1931                                         FEAT_CTL_VMX_ENABLED_INSIDE_SMX  | \
1932                                         FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
1933                                         FEAT_CTL_SGX_LC_ENABLED          | \
1934                                         FEAT_CTL_SGX_ENABLED             | \
1935                                         FEAT_CTL_LMCE_ENABLED)
1936
1937 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
1938                                                     struct msr_data *msr)
1939 {
1940         uint64_t valid_bits;
1941
1942         /*
1943          * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
1944          * exposed to the guest.
1945          */
1946         WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
1947                      ~KVM_SUPPORTED_FEATURE_CONTROL);
1948
1949         if (!msr->host_initiated &&
1950             (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
1951                 return false;
1952
1953         if (msr->host_initiated)
1954                 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
1955         else
1956                 valid_bits = vmx->msr_ia32_feature_control_valid_bits;
1957
1958         return !(msr->data & ~valid_bits);
1959 }
1960
1961 static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
1962 {
1963         switch (msr->index) {
1964         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
1965                 if (!nested)
1966                         return 1;
1967                 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1968         default:
1969                 return KVM_MSR_RET_INVALID;
1970         }
1971 }
1972
1973 /*
1974  * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
1975  * Returns 0 on success, non-0 otherwise.
1976  * Assumes vcpu_load() was already called.
1977  */
1978 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1979 {
1980         struct vcpu_vmx *vmx = to_vmx(vcpu);
1981         struct vmx_uret_msr *msr;
1982         u32 index;
1983
1984         switch (msr_info->index) {
1985 #ifdef CONFIG_X86_64
1986         case MSR_FS_BASE:
1987                 msr_info->data = vmcs_readl(GUEST_FS_BASE);
1988                 break;
1989         case MSR_GS_BASE:
1990                 msr_info->data = vmcs_readl(GUEST_GS_BASE);
1991                 break;
1992         case MSR_KERNEL_GS_BASE:
1993                 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
1994                 break;
1995 #endif
1996         case MSR_EFER:
1997                 return kvm_get_msr_common(vcpu, msr_info);
1998         case MSR_IA32_TSX_CTRL:
1999                 if (!msr_info->host_initiated &&
2000                     !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2001                         return 1;
2002                 goto find_uret_msr;
2003         case MSR_IA32_UMWAIT_CONTROL:
2004                 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2005                         return 1;
2006
2007                 msr_info->data = vmx->msr_ia32_umwait_control;
2008                 break;
2009         case MSR_IA32_SPEC_CTRL:
2010                 if (!msr_info->host_initiated &&
2011                     !guest_has_spec_ctrl_msr(vcpu))
2012                         return 1;
2013
2014                 msr_info->data = to_vmx(vcpu)->spec_ctrl;
2015                 break;
2016         case MSR_IA32_SYSENTER_CS:
2017                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2018                 break;
2019         case MSR_IA32_SYSENTER_EIP:
2020                 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
2021                 break;
2022         case MSR_IA32_SYSENTER_ESP:
2023                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
2024                 break;
2025         case MSR_IA32_BNDCFGS:
2026                 if (!kvm_mpx_supported() ||
2027                     (!msr_info->host_initiated &&
2028                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2029                         return 1;
2030                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
2031                 break;
2032         case MSR_IA32_MCG_EXT_CTL:
2033                 if (!msr_info->host_initiated &&
2034                     !(vmx->msr_ia32_feature_control &
2035                       FEAT_CTL_LMCE_ENABLED))
2036                         return 1;
2037                 msr_info->data = vcpu->arch.mcg_ext_ctl;
2038                 break;
2039         case MSR_IA32_FEAT_CTL:
2040                 msr_info->data = vmx->msr_ia32_feature_control;
2041                 break;
2042         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2043                 if (!msr_info->host_initiated &&
2044                     !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
2045                         return 1;
2046                 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
2047                         [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
2048                 break;
2049         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2050                 if (!nested_vmx_allowed(vcpu))
2051                         return 1;
2052                 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
2053                                     &msr_info->data))
2054                         return 1;
2055                 /*
2056                  * Enlightened VMCS v1 doesn't have certain VMCS fields,
2057                  * but instead of just ignoring the missing features,
2058                  * different Hyper-V versions either try to use them and
2059                  * fail, or do sanity checking and refuse to boot.  Filter
2060                  * out all unsupported features.
2061                  */
2062                 if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
2063                         nested_evmcs_filter_control_msr(vcpu, msr_info->index,
2064                                                         &msr_info->data);
2065                 break;
2066         case MSR_IA32_RTIT_CTL:
2067                 if (!vmx_pt_mode_is_host_guest())
2068                         return 1;
2069                 msr_info->data = vmx->pt_desc.guest.ctl;
2070                 break;
2071         case MSR_IA32_RTIT_STATUS:
2072                 if (!vmx_pt_mode_is_host_guest())
2073                         return 1;
2074                 msr_info->data = vmx->pt_desc.guest.status;
2075                 break;
2076         case MSR_IA32_RTIT_CR3_MATCH:
2077                 if (!vmx_pt_mode_is_host_guest() ||
2078                         !intel_pt_validate_cap(vmx->pt_desc.caps,
2079                                                 PT_CAP_cr3_filtering))
2080                         return 1;
2081                 msr_info->data = vmx->pt_desc.guest.cr3_match;
2082                 break;
2083         case MSR_IA32_RTIT_OUTPUT_BASE:
2084                 if (!vmx_pt_mode_is_host_guest() ||
2085                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
2086                                         PT_CAP_topa_output) &&
2087                          !intel_pt_validate_cap(vmx->pt_desc.caps,
2088                                         PT_CAP_single_range_output)))
2089                         return 1;
2090                 msr_info->data = vmx->pt_desc.guest.output_base;
2091                 break;
2092         case MSR_IA32_RTIT_OUTPUT_MASK:
2093                 if (!vmx_pt_mode_is_host_guest() ||
2094                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
2095                                         PT_CAP_topa_output) &&
2096                          !intel_pt_validate_cap(vmx->pt_desc.caps,
2097                                         PT_CAP_single_range_output)))
2098                         return 1;
2099                 msr_info->data = vmx->pt_desc.guest.output_mask;
2100                 break;
2101         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2102                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2103                 if (!vmx_pt_mode_is_host_guest() ||
2104                     (index >= 2 * vmx->pt_desc.num_address_ranges))
2105                         return 1;
2106                 if (index % 2)
2107                         msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
2108                 else
2109                         msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
2110                 break;
2111         case MSR_IA32_DEBUGCTLMSR:
2112                 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
2113                 break;
2114         default:
2115         find_uret_msr:
2116                 msr = vmx_find_uret_msr(vmx, msr_info->index);
2117                 if (msr) {
2118                         msr_info->data = msr->data;
2119                         break;
2120                 }
2121                 return kvm_get_msr_common(vcpu, msr_info);
2122         }
2123
2124         return 0;
2125 }
2126
2127 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
2128                                                     u64 data)
2129 {
2130 #ifdef CONFIG_X86_64
2131         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
2132                 return (u32)data;
2133 #endif
2134         return (unsigned long)data;
2135 }
2136
2137 static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
2138 {
2139         u64 debugctl = 0;
2140
2141         if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
2142             (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
2143                 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
2144
2145         if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
2146             (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
2147                 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
2148
2149         return debugctl;
2150 }
2151
2152 /*
2153  * Writes msr value into the appropriate "register".
2154  * Returns 0 on success, non-0 otherwise.
2155  * Assumes vcpu_load() was already called.
2156  */
2157 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2158 {
2159         struct vcpu_vmx *vmx = to_vmx(vcpu);
2160         struct vmx_uret_msr *msr;
2161         int ret = 0;
2162         u32 msr_index = msr_info->index;
2163         u64 data = msr_info->data;
2164         u32 index;
2165
2166         switch (msr_index) {
2167         case MSR_EFER:
2168                 ret = kvm_set_msr_common(vcpu, msr_info);
2169                 break;
2170 #ifdef CONFIG_X86_64
2171         case MSR_FS_BASE:
2172                 vmx_segment_cache_clear(vmx);
2173                 vmcs_writel(GUEST_FS_BASE, data);
2174                 break;
2175         case MSR_GS_BASE:
2176                 vmx_segment_cache_clear(vmx);
2177                 vmcs_writel(GUEST_GS_BASE, data);
2178                 break;
2179         case MSR_KERNEL_GS_BASE:
2180                 vmx_write_guest_kernel_gs_base(vmx, data);
2181                 break;
2182         case MSR_IA32_XFD:
2183                 ret = kvm_set_msr_common(vcpu, msr_info);
2184                 /*
2185                  * Always intercepting WRMSR could incur non-negligible
2186                  * overhead given that XFD may be changed frequently on
2187                  * guest context switches.  Disable write interception
2188                  * upon the first write with a non-zero value (indicating
2189                  * potential use of dynamically-enabled xfeatures).  Also
2190                  * update the exception bitmap to trap #NM for proper
2191                  * virtualization of the guest's XFD_ERR.
2192                  */
2193                 if (!ret && data) {
2194                         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
2195                                                       MSR_TYPE_RW);
2196                         vcpu->arch.xfd_no_write_intercept = true;
2197                         vmx_update_exception_bitmap(vcpu);
2198                 }
2199                 break;
2200 #endif
2201         case MSR_IA32_SYSENTER_CS:
2202                 if (is_guest_mode(vcpu))
2203                         get_vmcs12(vcpu)->guest_sysenter_cs = data;
2204                 vmcs_write32(GUEST_SYSENTER_CS, data);
2205                 break;
2206         case MSR_IA32_SYSENTER_EIP:
2207                 if (is_guest_mode(vcpu)) {
2208                         data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2209                         get_vmcs12(vcpu)->guest_sysenter_eip = data;
2210                 }
2211                 vmcs_writel(GUEST_SYSENTER_EIP, data);
2212                 break;
2213         case MSR_IA32_SYSENTER_ESP:
2214                 if (is_guest_mode(vcpu)) {
2215                         data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2216                         get_vmcs12(vcpu)->guest_sysenter_esp = data;
2217                 }
2218                 vmcs_writel(GUEST_SYSENTER_ESP, data);
2219                 break;
2220         case MSR_IA32_DEBUGCTLMSR: {
2221                 u64 invalid;
2222
2223                 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
2224                 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
2225                         kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
2226                         data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2227                         invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2228                 }
2229
2230                 if (invalid)
2231                         return 1;
2232
2233                 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2234                                                 VM_EXIT_SAVE_DEBUG_CONTROLS)
2235                         get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2236
2237                 vmcs_write64(GUEST_IA32_DEBUGCTL, data);
2238                 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
2239                     (data & DEBUGCTLMSR_LBR))
2240                         intel_pmu_create_guest_lbr_event(vcpu);
2241                 return 0;
2242         }
2243         case MSR_IA32_BNDCFGS:
2244                 if (!kvm_mpx_supported() ||
2245                     (!msr_info->host_initiated &&
2246                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2247                         return 1;
2248                 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
2249                     (data & MSR_IA32_BNDCFGS_RSVD))
2250                         return 1;
2251
2252                 if (is_guest_mode(vcpu) &&
2253                     ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
2254                      (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
2255                         get_vmcs12(vcpu)->guest_bndcfgs = data;
2256
2257                 vmcs_write64(GUEST_BNDCFGS, data);
2258                 break;
2259         case MSR_IA32_UMWAIT_CONTROL:
2260                 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2261                         return 1;
2262
2263                 /* Reserved bit 1 and the upper bits [63:32] must be zero. */
2264                 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2265                         return 1;
2266
2267                 vmx->msr_ia32_umwait_control = data;
2268                 break;
2269         case MSR_IA32_SPEC_CTRL:
2270                 if (!msr_info->host_initiated &&
2271                     !guest_has_spec_ctrl_msr(vcpu))
2272                         return 1;
2273
2274                 if (kvm_spec_ctrl_test_value(data))
2275                         return 1;
2276
2277                 vmx->spec_ctrl = data;
2278                 if (!data)
2279                         break;
2280
2281                 /*
2282                  * For non-nested:
2283                  * When it's written (to non-zero) for the first time, pass
2284                  * it through.
2285                  *
2286                  * For nested:
2287                  * The handling of the MSR bitmap for L2 guests is done in
2288                  * nested_vmx_prepare_msr_bitmap. We should not touch the
2289                  * vmcs02.msr_bitmap here since it gets completely overwritten
2290                  * in the merging. We update the vmcs01 here for L1 as well
2291                  * since it will end up touching the MSR anyway now.
2292                  */
2293                 vmx_disable_intercept_for_msr(vcpu,
2294                                               MSR_IA32_SPEC_CTRL,
2295                                               MSR_TYPE_RW);
2296                 break;
2297         case MSR_IA32_TSX_CTRL:
2298                 if (!msr_info->host_initiated &&
2299                     !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2300                         return 1;
2301                 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2302                         return 1;
2303                 goto find_uret_msr;
2304         case MSR_IA32_CR_PAT:
2305                 ret = kvm_set_msr_common(vcpu, msr_info);
2306                 if (ret)
2307                         break;
2308
2309                 if (is_guest_mode(vcpu) &&
2310                     get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2311                         get_vmcs12(vcpu)->guest_ia32_pat = data;
2312
2313                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
2314                         vmcs_write64(GUEST_IA32_PAT, data);
2315                 break;
2316         case MSR_IA32_MCG_EXT_CTL:
2317                 if ((!msr_info->host_initiated &&
2318                      !(to_vmx(vcpu)->msr_ia32_feature_control &
2319                        FEAT_CTL_LMCE_ENABLED)) ||
2320                     (data & ~MCG_EXT_CTL_LMCE_EN))
2321                         return 1;
2322                 vcpu->arch.mcg_ext_ctl = data;
2323                 break;
2324         case MSR_IA32_FEAT_CTL:
2325                 if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
2326                         return 1;
2327
2328                 vmx->msr_ia32_feature_control = data;
2329                 if (msr_info->host_initiated && data == 0)
2330                         vmx_leave_nested(vcpu);
2331
2332                 /* SGX may be enabled/disabled by guest's firmware */
2333                 vmx_write_encls_bitmap(vcpu, NULL);
2334                 break;
2335         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2336                 /*
2337                  * On real hardware, the LE hash MSRs are writable before
2338                  * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
2339                  * at which point SGX related bits in IA32_FEATURE_CONTROL
2340                  * become writable.
2341                  *
2342                  * KVM does not emulate SGX activation for simplicity, so
2343                  * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
2344                  * is unlocked.  This is technically not architectural
2345                  * behavior, but it's close enough.
2346                  */
2347                 if (!msr_info->host_initiated &&
2348                     (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
2349                     ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
2350                     !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
2351                         return 1;
2352                 vmx->msr_ia32_sgxlepubkeyhash
2353                         [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
2354                 break;
2355         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2356                 if (!msr_info->host_initiated)
2357                         return 1; /* they are read-only */
2358                 if (!nested_vmx_allowed(vcpu))
2359                         return 1;
2360                 return vmx_set_vmx_msr(vcpu, msr_index, data);
2361         case MSR_IA32_RTIT_CTL:
2362                 if (!vmx_pt_mode_is_host_guest() ||
2363                         vmx_rtit_ctl_check(vcpu, data) ||
2364                         vmx->nested.vmxon)
2365                         return 1;
2366                 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2367                 vmx->pt_desc.guest.ctl = data;
2368                 pt_update_intercept_for_msr(vcpu);
2369                 break;
2370         case MSR_IA32_RTIT_STATUS:
2371                 if (!pt_can_write_msr(vmx))
2372                         return 1;
2373                 if (data & MSR_IA32_RTIT_STATUS_MASK)
2374                         return 1;
2375                 vmx->pt_desc.guest.status = data;
2376                 break;
2377         case MSR_IA32_RTIT_CR3_MATCH:
2378                 if (!pt_can_write_msr(vmx))
2379                         return 1;
2380                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2381                                            PT_CAP_cr3_filtering))
2382                         return 1;
2383                 vmx->pt_desc.guest.cr3_match = data;
2384                 break;
2385         case MSR_IA32_RTIT_OUTPUT_BASE:
2386                 if (!pt_can_write_msr(vmx))
2387                         return 1;
2388                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2389                                            PT_CAP_topa_output) &&
2390                     !intel_pt_validate_cap(vmx->pt_desc.caps,
2391                                            PT_CAP_single_range_output))
2392                         return 1;
2393                 if (!pt_output_base_valid(vcpu, data))
2394                         return 1;
2395                 vmx->pt_desc.guest.output_base = data;
2396                 break;
2397         case MSR_IA32_RTIT_OUTPUT_MASK:
2398                 if (!pt_can_write_msr(vmx))
2399                         return 1;
2400                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2401                                            PT_CAP_topa_output) &&
2402                     !intel_pt_validate_cap(vmx->pt_desc.caps,
2403                                            PT_CAP_single_range_output))
2404                         return 1;
2405                 vmx->pt_desc.guest.output_mask = data;
2406                 break;
2407         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2408                 if (!pt_can_write_msr(vmx))
2409                         return 1;
2410                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2411                 if (index >= 2 * vmx->pt_desc.num_address_ranges)
2412                         return 1;
2413                 if (is_noncanonical_address(data, vcpu))
2414                         return 1;
2415                 if (index % 2)
2416                         vmx->pt_desc.guest.addr_b[index / 2] = data;
2417                 else
2418                         vmx->pt_desc.guest.addr_a[index / 2] = data;
2419                 break;
2420         case MSR_IA32_PERF_CAPABILITIES:
2421                 if (data && !vcpu_to_pmu(vcpu)->version)
2422                         return 1;
2423                 if (data & PMU_CAP_LBR_FMT) {
2424                         if ((data & PMU_CAP_LBR_FMT) !=
2425                             (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
2426                                 return 1;
2427                         if (!cpuid_model_is_consistent(vcpu))
2428                                 return 1;
2429                 }
2430                 if (data & PERF_CAP_PEBS_FORMAT) {
2431                         if ((data & PERF_CAP_PEBS_MASK) !=
2432                             (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
2433                                 return 1;
2434                         if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
2435                                 return 1;
2436                         if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
2437                                 return 1;
2438                         if (!cpuid_model_is_consistent(vcpu))
2439                                 return 1;
2440                 }
2441                 ret = kvm_set_msr_common(vcpu, msr_info);
2442                 break;
2443
2444         default:
2445         find_uret_msr:
2446                 msr = vmx_find_uret_msr(vmx, msr_index);
2447                 if (msr)
2448                         ret = vmx_set_guest_uret_msr(vmx, msr, data);
2449                 else
2450                         ret = kvm_set_msr_common(vcpu, msr_info);
2451         }
2452
2453         /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
2454         if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
2455                 vmx_update_fb_clear_dis(vcpu, vmx);
2456
2457         return ret;
2458 }
2459
2460 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2461 {
2462         unsigned long guest_owned_bits;
2463
2464         kvm_register_mark_available(vcpu, reg);
2465
2466         switch (reg) {
2467         case VCPU_REGS_RSP:
2468                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2469                 break;
2470         case VCPU_REGS_RIP:
2471                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2472                 break;
2473         case VCPU_EXREG_PDPTR:
2474                 if (enable_ept)
2475                         ept_save_pdptrs(vcpu);
2476                 break;
2477         case VCPU_EXREG_CR0:
2478                 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2479
2480                 vcpu->arch.cr0 &= ~guest_owned_bits;
2481                 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2482                 break;
2483         case VCPU_EXREG_CR3:
2484                 /*
2485                  * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
2486                  * CR3 is loaded into hardware, not the guest's CR3.
2487                  */
2488                 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
2489                         vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2490                 break;
2491         case VCPU_EXREG_CR4:
2492                 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2493
2494                 vcpu->arch.cr4 &= ~guest_owned_bits;
2495                 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2496                 break;
2497         default:
2498                 KVM_BUG_ON(1, vcpu->kvm);
2499                 break;
2500         }
2501 }
2502
2503 /*
2504  * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2505  * directly instead of going through cpu_has(), to ensure KVM is trapping
2506  * ENCLS whenever it's supported in hardware.  It does not matter whether
2507  * the host OS supports or has enabled SGX.
2508  */
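/*
 * For reference: CPUID.(EAX=0x12, ECX=0):EAX bit 0 reports SGX1, which is
 * what the BIT(0) check below keys off of.
 */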
2509 static bool cpu_has_sgx(void)
2510 {
2511         return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2512 }
2513
2514 /*
2515  * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2516  * can't be used due to errata where VM Exit may incorrectly clear
2517  * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the
2518  * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2519  */
2520 static bool cpu_has_perf_global_ctrl_bug(void)
2521 {
2522         if (boot_cpu_data.x86 == 0x6) {
2523                 switch (boot_cpu_data.x86_model) {
2524                 case INTEL_FAM6_NEHALEM_EP:     /* AAK155 */
2525                 case INTEL_FAM6_NEHALEM:        /* AAP115 */
2526                 case INTEL_FAM6_WESTMERE:       /* AAT100 */
2527                 case INTEL_FAM6_WESTMERE_EP:    /* BC86,AAY89,BD102 */
2528                 case INTEL_FAM6_NEHALEM_EX:     /* BA97 */
2529                         return true;
2530                 default:
2531                         break;
2532                 }
2533         }
2534
2535         return false;
2536 }
2537
2538 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
2539 {
2540         u32 vmx_msr_low, vmx_msr_high;
2541         u32 ctl = ctl_min | ctl_opt;
2542
2543         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2544
2545         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2546         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
2547
2548         /* Ensure minimum (required) set of control bits are supported. */
2549         if (ctl_min & ~ctl)
2550                 return -EIO;
2551
2552         *result = ctl;
2553         return 0;
2554 }
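/*
 * Worked example with hypothetical values: ctl_min = 0x02 and ctl_opt = 0x80
 * give ctl = 0x82.  If the capability MSR's high word (allowed 1-settings)
 * is 0x07, the optional bit 7 is stripped (ctl = 0x02); if its low word
 * (bits that must be 1) is 0x01, bit 0 is forced on (ctl = 0x03).  No
 * ctl_min bit was lost, so adjust_vmx_controls() succeeds with *result = 0x03.
 */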
2555
2556 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
2557 {
2558         u64 allowed;
2559
2560         rdmsrl(msr, allowed);
2561
2562         return  ctl_opt & allowed;
2563 }
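/*
 * The 64-bit capability MSR read above (e.g. IA32_VMX_PROCBASED_CTLS3)
 * reports only allowed 1-settings, so unlike adjust_vmx_controls() there is
 * no minimum set of bits to enforce.
 */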
2564
2565 static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2566                              struct vmx_capability *vmx_cap)
2567 {
2568         u32 vmx_msr_low, vmx_msr_high;
2569         u32 _pin_based_exec_control = 0;
2570         u32 _cpu_based_exec_control = 0;
2571         u32 _cpu_based_2nd_exec_control = 0;
2572         u64 _cpu_based_3rd_exec_control = 0;
2573         u32 _vmexit_control = 0;
2574         u32 _vmentry_control = 0;
2575         u64 misc_msr;
2576         int i;
2577
2578         /*
2579          * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
2580          * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
2581          * intercepts writes to PAT and EFER, i.e. never enables those controls.
2582          */
2583         struct {
2584                 u32 entry_control;
2585                 u32 exit_control;
2586         } const vmcs_entry_exit_pairs[] = {
2587                 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,  VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
2588                 { VM_ENTRY_LOAD_IA32_PAT,               VM_EXIT_LOAD_IA32_PAT },
2589                 { VM_ENTRY_LOAD_IA32_EFER,              VM_EXIT_LOAD_IA32_EFER },
2590                 { VM_ENTRY_LOAD_BNDCFGS,                VM_EXIT_CLEAR_BNDCFGS },
2591                 { VM_ENTRY_LOAD_IA32_RTIT_CTL,          VM_EXIT_CLEAR_IA32_RTIT_CTL },
2592         };
2593
2594         memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2595
2596         if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
2597                                 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
2598                                 MSR_IA32_VMX_PROCBASED_CTLS,
2599                                 &_cpu_based_exec_control))
2600                 return -EIO;
2601         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2602                 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
2603                                         KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
2604                                         MSR_IA32_VMX_PROCBASED_CTLS2,
2605                                         &_cpu_based_2nd_exec_control))
2606                         return -EIO;
2607         }
2608 #ifndef CONFIG_X86_64
2609         if (!(_cpu_based_2nd_exec_control &
2610                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2611                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2612 #endif
2613
2614         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2615                 _cpu_based_2nd_exec_control &= ~(
2616                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2617                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2618                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2619
2620         rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2621                 &vmx_cap->ept, &vmx_cap->vpid);
2622
2623         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
2624             vmx_cap->ept) {
2625                 pr_warn_once("EPT capability reported even though the 1-setting of "
2626                                 "the enable-EPT VM-execution control is not supported\n");
2627
2628                 if (error_on_inconsistent_vmcs_config)
2629                         return -EIO;
2630
2631                 vmx_cap->ept = 0;
2632         }
2633         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2634             vmx_cap->vpid) {
2635                 pr_warn_once("VPID capability reported even though the 1-setting of "
2636                                 "the enable-VPID VM-execution control is not supported\n");
2637
2638                 if (error_on_inconsistent_vmcs_config)
2639                         return -EIO;
2640
2641                 vmx_cap->vpid = 0;
2642         }
2643
2644         if (!cpu_has_sgx())
2645                 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
2646
2647         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
2648                 _cpu_based_3rd_exec_control =
2649                         adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
2650                                               MSR_IA32_VMX_PROCBASED_CTLS3);
2651
2652         if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
2653                                 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
2654                                 MSR_IA32_VMX_EXIT_CTLS,
2655                                 &_vmexit_control))
2656                 return -EIO;
2657
2658         if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
2659                                 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
2660                                 MSR_IA32_VMX_PINBASED_CTLS,
2661                                 &_pin_based_exec_control))
2662                 return -EIO;
2663
2664         if (cpu_has_broken_vmx_preemption_timer())
2665                 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2666         if (!(_cpu_based_2nd_exec_control &
2667                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2668                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2669
2670         if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
2671                                 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
2672                                 MSR_IA32_VMX_ENTRY_CTLS,
2673                                 &_vmentry_control))
2674                 return -EIO;
2675
2676         for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
2677                 u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
2678                 u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
2679
2680                 if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
2681                         continue;
2682
2683                 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
2684                              _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
2685
2686                 if (error_on_inconsistent_vmcs_config)
2687                         return -EIO;
2688
2689                 _vmentry_control &= ~n_ctrl;
2690                 _vmexit_control &= ~x_ctrl;
2691         }
2692
2693         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2694
2695         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2696         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2697                 return -EIO;
2698
2699 #ifdef CONFIG_X86_64
2700         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2701         if (vmx_msr_high & (1u<<16))
2702                 return -EIO;
2703 #endif
2704
2705         /* Require Write-Back (WB) memory type for VMCS accesses. */
2706         if (((vmx_msr_high >> 18) & 15) != 6)
2707                 return -EIO;
2708
2709         rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
2710
2711         vmcs_conf->size = vmx_msr_high & 0x1fff;
2712         vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
2713
2714         vmcs_conf->revision_id = vmx_msr_low;
2715
2716         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2717         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2718         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2719         vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
2720         vmcs_conf->vmexit_ctrl         = _vmexit_control;
2721         vmcs_conf->vmentry_ctrl        = _vmentry_control;
2722         vmcs_conf->misc = misc_msr;
2723
2724 #if IS_ENABLED(CONFIG_HYPERV)
2725         if (enlightened_vmcs)
2726                 evmcs_sanitize_exec_ctrls(vmcs_conf);
2727 #endif
2728
2729         return 0;
2730 }
2731
2732 static bool kvm_is_vmx_supported(void)
2733 {
2734         int cpu = raw_smp_processor_id();
2735
2736         if (!cpu_has_vmx()) {
2737                 pr_err("VMX not supported by CPU %d\n", cpu);
2738                 return false;
2739         }
2740
2741         if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2742             !this_cpu_has(X86_FEATURE_VMX)) {
2743                 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
2744                 return false;
2745         }
2746
2747         return true;
2748 }
2749
2750 static int vmx_check_processor_compat(void)
2751 {
2752         int cpu = raw_smp_processor_id();
2753         struct vmcs_config vmcs_conf;
2754         struct vmx_capability vmx_cap;
2755
2756         if (!kvm_is_vmx_supported())
2757                 return -EIO;
2758
2759         if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
2760                 pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
2761                 return -EIO;
2762         }
2763         if (nested)
2764                 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
2765         if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
2766                 pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
2767                 return -EIO;
2768         }
2769         return 0;
2770 }
2771
2772 static int kvm_cpu_vmxon(u64 vmxon_pointer)
2773 {
2774         u64 msr;
2775
2776         cr4_set_bits(X86_CR4_VMXE);
2777
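             /*
              * If VMXON faults (e.g. because VMX isn't properly enabled in
              * MSR_IA32_FEAT_CTL), the exception-table entry below redirects
              * execution to the 'fault' label rather than leaving the fault
              * unhandled.
              */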
2778         asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
2779                           _ASM_EXTABLE(1b, %l[fault])
2780                           : : [vmxon_pointer] "m"(vmxon_pointer)
2781                           : : fault);
2782         return 0;
2783
2784 fault:
2785         WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2786                   rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
2787         cr4_clear_bits(X86_CR4_VMXE);
2788
2789         return -EFAULT;
2790 }
2791
2792 static int vmx_hardware_enable(void)
2793 {
2794         int cpu = raw_smp_processor_id();
2795         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2796         int r;
2797
2798         if (cr4_read_shadow() & X86_CR4_VMXE)
2799                 return -EBUSY;
2800
2801         /*
2802          * This can happen if we hot-added a CPU but failed to allocate
2803          * VP assist page for it.
2804          */
2805         if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
2806                 return -EFAULT;
2807
2808         intel_pt_handle_vmx(1);
2809
2810         r = kvm_cpu_vmxon(phys_addr);
2811         if (r) {
2812                 intel_pt_handle_vmx(0);
2813                 return r;
2814         }
2815
2816         if (enable_ept)
2817                 ept_sync_global();
2818
2819         return 0;
2820 }
2821
2822 static void vmclear_local_loaded_vmcss(void)
2823 {
2824         int cpu = raw_smp_processor_id();
2825         struct loaded_vmcs *v, *n;
2826
2827         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2828                                  loaded_vmcss_on_cpu_link)
2829                 __loaded_vmcs_clear(v);
2830 }
2831
2832 static void vmx_hardware_disable(void)
2833 {
2834         vmclear_local_loaded_vmcss();
2835
2836         if (cpu_vmxoff())
2837                 kvm_spurious_fault();
2838
2839         hv_reset_evmcs();
2840
2841         intel_pt_handle_vmx(0);
2842 }
2843
2844 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
2845 {
2846         int node = cpu_to_node(cpu);
2847         struct page *pages;
2848         struct vmcs *vmcs;
2849
2850         pages = __alloc_pages_node(node, flags, 0);
2851         if (!pages)
2852                 return NULL;
2853         vmcs = page_address(pages);
2854         memset(vmcs, 0, vmcs_config.size);
2855
2856         /* KVM supports Enlightened VMCS v1 only */
2857         if (kvm_is_using_evmcs())
2858                 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2859         else
2860                 vmcs->hdr.revision_id = vmcs_config.revision_id;
2861
2862         if (shadow)
2863                 vmcs->hdr.shadow_vmcs = 1;
2864         return vmcs;
2865 }
2866
2867 void free_vmcs(struct vmcs *vmcs)
2868 {
2869         free_page((unsigned long)vmcs);
2870 }
2871
2872 /*
2873  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2874  */
2875 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2876 {
2877         if (!loaded_vmcs->vmcs)
2878                 return;
2879         loaded_vmcs_clear(loaded_vmcs);
2880         free_vmcs(loaded_vmcs->vmcs);
2881         loaded_vmcs->vmcs = NULL;
2882         if (loaded_vmcs->msr_bitmap)
2883                 free_page((unsigned long)loaded_vmcs->msr_bitmap);
2884         WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2885 }
2886
2887 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2888 {
2889         loaded_vmcs->vmcs = alloc_vmcs(false);
2890         if (!loaded_vmcs->vmcs)
2891                 return -ENOMEM;
2892
2893         vmcs_clear(loaded_vmcs->vmcs);
2894
2895         loaded_vmcs->shadow_vmcs = NULL;
2896         loaded_vmcs->hv_timer_soft_disabled = false;
2897         loaded_vmcs->cpu = -1;
2898         loaded_vmcs->launched = 0;
2899
2900         if (cpu_has_vmx_msr_bitmap()) {
2901                 loaded_vmcs->msr_bitmap = (unsigned long *)
2902                                 __get_free_page(GFP_KERNEL_ACCOUNT);
2903                 if (!loaded_vmcs->msr_bitmap)
2904                         goto out_vmcs;
2905                 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2906         }
2907
2908         memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
2909         memset(&loaded_vmcs->controls_shadow, 0,
2910                 sizeof(struct vmcs_controls_shadow));
2911
2912         return 0;
2913
2914 out_vmcs:
2915         free_loaded_vmcs(loaded_vmcs);
2916         return -ENOMEM;
2917 }
2918
2919 static void free_kvm_area(void)
2920 {
2921         int cpu;
2922
2923         for_each_possible_cpu(cpu) {
2924                 free_vmcs(per_cpu(vmxarea, cpu));
2925                 per_cpu(vmxarea, cpu) = NULL;
2926         }
2927 }
2928
2929 static __init int alloc_kvm_area(void)
2930 {
2931         int cpu;
2932
2933         for_each_possible_cpu(cpu) {
2934                 struct vmcs *vmcs;
2935
2936                 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
2937                 if (!vmcs) {
2938                         free_kvm_area();
2939                         return -ENOMEM;
2940                 }
2941
2942                 /*
2943                  * When eVMCS is enabled, alloc_vmcs_cpu() sets
2944                  * vmcs->revision_id to KVM_EVMCS_VERSION instead of the
2945                  * revision_id reported by MSR_IA32_VMX_BASIC.
2946                  *
2947                  * However, even though not explicitly documented by the
2948                  * TLFS, the vmxarea passed as the VMXON argument should
2949                  * still be marked with the revision_id reported by the
2950                  * physical CPU.
2951                  */
2952                 if (kvm_is_using_evmcs())
2953                         vmcs->hdr.revision_id = vmcs_config.revision_id;
2954
2955                 per_cpu(vmxarea, cpu) = vmcs;
2956         }
2957         return 0;
2958 }
2959
2960 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
2961                 struct kvm_segment *save)
2962 {
2963         if (!emulate_invalid_guest_state) {
2964                 /*
2965                  * CS and SS RPL should be equal during guest entry according
2966                  * to the VMX spec, but in reality that is not always the case.
2967                  * Since the vcpu is in the middle of the transition from real
2968                  * mode to protected mode, it is safe to assume that RPL 0 is a
2969                  * good default value.
2970                  */
2971                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
2972                         save->selector &= ~SEGMENT_RPL_MASK;
2973                 save->dpl = save->selector & SEGMENT_RPL_MASK;
2974                 save->s = 1;
2975         }
2976         __vmx_set_segment(vcpu, save, seg);
2977 }
2978
2979 static void enter_pmode(struct kvm_vcpu *vcpu)
2980 {
2981         unsigned long flags;
2982         struct vcpu_vmx *vmx = to_vmx(vcpu);
2983
2984         /*
2985          * Update the real-mode segment cache. It may not be up-to-date if a
2986          * segment register was written while the vcpu was in guest mode.
2987          */
2988         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2989         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2990         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2991         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2992         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2993         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2994
2995         vmx->rmode.vm86_active = 0;
2996
2997         __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2998
2999         flags = vmcs_readl(GUEST_RFLAGS);
3000         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3001         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3002         vmcs_writel(GUEST_RFLAGS, flags);
3003
3004         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3005                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3006
3007         vmx_update_exception_bitmap(vcpu);
3008
3009         fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3010         fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3011         fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3012         fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3013         fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3014         fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3015 }
3016
3017 static void fix_rmode_seg(int seg, struct kvm_segment *save)
3018 {
3019         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3020         struct kvm_segment var = *save;
3021
3022         var.dpl = 0x3;
3023         if (seg == VCPU_SREG_CS)
3024                 var.type = 0x3;
3025
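             /*
              * When not emulating invalid guest state, make the segment look
              * like a real-mode one: the base is always selector << 4, so
              * derive the selector from the base and force a 64 KiB limit with
              * 16-bit, present, writable attributes as vm86 expects.
              */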
3026         if (!emulate_invalid_guest_state) {
3027                 var.selector = var.base >> 4;
3028                 var.base = var.base & 0xffff0;
3029                 var.limit = 0xffff;
3030                 var.g = 0;
3031                 var.db = 0;
3032                 var.present = 1;
3033                 var.s = 1;
3034                 var.l = 0;
3035                 var.unusable = 0;
3036                 var.type = 0x3;
3037                 var.avl = 0;
3038                 if (save->base & 0xf)
3039                         pr_warn_once("segment base is not paragraph aligned when entering protected mode (seg=%d)",
3040                                      seg);
3041         }
3042
3043         vmcs_write16(sf->selector, var.selector);
3044         vmcs_writel(sf->base, var.base);
3045         vmcs_write32(sf->limit, var.limit);
3046         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3047 }
3048
3049 static void enter_rmode(struct kvm_vcpu *vcpu)
3050 {
3051         unsigned long flags;
3052         struct vcpu_vmx *vmx = to_vmx(vcpu);
3053         struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
3054
3055         /*
3056          * KVM should never use VM86 to virtualize Real Mode when L2 is active,
3057          * as using VM86 is unnecessary if unrestricted guest is enabled, and
3058          * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
3059          * should VM-Fail and KVM should reject userspace attempts to stuff
3060          * CR0.PG=0 when L2 is active.
3061          */
3062         WARN_ON_ONCE(is_guest_mode(vcpu));
3063
3064         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3065         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3066         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3067         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3068         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3069         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3070         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3071
3072         vmx->rmode.vm86_active = 1;
3073
3074         /*
3075          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
3076          * vcpu. Warn the user that an update is overdue.
3077          */
3078         if (!kvm_vmx->tss_addr)
3079                 pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
3080
3081         vmx_segment_cache_clear(vmx);
3082
3083         vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
3084         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
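             /* 0x008b: present, DPL 0, type 11 (busy 32-bit TSS). */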
3085         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3086
3087         flags = vmcs_readl(GUEST_RFLAGS);
3088         vmx->rmode.save_rflags = flags;
3089
3090         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3091
3092         vmcs_writel(GUEST_RFLAGS, flags);
3093         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3094         vmx_update_exception_bitmap(vcpu);
3095
3096         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3097         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3098         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3099         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3100         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3101         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3102 }
3103
3104 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3105 {
3106         struct vcpu_vmx *vmx = to_vmx(vcpu);
3107
3108         /* Nothing to do if hardware doesn't support EFER. */
3109         if (!vmx_find_uret_msr(vmx, MSR_EFER))
3110                 return 0;
3111
3112         vcpu->arch.efer = efer;
3113 #ifdef CONFIG_X86_64
3114         if (efer & EFER_LMA)
3115                 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
3116         else
3117                 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
3118 #else
3119         if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
3120                 return 1;
3121 #endif
3122
3123         vmx_setup_uret_msrs(vmx);
3124         return 0;
3125 }
3126
3127 #ifdef CONFIG_X86_64
3128
3129 static void enter_lmode(struct kvm_vcpu *vcpu)
3130 {
3131         u32 guest_tr_ar;
3132
3133         vmx_segment_cache_clear(to_vmx(vcpu));
3134
3135         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3136         if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
3137                 pr_debug_ratelimited("%s: tss fixup for long mode\n",
3138                                      __func__);
3139                 vmcs_write32(GUEST_TR_AR_BYTES,
3140                              (guest_tr_ar & ~VMX_AR_TYPE_MASK)
3141                              | VMX_AR_TYPE_BUSY_64_TSS);
3142         }
3143         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3144 }
3145
3146 static void exit_lmode(struct kvm_vcpu *vcpu)
3147 {
3148         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3149 }
3150
3151 #endif
3152
3153 static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
3154 {
3155         struct vcpu_vmx *vmx = to_vmx(vcpu);
3156
3157         /*
3158          * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
3159          * the CPU is not required to invalidate guest-physical mappings on
3160          * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
3161          * associated with the root EPT structure and not any particular VPID
3162          * (INVVPID also isn't required to invalidate guest-physical mappings).
3163          */
3164         if (enable_ept) {
3165                 ept_sync_global();
3166         } else if (enable_vpid) {
3167                 if (cpu_has_vmx_invvpid_global()) {
3168                         vpid_sync_vcpu_global();
3169                 } else {
3170                         vpid_sync_vcpu_single(vmx->vpid);
3171                         vpid_sync_vcpu_single(vmx->nested.vpid02);
3172                 }
3173         }
3174 }
3175
3176 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
3177 {
3178         if (is_guest_mode(vcpu))
3179                 return nested_get_vpid02(vcpu);
3180         return to_vmx(vcpu)->vpid;
3181 }
3182
3183 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
3184 {
3185         struct kvm_mmu *mmu = vcpu->arch.mmu;
3186         u64 root_hpa = mmu->root.hpa;
3187
3188         /* No flush required if the current context is invalid. */
3189         if (!VALID_PAGE(root_hpa))
3190                 return;
3191
3192         if (enable_ept)
3193                 ept_sync_context(construct_eptp(vcpu, root_hpa,
3194                                                 mmu->root_role.level));
3195         else
3196                 vpid_sync_context(vmx_get_current_vpid(vcpu));
3197 }
3198
3199 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3200 {
3201         /*
3202          * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
3203          * vmx_flush_tlb_guest() for an explanation of why this is ok.
3204          */
3205         vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
3206 }
3207
3208 static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
3209 {
3210         /*
3211          * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
3212          * vpid couldn't be allocated for this vCPU.  VM-Enter and VM-Exit are
3213          * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
3214          * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
3215          * i.e. no explicit INVVPID is necessary.
3216          */
3217         vpid_sync_context(vmx_get_current_vpid(vcpu));
3218 }
3219
3220 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
3221 {
3222         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3223
3224         if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
3225                 return;
3226
3227         if (is_pae_paging(vcpu)) {
3228                 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3229                 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3230                 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3231                 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3232         }
3233 }
3234
3235 void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3236 {
3237         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3238
3239         if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3240                 return;
3241
3242         mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3243         mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3244         mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3245         mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3246
3247         kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
3248 }
3249
3250 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
3251                           CPU_BASED_CR3_STORE_EXITING)
3252
3253 static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3254 {
3255         if (is_guest_mode(vcpu))
3256                 return nested_guest_cr0_valid(vcpu, cr0);
3257
3258         if (to_vmx(vcpu)->nested.vmxon)
3259                 return nested_host_cr0_valid(vcpu, cr0);
3260
3261         return true;
3262 }
3263
3264 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3265 {
3266         struct vcpu_vmx *vmx = to_vmx(vcpu);
3267         unsigned long hw_cr0, old_cr0_pg;
3268         u32 tmp;
3269
3270         old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
3271
3272         hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3273         if (enable_unrestricted_guest)
3274                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3275         else {
3276                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3277                 if (!enable_ept)
3278                         hw_cr0 |= X86_CR0_WP;
3279
3280                 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3281                         enter_pmode(vcpu);
3282
3283                 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3284                         enter_rmode(vcpu);
3285         }
3286
3287         vmcs_writel(CR0_READ_SHADOW, cr0);
3288         vmcs_writel(GUEST_CR0, hw_cr0);
3289         vcpu->arch.cr0 = cr0;
3290         kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
3291
3292 #ifdef CONFIG_X86_64
3293         if (vcpu->arch.efer & EFER_LME) {
3294                 if (!old_cr0_pg && (cr0 & X86_CR0_PG))
3295                         enter_lmode(vcpu);
3296                 else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
3297                         exit_lmode(vcpu);
3298         }
3299 #endif
3300
3301         if (enable_ept && !enable_unrestricted_guest) {
3302                 /*
3303                  * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
3304                  * the below code _enables_ CR3 exiting, vmx_cache_reg() will
3305                  * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
3306                  * KVM's CR3 is installed.
3307                  */
3308                 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3309                         vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
3310
3311                 /*
3312                  * When running with EPT but not unrestricted guest, KVM must
3313                  * intercept CR3 accesses when paging is _disabled_.  This is
3314                  * necessary because restricted guests can't actually run with
3315                  * paging disabled, and so KVM stuffs its own CR3 in order to
3316                  * run the guest with identity-mapped page tables.
3317                  *
3318                  * Do _NOT_ check the old CR0.PG (e.g. to optimize away the
3319                  * update); it may be stale with respect to CR3 interception,
3320                  * e.g. after nested VM-Enter.
3321                  *
3322                  * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
3323                  * stores to forward them to L1, even if KVM does not need to
3324                  * intercept them to preserve its identity mapped page tables.
3325                  */
3326                 if (!(cr0 & X86_CR0_PG)) {
3327                         exec_controls_setbit(vmx, CR3_EXITING_BITS);
3328                 } else if (!is_guest_mode(vcpu)) {
3329                         exec_controls_clearbit(vmx, CR3_EXITING_BITS);
3330                 } else {
3331                         tmp = exec_controls_get(vmx);
3332                         tmp &= ~CR3_EXITING_BITS;
3333                         tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
3334                         exec_controls_set(vmx, tmp);
3335                 }
3336
3337                 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
3338                 if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
3339                         vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3340
3341                 /*
3342                  * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
3343                  * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
3344                  */
3345                 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
3346                         kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
3347         }
3348
3349         /* depends on vcpu->arch.cr0 to be set to a new value */
3350         vmx->emulation_required = vmx_emulation_required(vcpu);
3351 }
3352
3353 static int vmx_get_max_tdp_level(void)
3354 {
3355         if (cpu_has_vmx_ept_5levels())
3356                 return 5;
3357         return 4;
3358 }
3359
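     /*
      * Build the EPT pointer for the given root: bits 2:0 hold the memory type
      * (write-back), bits 5:3 the page-walk length minus one, bit 6 enables
      * accessed/dirty flags, and the upper bits hold the root table address.
      */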
3360 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
3361 {
3362         u64 eptp = VMX_EPTP_MT_WB;
3363
3364         eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3365
3366         if (enable_ept_ad_bits &&
3367             (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
3368                 eptp |= VMX_EPTP_AD_ENABLE_BIT;
3369         eptp |= root_hpa;
3370
3371         return eptp;
3372 }
3373
3374 static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
3375                              int root_level)
3376 {
3377         struct kvm *kvm = vcpu->kvm;
3378         bool update_guest_cr3 = true;
3379         unsigned long guest_cr3;
3380         u64 eptp;
3381
3382         if (enable_ept) {
3383                 eptp = construct_eptp(vcpu, root_hpa, root_level);
3384                 vmcs_write64(EPT_POINTER, eptp);
3385
3386                 hv_track_root_tdp(vcpu, root_hpa);
3387
3388                 if (!enable_unrestricted_guest && !is_paging(vcpu))
3389                         guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3390                 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
3391                         guest_cr3 = vcpu->arch.cr3;
3392                 else /* vmcs.GUEST_CR3 is already up-to-date. */
3393                         update_guest_cr3 = false;
3394                 vmx_ept_load_pdptrs(vcpu);
3395         } else {
3396                 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
3397         }
3398
3399         if (update_guest_cr3)
3400                 vmcs_writel(GUEST_CR3, guest_cr3);
3401 }
3402
3403
3404 static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3405 {
3406         /*
3407          * We operate under the default treatment of SMM, so VMX cannot be
3408          * enabled under SMM.  Note, whether or not VMXE is allowed at all,
3409          * i.e. is a reserved bit, is handled by common x86 code.
3410          */
3411         if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3412                 return false;
3413
3414         if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3415                 return false;
3416
3417         return true;
3418 }
3419
3420 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3421 {
3422         unsigned long old_cr4 = kvm_read_cr4(vcpu);
3423         struct vcpu_vmx *vmx = to_vmx(vcpu);
3424         unsigned long hw_cr4;
3425
3426         /*
3427          * Pass through host's Machine Check Enable value to hw_cr4, which
3428          * is in force while we are in guest mode.  Do not let guests control
3429          * this bit, even if host CR4.MCE == 0.
3430          */
3431         hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3432         if (enable_unrestricted_guest)
3433                 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3434         else if (vmx->rmode.vm86_active)
3435                 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3436         else
3437                 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
3438
3439         if (vmx_umip_emulated()) {
3440                 if (cr4 & X86_CR4_UMIP) {
3441                         secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3442                         hw_cr4 &= ~X86_CR4_UMIP;
3443                 } else if (!is_guest_mode(vcpu) ||
3444                         !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3445                         secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3446                 }
3447         }
3448
3449         vcpu->arch.cr4 = cr4;
3450         kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
3451
3452         if (!enable_unrestricted_guest) {
3453                 if (enable_ept) {
3454                         if (!is_paging(vcpu)) {
3455                                 hw_cr4 &= ~X86_CR4_PAE;
3456                                 hw_cr4 |= X86_CR4_PSE;
3457                         } else if (!(cr4 & X86_CR4_PAE)) {
3458                                 hw_cr4 &= ~X86_CR4_PAE;
3459                         }
3460                 }
3461
3462                 /*
3463                  * SMEP/SMAP/PKU are disabled by hardware when the CPU is in
3464                  * non-paging mode.  To emulate this behavior, SMEP/SMAP/PKU
3465                  * need to be manually disabled when the guest switches to
3466                  * non-paging mode.
3467                  *
3468                  * If !enable_unrestricted_guest, the CPU is always running
3469                  * with CR0.PG=1 and CR4 needs to be modified.
3470                  * If enable_unrestricted_guest, the CPU automatically
3471                  * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3472                  */
3473                 if (!is_paging(vcpu))
3474                         hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3475         }
3476
3477         vmcs_writel(CR4_READ_SHADOW, cr4);
3478         vmcs_writel(GUEST_CR4, hw_cr4);
3479
3480         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
3481                 kvm_update_cpuid_runtime(vcpu);
3482 }
3483
3484 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3485 {
3486         struct vcpu_vmx *vmx = to_vmx(vcpu);
3487         u32 ar;
3488
3489         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3490                 *var = vmx->rmode.segs[seg];
3491                 if (seg == VCPU_SREG_TR
3492                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3493                         return;
3494                 var->base = vmx_read_guest_seg_base(vmx, seg);
3495                 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3496                 return;
3497         }
3498         var->base = vmx_read_guest_seg_base(vmx, seg);
3499         var->limit = vmx_read_guest_seg_limit(vmx, seg);
3500         var->selector = vmx_read_guest_seg_selector(vmx, seg);
3501         ar = vmx_read_guest_seg_ar(vmx, seg);
3502         var->unusable = (ar >> 16) & 1;
3503         var->type = ar & 15;
3504         var->s = (ar >> 4) & 1;
3505         var->dpl = (ar >> 5) & 3;
3506         /*
3507          * Some userspaces do not preserve the unusable property. Since a
3508          * usable segment has to be present according to the VMX spec, we can
3509          * use the present property to work around the userspace bug by making
3510          * an unusable segment always non-present. vmx_segment_access_rights()
3511          * already marks a non-present segment as unusable.
3512          */
3513         var->present = !var->unusable;
3514         var->avl = (ar >> 12) & 1;
3515         var->l = (ar >> 13) & 1;
3516         var->db = (ar >> 14) & 1;
3517         var->g = (ar >> 15) & 1;
3518 }
3519
3520 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3521 {
3522         struct kvm_segment s;
3523
3524         if (to_vmx(vcpu)->rmode.vm86_active) {
3525                 vmx_get_segment(vcpu, &s, seg);
3526                 return s.base;
3527         }
3528         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3529 }
3530
3531 int vmx_get_cpl(struct kvm_vcpu *vcpu)
3532 {
3533         struct vcpu_vmx *vmx = to_vmx(vcpu);
3534
3535         if (unlikely(vmx->rmode.vm86_active))
3536                 return 0;
3537         else {
3538                 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3539                 return VMX_AR_DPL(ar);
3540         }
3541 }
3542
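     /*
      * Pack a kvm_segment into the VMX access-rights format: bits 3:0 type,
      * bit 4 S, bits 6:5 DPL, bit 7 P, bit 12 AVL, bit 13 L, bit 14 D/B,
      * bit 15 G, and bit 16 "segment unusable".
      */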
3543 static u32 vmx_segment_access_rights(struct kvm_segment *var)
3544 {
3545         u32 ar;
3546
3547         ar = var->type & 15;
3548         ar |= (var->s & 1) << 4;
3549         ar |= (var->dpl & 3) << 5;
3550         ar |= (var->present & 1) << 7;
3551         ar |= (var->avl & 1) << 12;
3552         ar |= (var->l & 1) << 13;
3553         ar |= (var->db & 1) << 14;
3554         ar |= (var->g & 1) << 15;
3555         ar |= (var->unusable || !var->present) << 16;
3556
3557         return ar;
3558 }
3559
3560 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3561 {
3562         struct vcpu_vmx *vmx = to_vmx(vcpu);
3563         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3564
3565         vmx_segment_cache_clear(vmx);
3566
3567         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3568                 vmx->rmode.segs[seg] = *var;
3569                 if (seg == VCPU_SREG_TR)
3570                         vmcs_write16(sf->selector, var->selector);
3571                 else if (var->s)
3572                         fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3573                 return;
3574         }
3575
3576         vmcs_writel(sf->base, var->base);
3577         vmcs_write32(sf->limit, var->limit);
3578         vmcs_write16(sf->selector, var->selector);
3579
3580         /*
3581          * Fix the "Accessed" bit in the AR field of segment registers for
3582          * older qemu binaries.
3583          *
3584          * The IA-32 architecture specifies that at processor reset the
3585          * "Accessed" bit in the AR field of segment registers is 1, but qemu
3586          * used to set it to 0 in userland code. This causes an invalid guest
3587          * state vmexit when "unrestricted guest" mode is turned on. A fix for
3588          * this setup issue in cpu_reset has been pushed to the qemu tree, so
3589          * newer qemu binaries with that fix do not need this kvm hack.
3590          */
3591         if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
3592                 var->type |= 0x1; /* Accessed */
3593
3594         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3595 }
3596
3597 static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3598 {
3599         __vmx_set_segment(vcpu, var, seg);
3600
3601         to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
3602 }
3603
3604 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3605 {
3606         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3607
3608         *db = (ar >> 14) & 1;
3609         *l = (ar >> 13) & 1;
3610 }
3611
3612 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3613 {
3614         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3615         dt->address = vmcs_readl(GUEST_IDTR_BASE);
3616 }
3617
3618 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3619 {
3620         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3621         vmcs_writel(GUEST_IDTR_BASE, dt->address);
3622 }
3623
3624 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3625 {
3626         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3627         dt->address = vmcs_readl(GUEST_GDTR_BASE);
3628 }
3629
3630 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3631 {
3632         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3633         vmcs_writel(GUEST_GDTR_BASE, dt->address);
3634 }
3635
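     /*
      * A segment can be virtualized via vm86 only if it looks like a real-mode
      * segment: base == selector << 4, a 64 KiB limit, and access rights 0xf3
      * (present, DPL 3, writable data, accessed).
      */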
3636 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3637 {
3638         struct kvm_segment var;
3639         u32 ar;
3640
3641         vmx_get_segment(vcpu, &var, seg);
3642         var.dpl = 0x3;
3643         if (seg == VCPU_SREG_CS)
3644                 var.type = 0x3;
3645         ar = vmx_segment_access_rights(&var);
3646
3647         if (var.base != (var.selector << 4))
3648                 return false;
3649         if (var.limit != 0xffff)
3650                 return false;
3651         if (ar != 0xf3)
3652                 return false;
3653
3654         return true;
3655 }
3656
3657 static bool code_segment_valid(struct kvm_vcpu *vcpu)
3658 {
3659         struct kvm_segment cs;
3660         unsigned int cs_rpl;
3661
3662         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3663         cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3664
3665         if (cs.unusable)
3666                 return false;
3667         if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3668                 return false;
3669         if (!cs.s)
3670                 return false;
3671         if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3672                 if (cs.dpl > cs_rpl)
3673                         return false;
3674         } else {
3675                 if (cs.dpl != cs_rpl)
3676                         return false;
3677         }
3678         if (!cs.present)
3679                 return false;
3680
3681         /* TODO: Add a reserved-field check; this will require a new member in the kvm_segment_field structure. */
3682         return true;
3683 }
3684
3685 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3686 {
3687         struct kvm_segment ss;
3688         unsigned int ss_rpl;
3689
3690         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3691         ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3692
3693         if (ss.unusable)
3694                 return true;
3695         if (ss.type != 3 && ss.type != 7)
3696                 return false;
3697         if (!ss.s)
3698                 return false;
3699         if (ss.dpl != ss_rpl) /* DPL != RPL */
3700                 return false;
3701         if (!ss.present)
3702                 return false;
3703
3704         return true;
3705 }
3706
3707 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3708 {
3709         struct kvm_segment var;
3710         unsigned int rpl;
3711
3712         vmx_get_segment(vcpu, &var, seg);
3713         rpl = var.selector & SEGMENT_RPL_MASK;
3714
3715         if (var.unusable)
3716                 return true;
3717         if (!var.s)
3718                 return false;
3719         if (!var.present)
3720                 return false;
3721         if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3722                 if (var.dpl < rpl) /* DPL < RPL */
3723                         return false;
3724         }
3725
3726         /* TODO: Add other members to kvm_segment_field to allow checking
3727          * for other access-rights flags.
3728          */
3729         return true;
3730 }
3731
3732 static bool tr_valid(struct kvm_vcpu *vcpu)
3733 {
3734         struct kvm_segment tr;
3735
3736         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3737
3738         if (tr.unusable)
3739                 return false;
3740         if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
3741                 return false;
3742         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3743                 return false;
3744         if (!tr.present)
3745                 return false;
3746
3747         return true;
3748 }
3749
3750 static bool ldtr_valid(struct kvm_vcpu *vcpu)
3751 {
3752         struct kvm_segment ldtr;
3753
3754         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3755
3756         if (ldtr.unusable)
3757                 return true;
3758         if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
3759                 return false;
3760         if (ldtr.type != 2)
3761                 return false;
3762         if (!ldtr.present)
3763                 return false;
3764
3765         return true;
3766 }
3767
3768 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3769 {
3770         struct kvm_segment cs, ss;
3771
3772         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3773         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3774
3775         return ((cs.selector & SEGMENT_RPL_MASK) ==
3776                  (ss.selector & SEGMENT_RPL_MASK));
3777 }
3778
3779 /*
3780  * Check if the guest state is valid. Returns true if valid, false if
3781  * not.
3782  * We assume that registers are always usable.
3783  */
3784 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
3785 {
3786         /* real mode guest state checks */
3787         if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3788                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3789                         return false;
3790                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3791                         return false;
3792                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3793                         return false;
3794                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3795                         return false;
3796                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3797                         return false;
3798                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3799                         return false;
3800         } else {
3801         /* protected mode guest state checks */
3802                 if (!cs_ss_rpl_check(vcpu))
3803                         return false;
3804                 if (!code_segment_valid(vcpu))
3805                         return false;
3806                 if (!stack_segment_valid(vcpu))
3807                         return false;
3808                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3809                         return false;
3810                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3811                         return false;
3812                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3813                         return false;
3814                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3815                         return false;
3816                 if (!tr_valid(vcpu))
3817                         return false;
3818                 if (!ldtr_valid(vcpu))
3819                         return false;
3820         }
3821         /* TODO:
3822          * - Add checks on RIP
3823          * - Add checks on RFLAGS
3824          */
3825
3826         return true;
3827 }
3828
3829 static int init_rmode_tss(struct kvm *kvm, void __user *ua)
3830 {
3831         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3832         u16 data;
3833         int i;
3834
3835         for (i = 0; i < 3; i++) {
3836                 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
3837                         return -EFAULT;
3838         }
3839
3840         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3841         if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
3842                 return -EFAULT;
3843
3844         data = ~0;
3845         if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
3846                 return -EFAULT;
3847
3848         return 0;
3849 }
3850
3851 static int init_rmode_identity_map(struct kvm *kvm)
3852 {
3853         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
3854         int i, r = 0;
3855         void __user *uaddr;
3856         u32 tmp;
3857
3858         /* Protect kvm_vmx->ept_identity_pagetable_done. */
3859         mutex_lock(&kvm->slots_lock);
3860
3861         if (likely(kvm_vmx->ept_identity_pagetable_done))
3862                 goto out;
3863
3864         if (!kvm_vmx->ept_identity_map_addr)
3865                 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3866
3867         uaddr = __x86_set_memory_region(kvm,
3868                                         IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3869                                         kvm_vmx->ept_identity_map_addr,
3870                                         PAGE_SIZE);
3871         if (IS_ERR(uaddr)) {
3872                 r = PTR_ERR(uaddr);
3873                 goto out;
3874         }
3875
3876         /* Set up identity-mapping pagetable for EPT in real mode */
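             /*
              * The page holds 1024 4 MiB PSE entries, so entry i maps physical
              * address i << 22 and the table identity-maps the low 4 GiB.
              */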
3877         for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
3878                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3879                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3880                 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
3881                         r = -EFAULT;
3882                         goto out;
3883                 }
3884         }
3885         kvm_vmx->ept_identity_pagetable_done = true;
3886
3887 out:
3888         mutex_unlock(&kvm->slots_lock);
3889         return r;
3890 }
3891
3892 static void seg_setup(int seg)
3893 {
3894         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3895         unsigned int ar;
3896
3897         vmcs_write16(sf->selector, 0);
3898         vmcs_writel(sf->base, 0);
3899         vmcs_write32(sf->limit, 0xffff);
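             /* 0x93: present, DPL 0, S=1, type 3 (read/write data, accessed). */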
3900         ar = 0x93;
3901         if (seg == VCPU_SREG_CS)
3902                 ar |= 0x08; /* code segment */
3903
3904         vmcs_write32(sf->ar_bytes, ar);
3905 }
3906
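     /*
      * VPID 0 is reserved, so returning 0 here means "no VPID allocated" and
      * the vCPU simply runs without VPID-based TLB tagging.
      */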
3907 int allocate_vpid(void)
3908 {
3909         int vpid;
3910
3911         if (!enable_vpid)
3912                 return 0;
3913         spin_lock(&vmx_vpid_lock);
3914         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
3915         if (vpid < VMX_NR_VPIDS)
3916                 __set_bit(vpid, vmx_vpid_bitmap);
3917         else
3918                 vpid = 0;
3919         spin_unlock(&vmx_vpid_lock);
3920         return vpid;
3921 }
3922
3923 void free_vpid(int vpid)
3924 {
3925         if (!enable_vpid || vpid == 0)
3926                 return;
3927         spin_lock(&vmx_vpid_lock);
3928         __clear_bit(vpid, vmx_vpid_bitmap);
3929         spin_unlock(&vmx_vpid_lock);
3930 }
3931
3932 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
3933 {
3934         /*
3935          * When KVM is a nested hypervisor on top of Hyper-V and uses the
3936          * 'Enlightened MSR Bitmap' feature, L0 needs to know that the MSR
3937          * bitmap has changed.
3938          */
3939         if (kvm_is_using_evmcs()) {
3940                 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
3941
3942                 if (evmcs->hv_enlightenments_control.msr_bitmap)
3943                         evmcs->hv_clean_fields &=
3944                                 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
3945         }
3946
3947         vmx->nested.force_msr_bitmap_recalc = true;
3948 }
3949
3950 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
3951 {
3952         struct vcpu_vmx *vmx = to_vmx(vcpu);
3953         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3954
3955         if (!cpu_has_vmx_msr_bitmap())
3956                 return;
3957
3958         vmx_msr_bitmap_l01_changed(vmx);
3959
3960         /*
3961          * Mark the desired intercept state in the shadow bitmap; this is
3962          * needed for resync when the MSR filters change.
3963          */
3964         if (is_valid_passthrough_msr(msr)) {
3965                 int idx = possible_passthrough_msr_slot(msr);
3966
3967                 if (idx != -ENOENT) {
3968                         if (type & MSR_TYPE_R)
3969                                 clear_bit(idx, vmx->shadow_msr_intercept.read);
3970                         if (type & MSR_TYPE_W)
3971                                 clear_bit(idx, vmx->shadow_msr_intercept.write);
3972                 }
3973         }
3974
3975         if ((type & MSR_TYPE_R) &&
3976             !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
3977                 vmx_set_msr_bitmap_read(msr_bitmap, msr);
3978                 type &= ~MSR_TYPE_R;
3979         }
3980
3981         if ((type & MSR_TYPE_W) &&
3982             !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
3983                 vmx_set_msr_bitmap_write(msr_bitmap, msr);
3984                 type &= ~MSR_TYPE_W;
3985         }
3986
3987         if (type & MSR_TYPE_R)
3988                 vmx_clear_msr_bitmap_read(msr_bitmap, msr);
3989
3990         if (type & MSR_TYPE_W)
3991                 vmx_clear_msr_bitmap_write(msr_bitmap, msr);
3992 }
3993
3994 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
3995 {
3996         struct vcpu_vmx *vmx = to_vmx(vcpu);
3997         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3998
3999         if (!cpu_has_vmx_msr_bitmap())
4000                 return;
4001
4002         vmx_msr_bitmap_l01_changed(vmx);
4003
4004         /*
4005          * Mark the desired intercept state in the shadow bitmap; this is
4006          * needed for resync when the MSR filters change.
4007          */
4008         if (is_valid_passthrough_msr(msr)) {
4009                 int idx = possible_passthrough_msr_slot(msr);
4010
4011                 if (idx != -ENOENT) {
4012                         if (type & MSR_TYPE_R)
4013                                 set_bit(idx, vmx->shadow_msr_intercept.read);
4014                         if (type & MSR_TYPE_W)
4015                                 set_bit(idx, vmx->shadow_msr_intercept.write);
4016                 }
4017         }
4018
4019         if (type & MSR_TYPE_R)
4020                 vmx_set_msr_bitmap_read(msr_bitmap, msr);
4021
4022         if (type & MSR_TYPE_W)
4023                 vmx_set_msr_bitmap_write(msr_bitmap, msr);
4024 }
4025
4026 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
4027 {
4028         /*
4029          * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
4030          * of the MSR bitmap.  KVM emulates APIC registers up through 0x3f0,
4031          * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
4032          */
4033         const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
4034         const int write_idx = read_idx + (0x800 / sizeof(u64));
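             /*
              * With APIC_BASE_MSR == 0x800 this gives read_idx = 32 (the qword
              * covering MSRs 0x800-0x83f in the low-MSR read bitmap at page
              * offset 0) and write_idx = 288 (the same range in the low-MSR
              * write bitmap, which starts at byte offset 2048).
              */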
4035         struct vcpu_vmx *vmx = to_vmx(vcpu);
4036         u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
4037         u8 mode;
4038
4039         if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
4040                 return;
4041
4042         if (cpu_has_secondary_exec_ctrls() &&
4043             (secondary_exec_controls_get(vmx) &
4044              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
4045                 mode = MSR_BITMAP_MODE_X2APIC;
4046                 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
4047                         mode |= MSR_BITMAP_MODE_X2APIC_APICV;
4048         } else {
4049                 mode = 0;
4050         }
4051
4052         if (mode == vmx->x2apic_msr_bitmap_mode)
4053                 return;
4054
4055         vmx->x2apic_msr_bitmap_mode = mode;
4056
4057         /*
4058          * Reset the bitmap for MSRs 0x800 - 0x83f.  Leave AMD's uber-extended
4059          * registers (0x840 and above) intercepted; KVM doesn't support them.
4060          * Intercept all writes by default and poke holes as needed.  Pass
4061          * through reads for all valid registers by default in x2APIC+APICv
4062          * mode, only the current timer count needs on-demand emulation by KVM.
4063          */
4064         if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
4065                 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
4066         else
4067                 msr_bitmap[read_idx] = ~0ull;
4068         msr_bitmap[write_idx] = ~0ull;
4069
4070         /*
4071          * TPR reads and writes can be virtualized even if virtual interrupt
4072          * delivery is not in use.
4073          */
4074         vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
4075                                   !(mode & MSR_BITMAP_MODE_X2APIC));
4076
4077         if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
4078                 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
4079                 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
4080                 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
4081                 if (enable_ipiv)
4082                         vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
4083         }
4084 }
4085
4086 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
4087 {
4088         struct vcpu_vmx *vmx = to_vmx(vcpu);
4089         bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4090         u32 i;
4091
4092         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
4093         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
4094         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
4095         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
4096         for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
4097                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
4098                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
4099         }
4100 }
4101
4102 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
4103 {
4104         struct vcpu_vmx *vmx = to_vmx(vcpu);
4105         void *vapic_page;
4106         u32 vppr;
4107         int rvi;
4108
4109         if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
4110                 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
4111                 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
4112                 return false;
4113
4114         rvi = vmx_get_rvi();
4115
4116         vapic_page = vmx->nested.virtual_apic_map.hva;
4117         vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
4118
4119         return ((rvi & 0xf0) > (vppr & 0xf0));
4120 }
4121
4122 static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
4123 {
4124         struct vcpu_vmx *vmx = to_vmx(vcpu);
4125         u32 i;
4126
4127         /*
4128          * Redo intercept permissions for MSRs that KVM is passing through to
4129          * the guest.  Disabling interception will check the new MSR filter and
4130          * ensure that KVM enables interception if userspace wants to filter
4131          * the MSR.  MSRs that KVM is already intercepting don't need to be
4132          * refreshed since KVM is going to intercept them regardless of what
4133          * userspace wants.
4134          */
4135         for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
4136                 u32 msr = vmx_possible_passthrough_msrs[i];
4137
4138                 if (!test_bit(i, vmx->shadow_msr_intercept.read))
4139                         vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
4140
4141                 if (!test_bit(i, vmx->shadow_msr_intercept.write))
4142                         vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
4143         }
4144
4145         /* PT MSRs can be passed through iff PT is exposed to the guest. */
4146         if (vmx_pt_mode_is_host_guest())
4147                 pt_update_intercept_for_msr(vcpu);
4148 }
4149
4150 static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
4151                                                      int pi_vec)
4152 {
4153 #ifdef CONFIG_SMP
4154         if (vcpu->mode == IN_GUEST_MODE) {
4155                 /*
4156                  * The vector of the virtual interrupt has already been set in the PIR.
4157                  * Send a notification event to deliver the virtual interrupt
4158                  * unless the vCPU is the currently running vCPU, i.e. the
4159                  * event is being sent from a fastpath VM-Exit handler, in
4160                  * which case the PIR will be synced to the vIRR before
4161                  * re-entering the guest.
4162                  *
4163                  * When the target is not the running vCPU, the following
4164                  * possibilities emerge:
4165                  *
4166                  * Case 1: vCPU stays in non-root mode. Sending a notification
4167                  * event posts the interrupt to the vCPU.
4168                  *
4169                  * Case 2: vCPU exits to root mode and is still runnable. The
4170                  * PIR will be synced to the vIRR before re-entering the guest.
4171                  * Sending a notification event is ok as the host IRQ handler
4172                  * will ignore the spurious event.
4173                  *
4174                  * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
4175                  * has already synced PIR to vIRR and never blocks the vCPU if
4176                  * the vIRR is not empty. Therefore, a blocked vCPU here does
4177                  * not wait for any requested interrupts in PIR, and sending a
4178                  * notification event also results in a benign, spurious event.
4179                  */
4180
4181                 if (vcpu != kvm_get_running_vcpu())
4182                         apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
4183                 return;
4184         }
4185 #endif
4186         /*
4187          * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
4188          * otherwise do nothing as KVM will grab the highest priority pending
4189          * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
4190          */
4191         kvm_vcpu_wake_up(vcpu);
4192 }
4193
4194 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4195                                                 int vector)
4196 {
4197         struct vcpu_vmx *vmx = to_vmx(vcpu);
4198
4199         if (is_guest_mode(vcpu) &&
4200             vector == vmx->nested.posted_intr_nv) {
4201                 /*
4202                  * If the posted interrupt is not recognized by hardware,
4203                  * it will be delivered on the next vmentry.
4204                  */
4205                 vmx->nested.pi_pending = true;
4206                 kvm_make_request(KVM_REQ_EVENT, vcpu);
4207
4208                 /*
4209                  * This pairs with the smp_mb_*() after setting vcpu->mode in
4210                  * vcpu_enter_guest() to guarantee the vCPU sees the event
4211                  * request if triggering a posted interrupt "fails" because
4212                  * vcpu->mode != IN_GUEST_MODE.  The extra barrier is needed as
4213                  * the smp_wmb() in kvm_make_request() only ensures everything
4214                  * done before making the request is visible when the request
4215                  * is visible, it doesn't ensure ordering between the store to
4216                  * vcpu->requests and the load from vcpu->mode.
4217                  */
4218                 smp_mb__after_atomic();
4219
4220                 /* the PIR and ON have been set by L1. */
4221                 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
4222                 return 0;
4223         }
4224         return -1;
4225 }
4226 /*
4227  * Send an interrupt to the vCPU via posted interrupt.
4228  * 1. If the target vCPU is running (non-root mode), send a posted interrupt
4229  * notification and hardware will sync the PIR to the vIRR atomically.
4230  * 2. If the target vCPU isn't running (root mode), kick it to pick up the
4231  * interrupt from the PIR on the next VM-Entry.
4232  */
4233 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4234 {
4235         struct vcpu_vmx *vmx = to_vmx(vcpu);
4236         int r;
4237
4238         r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4239         if (!r)
4240                 return 0;
4241
4242         /* Note, this is called iff the local APIC is in-kernel. */
4243         if (!vcpu->arch.apic->apicv_active)
4244                 return -1;
4245
4246         if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4247                 return 0;
4248
4249         /* If a previous notification has sent the IPI, nothing to do.  */
4250         if (pi_test_and_set_on(&vmx->pi_desc))
4251                 return 0;
4252
4253         /*
4254          * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
4255          * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
4256          * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
4257          * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
4258          */
4259         kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
4260         return 0;
4261 }
4262
4263 static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
4264                                   int trig_mode, int vector)
4265 {
4266         struct kvm_vcpu *vcpu = apic->vcpu;
4267
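        /*
         * Try posted-interrupt delivery first; if it isn't possible,
         * e.g. because APICv is inactive, fall back to setting the
         * vIRR directly and kicking the vCPU to process the interrupt.
         */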
4268         if (vmx_deliver_posted_interrupt(vcpu, vector)) {
4269                 kvm_lapic_set_irr(vector, apic);
4270                 kvm_make_request(KVM_REQ_EVENT, vcpu);
4271                 kvm_vcpu_kick(vcpu);
4272         } else {
4273                 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
4274                                            trig_mode, vector);
4275         }
4276 }
4277
4278 /*
4279  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4280  * will not change in the lifetime of the guest.
4281  * Note that host-state that does change is set elsewhere. E.g., host-state
4282  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4283  */
4284 void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4285 {
4286         u32 low32, high32;
4287         unsigned long tmpl;
4288         unsigned long cr0, cr3, cr4;
4289
4290         cr0 = read_cr0();
4291         WARN_ON(cr0 & X86_CR0_TS);
4292         vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
4293
4294         /*
4295          * Save the most likely value for this task's CR3 in the VMCS.
4296          * We can't use __get_current_cr3_fast() because we're not atomic.
4297          */
4298         cr3 = __read_cr3();
4299         vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
4300         vmx->loaded_vmcs->host_state.cr3 = cr3;
4301
4302         /* Save the most likely value for this task's CR4 in the VMCS. */
4303         cr4 = cr4_read_shadow();
4304         vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
4305         vmx->loaded_vmcs->host_state.cr4 = cr4;
4306
4307         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
4308 #ifdef CONFIG_X86_64
4309         /*
4310          * Load null selectors, so we can avoid reloading them in
4311          * vmx_prepare_switch_to_host(), in case userspace uses
4312          * the null selectors too (the expected case).
4313          */
4314         vmcs_write16(HOST_DS_SELECTOR, 0);
4315         vmcs_write16(HOST_ES_SELECTOR, 0);
4316 #else
4317         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4318         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4319 #endif
4320         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4321         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
4322
4323         vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */
4324
4325         vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
4326
4327         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4328         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4329
4330         /*
4331          * SYSENTER is used for 32-bit system calls on either 32-bit or
4332          * 64-bit kernels.  It is always zero if neither is allowed, otherwise
4333          * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
4334          * have already done so!).
4335          */
4336         if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
4337                 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
4338
4339         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4340         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
4341
4342         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4343                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
4344                 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4345         }
4346
4347         if (cpu_has_load_ia32_efer())
4348                 vmcs_write64(HOST_IA32_EFER, host_efer);
4349 }
4350
4351 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4352 {
4353         struct kvm_vcpu *vcpu = &vmx->vcpu;
4354
4355         vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
4356                                           ~vcpu->arch.cr4_guest_rsvd_bits;
4357         if (!enable_ept) {
4358                 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
4359                 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
4360         }
4361         if (is_guest_mode(&vmx->vcpu))
4362                 vcpu->arch.cr4_guest_owned_bits &=
4363                         ~get_vmcs12(vcpu)->cr4_guest_host_mask;
4364         vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
4365 }
4366
4367 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4368 {
4369         u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4370
4371         if (!kvm_vcpu_apicv_active(&vmx->vcpu))
4372                 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4373
4374         if (!enable_vnmi)
4375                 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4376
4377         if (!enable_preemption_timer)
4378                 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4379
4380         return pin_based_exec_ctrl;
4381 }
4382
4383 static u32 vmx_vmentry_ctrl(void)
4384 {
4385         u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
4386
4387         if (vmx_pt_mode_is_system())
4388                 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
4389                                   VM_ENTRY_LOAD_IA32_RTIT_CTL);
4390         /*
4391          * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
4392          */
4393         vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
4394                           VM_ENTRY_LOAD_IA32_EFER |
4395                           VM_ENTRY_IA32E_MODE);
4396
4397         if (cpu_has_perf_global_ctrl_bug())
4398                 vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4399
4400         return vmentry_ctrl;
4401 }
4402
4403 static u32 vmx_vmexit_ctrl(void)
4404 {
4405         u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
4406
4407         /*
4408          * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
4409          * nested virtualization and thus allowed to be set in vmcs12.
4410          */
4411         vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
4412                          VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
4413
4414         if (vmx_pt_mode_is_system())
4415                 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
4416                                  VM_EXIT_CLEAR_IA32_RTIT_CTL);
4417
4418         if (cpu_has_perf_global_ctrl_bug())
4419                 vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4420
4421         /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
4422         return vmexit_ctrl &
4423                 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
4424 }
4425
4426 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4427 {
4428         struct vcpu_vmx *vmx = to_vmx(vcpu);
4429
4430         if (is_guest_mode(vcpu)) {
4431                 vmx->nested.update_vmcs01_apicv_status = true;
4432                 return;
4433         }
4434
4435         pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4436
4437         if (kvm_vcpu_apicv_active(vcpu)) {
4438                 secondary_exec_controls_setbit(vmx,
4439                                                SECONDARY_EXEC_APIC_REGISTER_VIRT |
4440                                                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4441                 if (enable_ipiv)
4442                         tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4443         } else {
4444                 secondary_exec_controls_clearbit(vmx,
4445                                                  SECONDARY_EXEC_APIC_REGISTER_VIRT |
4446                                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4447                 if (enable_ipiv)
4448                         tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4449         }
4450
4451         vmx_update_msr_bitmap_x2apic(vcpu);
4452 }
4453
4454 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4455 {
4456         u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4457
4458         /*
4459          * Not used by KVM, but fully supported for nesting, i.e. they are allowed in
4460          * vmcs12 and propagated to vmcs02 when set in vmcs12.
4461          */
4462         exec_control &= ~(CPU_BASED_RDTSC_EXITING |
4463                           CPU_BASED_USE_IO_BITMAPS |
4464                           CPU_BASED_MONITOR_TRAP_FLAG |
4465                           CPU_BASED_PAUSE_EXITING);
4466
4467         /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
4468         exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
4469                           CPU_BASED_NMI_WINDOW_EXITING);
4470
4471         if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4472                 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4473
4474         if (!cpu_need_tpr_shadow(&vmx->vcpu))
4475                 exec_control &= ~CPU_BASED_TPR_SHADOW;
4476
4477 #ifdef CONFIG_X86_64
4478         if (exec_control & CPU_BASED_TPR_SHADOW)
4479                 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
4480                                   CPU_BASED_CR8_STORE_EXITING);
4481         else
4482                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
4483                                 CPU_BASED_CR8_LOAD_EXITING;
4484 #endif
4485         /* No need to intercept CR3 access or INVLPG when using EPT. */
4486         if (enable_ept)
4487                 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4488                                   CPU_BASED_CR3_STORE_EXITING |
4489                                   CPU_BASED_INVLPG_EXITING);
4490         if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4491                 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4492                                 CPU_BASED_MONITOR_EXITING);
4493         if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4494                 exec_control &= ~CPU_BASED_HLT_EXITING;
4495         return exec_control;
4496 }
4497
4498 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
4499 {
4500         u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
4501
4502         /*
4503          * IPI virtualization relies on APICv. Disable IPI virtualization if
4504          * APICv is inhibited.
4505          */
4506         if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
4507                 exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
4508
4509         return exec_control;
4510 }
4511
4512 /*
4513  * Adjust a single secondary execution control bit to intercept/allow an
4514  * instruction in the guest.  This is usually done based on whether or not a
4515  * feature has been exposed to the guest in order to correctly emulate faults.
4516  */
4517 static inline void
4518 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
4519                                   u32 control, bool enabled, bool exiting)
4520 {
4521         /*
4522          * If the control is for an opt-in feature, clear the control if the
4523          * feature is not exposed to the guest, i.e. not enabled.  If the
4524          * control is opt-out, i.e. an exiting control, clear the control if
4525          * the feature _is_ exposed to the guest, i.e. exiting/interception is
4526          * disabled for the associated instruction.  Note, the caller is
4527          * responsible presetting exec_control to set all supported bits.
4528          */
4529         if (enabled == exiting)
4530                 *exec_control &= ~control;
4531
4532         /*
4533          * Update the nested MSR settings so that a nested VMM can/can't set
4534          * controls for features that are/aren't exposed to the guest.
4535          */
4536         if (nested) {
4537                 /*
4538                  * All features that can be added to or removed from the VMX MSRs must
4539                  * be supported in the first place for nested virtualization.
4540                  */
4541                 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
4542                         enabled = false;
4543
4544                 if (enabled)
4545                         vmx->nested.msrs.secondary_ctls_high |= control;
4546                 else
4547                         vmx->nested.msrs.secondary_ctls_high &= ~control;
4548         }
4549 }
4550
4551 /*
4552  * Wrapper macro for the common case of adjusting a secondary execution control
4553  * based on a single guest CPUID bit, with a dedicated feature bit.  This also
4554  * verifies that the control is actually supported by KVM and hardware.
4555  */
4556 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
4557 ({                                                                       \
4558         bool __enabled;                                                  \
4559                                                                          \
4560         if (cpu_has_vmx_##name()) {                                      \
4561                 __enabled = guest_cpuid_has(&(vmx)->vcpu,                \
4562                                             X86_FEATURE_##feat_name);    \
4563                 vmx_adjust_secondary_exec_control(vmx, exec_control,     \
4564                         SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
4565         }                                                                \
4566 })
4567
4568 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
4569 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
4570         vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
4571
4572 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
4573         vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
4574
4575 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4576 {
4577         struct kvm_vcpu *vcpu = &vmx->vcpu;
4578
4579         u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4580
4581         if (vmx_pt_mode_is_system())
4582                 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
4583         if (!cpu_need_virtualize_apic_accesses(vcpu))
4584                 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4585         if (vmx->vpid == 0)
4586                 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4587         if (!enable_ept) {
4588                 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4589                 enable_unrestricted_guest = 0;
4590         }
4591         if (!enable_unrestricted_guest)
4592                 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4593         if (kvm_pause_in_guest(vmx->vcpu.kvm))
4594                 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4595         if (!kvm_vcpu_apicv_active(vcpu))
4596                 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4597                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4598         exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4599
4600         /*
4601          * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
4602          * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
4603          */
4604         exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
4605
4606         /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4607          * in vmx_set_cr4.  */
4608         exec_control &= ~SECONDARY_EXEC_DESC;
4609
4610         /*
4611          * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4612          * (handle_vmptrld).  Shadow VMCS can NOT be enabled here because
4613          * there is not yet a current vmcs12.
4614          */
4615         exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4616
4617         /*
4618          * PML is enabled/disabled when dirty logging of memslots changes, but
4619          * it needs to be set here when dirty logging is already active, e.g.
4620          * if this vCPU was created after dirty logging was enabled.
4621          */
4622         if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
4623                 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4624
4625         if (cpu_has_vmx_xsaves()) {
4626                 /* Exposing XSAVES only when XSAVE is exposed */
4627                 bool xsaves_enabled =
4628                         boot_cpu_has(X86_FEATURE_XSAVE) &&
4629                         guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4630                         guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
4631
4632                 vcpu->arch.xsaves_enabled = xsaves_enabled;
4633
4634                 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4635                                                   SECONDARY_EXEC_XSAVES,
4636                                                   xsaves_enabled, false);
4637         }
4638
4639         /*
4640          * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
4641          * feature is exposed to the guest.  This creates a virtualization hole
4642          * if both are supported in hardware but only one is exposed to the
4643          * guest, but letting the guest execute RDTSCP or RDPID when either one
4644          * is advertised is preferable to emulating the advertised instruction
4645          * in KVM on #UD, and obviously better than incorrectly injecting #UD.
4646          */
4647         if (cpu_has_vmx_rdtscp()) {
4648                 bool rdpid_or_rdtscp_enabled =
4649                         guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
4650                         guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
4651
4652                 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4653                                                   SECONDARY_EXEC_ENABLE_RDTSCP,
4654                                                   rdpid_or_rdtscp_enabled, false);
4655         }
4656         vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
4657
4658         vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
4659         vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
4660
4661         vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
4662                                     ENABLE_USR_WAIT_PAUSE, false);
4663
4664         if (!vcpu->kvm->arch.bus_lock_detection_enabled)
4665                 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
4666
4667         if (!kvm_notify_vmexit_enabled(vcpu->kvm))
4668                 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
4669
4670         return exec_control;
4671 }
4672
4673 static inline int vmx_get_pid_table_order(struct kvm *kvm)
4674 {
4675         return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
4676 }
4677
4678 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
4679 {
4680         struct page *pages;
4681         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4682
4683         if (!irqchip_in_kernel(kvm) || !enable_ipiv)
4684                 return 0;
4685
4686         if (kvm_vmx->pid_table)
4687                 return 0;
4688
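        /*
         * Allocate a zeroed, physically contiguous table holding one
         * posted-interrupt descriptor pointer per possible vCPU ID.
         */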
4689         pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
4690                             vmx_get_pid_table_order(kvm));
4691         if (!pages)
4692                 return -ENOMEM;
4693
4694         kvm_vmx->pid_table = (void *)page_address(pages);
4695         return 0;
4696 }
4697
4698 static int vmx_vcpu_precreate(struct kvm *kvm)
4699 {
4700         return vmx_alloc_ipiv_pid_table(kvm);
4701 }
4702
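/* An all-zero XSS-exiting bitmap: XSAVES/XRSTORS never exit based on XSS state. */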
4703 #define VMX_XSS_EXIT_BITMAP 0
4704
4705 static void init_vmcs(struct vcpu_vmx *vmx)
4706 {
4707         struct kvm *kvm = vmx->vcpu.kvm;
4708         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4709
4710         if (nested)
4711                 nested_vmx_set_vmcs_shadowing_bitmap();
4712
4713         if (cpu_has_vmx_msr_bitmap())
4714                 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
4715
4716         vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
4717
4718         /* Control */
4719         pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4720
4721         exec_controls_set(vmx, vmx_exec_control(vmx));
4722
4723         if (cpu_has_secondary_exec_ctrls())
4724                 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
4725
4726         if (cpu_has_tertiary_exec_ctrls())
4727                 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
4728
4729         if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
4730                 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4731                 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4732                 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4733                 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4734
4735                 vmcs_write16(GUEST_INTR_STATUS, 0);
4736
4737                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4738                 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4739         }
4740
4741         if (vmx_can_use_ipiv(&vmx->vcpu)) {
4742                 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
4743                 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
4744         }
4745
4746         if (!kvm_pause_in_guest(kvm)) {
4747                 vmcs_write32(PLE_GAP, ple_gap);
4748                 vmx->ple_window = ple_window;
4749                 vmx->ple_window_dirty = true;
4750         }
4751
4752         if (kvm_notify_vmexit_enabled(kvm))
4753                 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
4754
4755         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4756         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
4757         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
4758
4759         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
4760         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
4761         vmx_set_constant_host_state(vmx);
4762         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4763         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
4764
4765         if (cpu_has_vmx_vmfunc())
4766                 vmcs_write64(VM_FUNCTION_CONTROL, 0);
4767
4768         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4769         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
4770         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
4771         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4772         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
4773
4774         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4775                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
4776
4777         vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
4778
4779         /* 22.2.1, 20.8.1 */
4780         vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
4781
4782         vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
4783         vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
4784
4785         set_cr4_guest_host_mask(vmx);
4786
4787         if (vmx->vpid != 0)
4788                 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4789
4790         if (cpu_has_vmx_xsaves())
4791                 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4792
4793         if (enable_pml) {
4794                 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
4795                 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
4796         }
4797
4798         vmx_write_encls_bitmap(&vmx->vcpu, NULL);
4799
4800         if (vmx_pt_mode_is_host_guest()) {
4801                 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
4802                 /* Bits[6:0] are forced to 1, writes are ignored. */
4803                 vmx->pt_desc.guest.output_mask = 0x7F;
4804                 vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
4805         }
4806
4807         vmcs_write32(GUEST_SYSENTER_CS, 0);
4808         vmcs_writel(GUEST_SYSENTER_ESP, 0);
4809         vmcs_writel(GUEST_SYSENTER_EIP, 0);
4810         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4811
4812         if (cpu_has_vmx_tpr_shadow()) {
4813                 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4814                 if (cpu_need_tpr_shadow(&vmx->vcpu))
4815                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4816                                      __pa(vmx->vcpu.arch.apic->regs));
4817                 vmcs_write32(TPR_THRESHOLD, 0);
4818         }
4819
4820         vmx_setup_uret_msrs(vmx);
4821 }
4822
4823 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4824 {
4825         struct vcpu_vmx *vmx = to_vmx(vcpu);
4826
4827         init_vmcs(vmx);
4828
4829         if (nested)
4830                 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
4831
4832         vcpu_setup_sgx_lepubkeyhash(vcpu);
4833
4834         vmx->nested.posted_intr_nv = -1;
4835         vmx->nested.vmxon_ptr = INVALID_GPA;
4836         vmx->nested.current_vmptr = INVALID_GPA;
4837         vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
4838
4839         vcpu->arch.microcode_version = 0x100000000ULL;
4840         vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
4841
4842         /*
4843          * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
4844          * or POSTED_INTR_WAKEUP_VECTOR.
4845          */
4846         vmx->pi_desc.nv = POSTED_INTR_VECTOR;
4847         vmx->pi_desc.sn = 1;
4848 }
4849
4850 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4851 {
4852         struct vcpu_vmx *vmx = to_vmx(vcpu);
4853
4854         if (!init_event)
4855                 __vmx_vcpu_reset(vcpu);
4856
4857         vmx->rmode.vm86_active = 0;
4858         vmx->spec_ctrl = 0;
4859
4860         vmx->msr_ia32_umwait_control = 0;
4861
4862         vmx->hv_deadline_tsc = -1;
4863         kvm_set_cr8(vcpu, 0);
4864
4865         vmx_segment_cache_clear(vmx);
4866         kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
4867
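        /*
         * Per the architectural reset state, CS starts with selector
         * 0xf000 and base 0xffff0000 so the first instruction fetch is
         * at physical address 0xfffffff0.
         */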
4868         seg_setup(VCPU_SREG_CS);
4869         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4870         vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
4871
4872         seg_setup(VCPU_SREG_DS);
4873         seg_setup(VCPU_SREG_ES);
4874         seg_setup(VCPU_SREG_FS);
4875         seg_setup(VCPU_SREG_GS);
4876         seg_setup(VCPU_SREG_SS);
4877
4878         vmcs_write16(GUEST_TR_SELECTOR, 0);
4879         vmcs_writel(GUEST_TR_BASE, 0);
4880         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4881         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4882
4883         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4884         vmcs_writel(GUEST_LDTR_BASE, 0);
4885         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4886         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4887
4888         vmcs_writel(GUEST_GDTR_BASE, 0);
4889         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4890
4891         vmcs_writel(GUEST_IDTR_BASE, 0);
4892         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
4893
4894         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
4895         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
4896         vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
4897         if (kvm_mpx_supported())
4898                 vmcs_write64(GUEST_BNDCFGS, 0);
4899
4900         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
4901
4902         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4903
4904         vpid_sync_context(vmx->vpid);
4905
4906         vmx_update_fb_clear_dis(vcpu, vmx);
4907 }
4908
4909 static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
4910 {
4911         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
4912 }
4913
4914 static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
4915 {
4916         if (!enable_vnmi ||
4917             vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4918                 vmx_enable_irq_window(vcpu);
4919                 return;
4920         }
4921
4922         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
4923 }
4924
4925 static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
4926 {
4927         struct vcpu_vmx *vmx = to_vmx(vcpu);
4928         uint32_t intr;
4929         int irq = vcpu->arch.interrupt.nr;
4930
4931         trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
4932
4933         ++vcpu->stat.irq_injections;
4934         if (vmx->rmode.vm86_active) {
4935                 int inc_eip = 0;
4936                 if (vcpu->arch.interrupt.soft)
4937                         inc_eip = vcpu->arch.event_exit_inst_len;
4938                 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
4939                 return;
4940         }
4941         intr = irq | INTR_INFO_VALID_MASK;
4942         if (vcpu->arch.interrupt.soft) {
4943                 intr |= INTR_TYPE_SOFT_INTR;
4944                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4945                              vmx->vcpu.arch.event_exit_inst_len);
4946         } else
4947                 intr |= INTR_TYPE_EXT_INTR;
4948         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4949
4950         vmx_clear_hlt(vcpu);
4951 }
4952
4953 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4954 {
4955         struct vcpu_vmx *vmx = to_vmx(vcpu);
4956
4957         if (!enable_vnmi) {
4958                 /*
4959                  * Tracking the NMI-blocked state in software is built upon
4960                  * finding the next open IRQ window. This, in turn, depends on
4961                  * well-behaving guests: They have to keep IRQs disabled at
4962                  * least as long as the NMI handler runs. Otherwise we may
4963                  * cause NMI nesting, maybe breaking the guest. But as this is
4964                  * highly unlikely, we can live with the residual risk.
4965                  */
4966                 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
4967                 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4968         }
4969
4970         ++vcpu->stat.nmi_injections;
4971         vmx->loaded_vmcs->nmi_known_unmasked = false;
4972
4973         if (vmx->rmode.vm86_active) {
4974                 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
4975                 return;
4976         }
4977
4978         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4979                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4980
4981         vmx_clear_hlt(vcpu);
4982 }
4983
4984 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4985 {
4986         struct vcpu_vmx *vmx = to_vmx(vcpu);
4987         bool masked;
4988
4989         if (!enable_vnmi)
4990                 return vmx->loaded_vmcs->soft_vnmi_blocked;
4991         if (vmx->loaded_vmcs->nmi_known_unmasked)
4992                 return false;
4993         masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4994         vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4995         return masked;
4996 }
4997
4998 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4999 {
5000         struct vcpu_vmx *vmx = to_vmx(vcpu);
5001
5002         if (!enable_vnmi) {
5003                 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
5004                         vmx->loaded_vmcs->soft_vnmi_blocked = masked;
5005                         vmx->loaded_vmcs->vnmi_blocked_time = 0;
5006                 }
5007         } else {
5008                 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5009                 if (masked)
5010                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5011                                       GUEST_INTR_STATE_NMI);
5012                 else
5013                         vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
5014                                         GUEST_INTR_STATE_NMI);
5015         }
5016 }
5017
5018 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
5019 {
5020         if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5021                 return false;
5022
5023         if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5024                 return true;
5025
5026         return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5027                 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
5028                  GUEST_INTR_STATE_NMI));
5029 }
5030
5031 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5032 {
5033         if (to_vmx(vcpu)->nested.nested_run_pending)
5034                 return -EBUSY;
5035
5036         /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
5037         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5038                 return -EBUSY;
5039
5040         return !vmx_nmi_blocked(vcpu);
5041 }
5042
5043 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5044 {
5045         if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5046                 return false;
5047
5048         return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
5049                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5050                 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
5051 }
5052
5053 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5054 {
5055         if (to_vmx(vcpu)->nested.nested_run_pending)
5056                 return -EBUSY;
5057
5058         /*
5059          * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
5060          * e.g. if the IRQ arrived asynchronously after checking nested events.
5061          */
5062         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5063                 return -EBUSY;
5064
5065         return !vmx_interrupt_blocked(vcpu);
5066 }
5067
5068 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
5069 {
5070         void __user *ret;
5071
5072         if (enable_unrestricted_guest)
5073                 return 0;
5074
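        /* The private memslot for the real-mode TSS spans three pages. */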
5075         mutex_lock(&kvm->slots_lock);
5076         ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
5077                                       PAGE_SIZE * 3);
5078         mutex_unlock(&kvm->slots_lock);
5079
5080         if (IS_ERR(ret))
5081                 return PTR_ERR(ret);
5082
5083         to_kvm_vmx(kvm)->tss_addr = addr;
5084
5085         return init_rmode_tss(kvm, ret);
5086 }
5087
5088 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5089 {
5090         to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
5091         return 0;
5092 }
5093
5094 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
5095 {
5096         switch (vec) {
5097         case BP_VECTOR:
5098                 /*
5099                  * Update instruction length as we may reinject the exception
5100                  * from user space while in guest debugging mode.
5101                  */
5102                 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5103                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5104                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5105                         return false;
5106                 fallthrough;
5107         case DB_VECTOR:
5108                 return !(vcpu->guest_debug &
5109                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
5110         case DE_VECTOR:
5111         case OF_VECTOR:
5112         case BR_VECTOR:
5113         case UD_VECTOR:
5114         case DF_VECTOR:
5115         case SS_VECTOR:
5116         case GP_VECTOR:
5117         case MF_VECTOR:
5118                 return true;
5119         }
5120         return false;
5121 }
5122
5123 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5124                                   int vec, u32 err_code)
5125 {
5126         /*
5127          * An instruction with the address-size override prefix (opcode 0x67)
5128          * causes a #SS fault with a zero error code in VM86 mode.
5129          */
5130         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
5131                 if (kvm_emulate_instruction(vcpu, 0)) {
5132                         if (vcpu->arch.halt_request) {
5133                                 vcpu->arch.halt_request = 0;
5134                                 return kvm_emulate_halt_noskip(vcpu);
5135                         }
5136                         return 1;
5137                 }
5138                 return 0;
5139         }
5140
5141         /*
5142          * Forward all other exceptions that are valid in real mode.
5143          * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5144          *        the required debugging infrastructure rework.
5145          */
5146         kvm_queue_exception(vcpu, vec);
5147         return 1;
5148 }
5149
5150 static int handle_machine_check(struct kvm_vcpu *vcpu)
5151 {
5152         /* handled by vmx_vcpu_run() */
5153         return 1;
5154 }
5155
5156 /*
5157  * If the host has split lock detection disabled, then #AC is
5158  * unconditionally injected into the guest, which matches the behaviour
5159  * from before split lock detection existed.
5160  *
5161  * If the host has split lock detection enabled then #AC is
5162  * only injected into the guest when:
5163  *  - Guest CPL == 3 (user mode)
5164  *  - Guest has #AC detection enabled in CR0
5165  *  - Guest EFLAGS has AC bit set
5166  */
5167 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
5168 {
5169         if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
5170                 return true;
5171
5172         return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
5173                (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
5174 }
5175
5176 static int handle_exception_nmi(struct kvm_vcpu *vcpu)
5177 {
5178         struct vcpu_vmx *vmx = to_vmx(vcpu);
5179         struct kvm_run *kvm_run = vcpu->run;
5180         u32 intr_info, ex_no, error_code;
5181         unsigned long cr2, dr6;
5182         u32 vect_info;
5183
5184         vect_info = vmx->idt_vectoring_info;
5185         intr_info = vmx_get_intr_info(vcpu);
5186
5187         /*
5188          * Machine checks are handled by handle_exception_irqoff(), or by
5189          * vmx_vcpu_run() if a #MC occurs on VM-Entry.  NMIs are handled by
5190          * vmx_vcpu_enter_exit().
5191          */
5192         if (is_machine_check(intr_info) || is_nmi(intr_info))
5193                 return 1;
5194
5195         /*
5196          * Queue the exception here instead of in handle_nm_fault_irqoff().
5197          * This ensures the nested_vmx check is not skipped so vmexit can
5198          * be reflected to L1 (when it intercepts #NM) before reaching this
5199          * point.
5200          */
5201         if (is_nm_fault(intr_info)) {
5202                 kvm_queue_exception(vcpu, NM_VECTOR);
5203                 return 1;
5204         }
5205
5206         if (is_invalid_opcode(intr_info))
5207                 return handle_ud(vcpu);
5208
5209         error_code = 0;
5210         if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
5211                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5212
5213         if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
5214                 WARN_ON_ONCE(!enable_vmware_backdoor);
5215
5216                 /*
5217                  * VMware backdoor emulation on #GP interception only handles
5218                  * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
5219                  * error code on #GP.
5220                  */
5221                 if (error_code) {
5222                         kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
5223                         return 1;
5224                 }
5225                 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
5226         }
5227
5228         /*
5229          * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
5230          * MMIO; it is better to report an internal error.
5231          * See the comments in vmx_handle_exit.
5232          */
5233         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5234             !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5235                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5236                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5237                 vcpu->run->internal.ndata = 4;
5238                 vcpu->run->internal.data[0] = vect_info;
5239                 vcpu->run->internal.data[1] = intr_info;
5240                 vcpu->run->internal.data[2] = error_code;
5241                 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
5242                 return 0;
5243         }
5244
5245         if (is_page_fault(intr_info)) {
5246                 cr2 = vmx_get_exit_qual(vcpu);
5247                 if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
5248                         /*
5249                          * EPT will cause a page fault only if we need to
5250                          * detect illegal GPAs.
5251                          */
5252                         WARN_ON_ONCE(!allow_smaller_maxphyaddr);
5253                         kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
5254                         return 1;
5255                 } else
5256                         return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
5257         }
5258
5259         ex_no = intr_info & INTR_INFO_VECTOR_MASK;
5260
5261         if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
5262                 return handle_rmode_exception(vcpu, ex_no, error_code);
5263
5264         switch (ex_no) {
5265         case DB_VECTOR:
5266                 dr6 = vmx_get_exit_qual(vcpu);
5267                 if (!(vcpu->guest_debug &
5268                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
5269                         /*
5270                          * If the #DB was due to ICEBP, a.k.a. INT1, skip the
5271                          * instruction.  ICEBP generates a trap-like #DB, but
5272                          * despite its interception control being tied to #DB,
5273                          * is an instruction intercept, i.e. the VM-Exit occurs
5274                          * on the ICEBP itself.  Use the inner "skip" helper to
5275                          * avoid single-step #DB and MTF updates, as ICEBP is
5276                          * higher priority.  Note, skipping ICEBP still clears
5277                          * STI and MOVSS blocking.
5278                          *
5279                          * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
5280                          * if single-step is enabled in RFLAGS and STI or MOVSS
5281                          * blocking is active, as the CPU doesn't set the bit
5282                          * on VM-Exit due to #DB interception.  VM-Entry has a
5283                          * consistency check that a single-step #DB is pending
5284                          * in this scenario as the previous instruction cannot
5285                          * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
5286                          * don't modify RFLAGS), therefore the one instruction
5287                          * delay when activating single-step breakpoints must
5288                          * have already expired.  Note, the CPU sets/clears BS
5289                          * as appropriate for all other VM-Exits types.
5290                          */
5291                         if (is_icebp(intr_info))
5292                                 WARN_ON(!skip_emulated_instruction(vcpu));
5293                         else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
5294                                  (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5295                                   (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
5296                                 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
5297                                             vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
5298
5299                         kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
5300                         return 1;
5301                 }
5302                 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
5303                 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
5304                 fallthrough;
5305         case BP_VECTOR:
5306                 /*
5307                  * Update instruction length as we may reinject #BP from
5308                  * user space while in guest debugging mode. Reading it for
5309                  * #DB as well causes no harm; it is not used in that case.
5310                  */
5311                 vmx->vcpu.arch.event_exit_inst_len =
5312                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5313                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5314                 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5315                 kvm_run->debug.arch.exception = ex_no;
5316                 break;
5317         case AC_VECTOR:
5318                 if (vmx_guest_inject_ac(vcpu)) {
5319                         kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5320                         return 1;
5321                 }
5322
5323                 /*
5324                  * Handle split lock. Depending on detection mode this will
5325                  * either warn and disable split lock detection for this
5326                  * task or force SIGBUS on it.
5327                  */
5328                 if (handle_guest_split_lock(kvm_rip_read(vcpu)))
5329                         return 1;
5330                 fallthrough;
5331         default:
5332                 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5333                 kvm_run->ex.exception = ex_no;
5334                 kvm_run->ex.error_code = error_code;
5335                 break;
5336         }
5337         return 0;
5338 }
5339
5340 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
5341 {
5342         ++vcpu->stat.irq_exits;
5343         return 1;
5344 }
5345
5346 static int handle_triple_fault(struct kvm_vcpu *vcpu)
5347 {
5348         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5349         vcpu->mmio_needed = 0;
5350         return 0;
5351 }
5352
5353 static int handle_io(struct kvm_vcpu *vcpu)
5354 {
5355         unsigned long exit_qualification;
5356         int size, in, string;
5357         unsigned port;
5358
5359         exit_qualification = vmx_get_exit_qual(vcpu);
5360         string = (exit_qualification & 16) != 0;
5361
5362         ++vcpu->stat.io_exits;
5363
5364         if (string)
5365                 return kvm_emulate_instruction(vcpu, 0);
5366
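        /*
         * Decode the exit qualification: bits 31:16 hold the port,
         * bits 2:0 the access size minus one, and bit 3 the direction
         * (set for IN).
         */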
5367         port = exit_qualification >> 16;
5368         size = (exit_qualification & 7) + 1;
5369         in = (exit_qualification & 8) != 0;
5370
5371         return kvm_fast_pio(vcpu, size, port, in);
5372 }
5373
5374 static void
5375 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5376 {
5377         /*
5378          * Patch in the VMCALL instruction:
5379          */
5380         hypercall[0] = 0x0f;
5381         hypercall[1] = 0x01;
5382         hypercall[2] = 0xc1;
5383 }
5384
5385 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
5386 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5387 {
5388         if (is_guest_mode(vcpu)) {
5389                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5390                 unsigned long orig_val = val;
5391
5392                 /*
5393                  * We get here when L2 changed cr0 in a way that did not change
5394                  * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5395                  * but did change L0 shadowed bits. So we first calculate the
5396                  * effective cr0 value that L1 would like to write into the
5397                  * hardware. It consists of the L2-owned bits from the new
5398                  * value combined with the L1-owned bits from L1's guest_cr0.
5399                  */
5400                 val = (val & ~vmcs12->cr0_guest_host_mask) |
5401                         (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5402
5403                 if (kvm_set_cr0(vcpu, val))
5404                         return 1;
5405                 vmcs_writel(CR0_READ_SHADOW, orig_val);
5406                 return 0;
5407         } else {
5408                 return kvm_set_cr0(vcpu, val);
5409         }
5410 }
5411
5412 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5413 {
5414         if (is_guest_mode(vcpu)) {
5415                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5416                 unsigned long orig_val = val;
5417
5418                 /* analogously to handle_set_cr0 */
5419                 val = (val & ~vmcs12->cr4_guest_host_mask) |
5420                         (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5421                 if (kvm_set_cr4(vcpu, val))
5422                         return 1;
5423                 vmcs_writel(CR4_READ_SHADOW, orig_val);
5424                 return 0;
5425         } else
5426                 return kvm_set_cr4(vcpu, val);
5427 }
5428
5429 static int handle_desc(struct kvm_vcpu *vcpu)
5430 {
5431         /*
5432          * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
5433          * and other code needs to be updated if UMIP can be guest owned.
5434          */
5435         BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
5436
5437         WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
5438         return kvm_emulate_instruction(vcpu, 0);
5439 }
5440
5441 static int handle_cr(struct kvm_vcpu *vcpu)
5442 {
5443         unsigned long exit_qualification, val;
5444         int cr;
5445         int reg;
5446         int err;
5447         int ret;
5448
5449         exit_qualification = vmx_get_exit_qual(vcpu);
5450         cr = exit_qualification & 15;
5451         reg = (exit_qualification >> 8) & 15;
5452         switch ((exit_qualification >> 4) & 3) {
5453         case 0: /* mov to cr */
5454                 val = kvm_register_read(vcpu, reg);
5455                 trace_kvm_cr_write(cr, val);
5456                 switch (cr) {
5457                 case 0:
5458                         err = handle_set_cr0(vcpu, val);
5459                         return kvm_complete_insn_gp(vcpu, err);
5460                 case 3:
5461                         WARN_ON_ONCE(enable_unrestricted_guest);
5462
5463                         err = kvm_set_cr3(vcpu, val);
5464                         return kvm_complete_insn_gp(vcpu, err);
5465                 case 4:
5466                         err = handle_set_cr4(vcpu, val);
5467                         return kvm_complete_insn_gp(vcpu, err);
5468                 case 8: {
5469                                 u8 cr8_prev = kvm_get_cr8(vcpu);
5470                                 u8 cr8 = (u8)val;
5471                                 err = kvm_set_cr8(vcpu, cr8);
5472                                 ret = kvm_complete_insn_gp(vcpu, err);
5473                                 if (lapic_in_kernel(vcpu))
5474                                         return ret;
5475                                 if (cr8_prev <= cr8)
5476                                         return ret;
5477                                 /*
5478                                  * TODO: we might be squashing a
5479                                  * KVM_GUESTDBG_SINGLESTEP-triggered
5480                                  * KVM_EXIT_DEBUG here.
5481                                  */
5482                                 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5483                                 return 0;
5484                         }
5485                 }
5486                 break;
5487         case 2: /* clts */
5488                 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
5489                 return -EIO;
5490         case 1: /*mov from cr*/
5491                 switch (cr) {
5492                 case 3:
5493                         WARN_ON_ONCE(enable_unrestricted_guest);
5494
5495                         val = kvm_read_cr3(vcpu);
5496                         kvm_register_write(vcpu, reg, val);
5497                         trace_kvm_cr_read(cr, val);
5498                         return kvm_skip_emulated_instruction(vcpu);
5499                 case 8:
5500                         val = kvm_get_cr8(vcpu);
5501                         kvm_register_write(vcpu, reg, val);
5502                         trace_kvm_cr_read(cr, val);
5503                         return kvm_skip_emulated_instruction(vcpu);
5504                 }
5505                 break;
5506         case 3: /* lmsw */
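                /* LMSW only updates CR0 bits 3:0 (PE, MP, EM and TS). */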
5507                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5508                 trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
5509                 kvm_lmsw(vcpu, val);
5510
5511                 return kvm_skip_emulated_instruction(vcpu);
5512         default:
5513                 break;
5514         }
5515         vcpu->run->exit_reason = 0;
5516         vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5517                (int)(exit_qualification >> 4) & 3, cr);
5518         return 0;
5519 }
5520
5521 static int handle_dr(struct kvm_vcpu *vcpu)
5522 {
5523         unsigned long exit_qualification;
5524         int dr, dr7, reg;
5525         int err = 1;
5526
5527         exit_qualification = vmx_get_exit_qual(vcpu);
5528         dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5529
5530         /* First, if the DR does not exist, trigger #UD. */
5531         if (!kvm_require_dr(vcpu, dr))
5532                 return 1;
5533
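        /* MOV to/from DRn is privileged; CPL > 0 gets a #GP (err stays 1). */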
5534         if (vmx_get_cpl(vcpu) > 0)
5535                 goto out;
5536
5537         dr7 = vmcs_readl(GUEST_DR7);
5538         if (dr7 & DR7_GD) {
5539                 /*
5540                  * As the vm-exit takes precedence over the debug trap, we
5541                  * need to emulate the latter, either for the host or the
5542                  * guest debugging itself.
5543                  */
5544                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5545                         vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
5546                         vcpu->run->debug.arch.dr7 = dr7;
5547                         vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5548                         vcpu->run->debug.arch.exception = DB_VECTOR;
5549                         vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5550                         return 0;
5551                 } else {
5552                         kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
5553                         return 1;
5554                 }
5555         }
5556
5557         if (vcpu->guest_debug == 0) {
5558                 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5559
5560                 /*
5561                  * No more DR vmexits; force a reload of the debug registers
5562                  * and reenter on this instruction.  The next vmexit will
5563                  * retrieve the full state of the debug registers.
5564                  */
5565                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5566                 return 1;
5567         }
5568
5569         reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5570         if (exit_qualification & TYPE_MOV_FROM_DR) {
5571                 unsigned long val;
5572
5573                 kvm_get_dr(vcpu, dr, &val);
5574                 kvm_register_write(vcpu, reg, val);
5575                 err = 0;
5576         } else {
5577                 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
5578         }
5579
5580 out:
5581         return kvm_complete_insn_gp(vcpu, err);
5582 }
5583
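/*
 * Called when the vCPU was allowed to run with MOV-DR exiting disabled
 * (KVM_DEBUGREG_WONT_EXIT): read back the debug registers the guest may have
 * modified into the vCPU state and re-enable MOV-DR exiting.
 */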
5584 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5585 {
5586         get_debugreg(vcpu->arch.db[0], 0);
5587         get_debugreg(vcpu->arch.db[1], 1);
5588         get_debugreg(vcpu->arch.db[2], 2);
5589         get_debugreg(vcpu->arch.db[3], 3);
5590         get_debugreg(vcpu->arch.dr6, 6);
5591         vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5592
5593         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5594         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5595
5596         /*
5597          * exc_debug expects dr6 to be cleared after it runs; make sure it
5598          * doesn't see a stale dr6 value from the guest.
5599          */
5600         set_debugreg(DR6_RESERVED, 6);
5601 }
5602
5603 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5604 {
5605         vmcs_writel(GUEST_DR7, val);
5606 }
5607
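/*
 * The guest lowered its TPR below the threshold programmed by KVM; recompute
 * the PPR so that a pending interrupt that is no longer masked by the TPR can
 * be recognized.
 */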
5608 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5609 {
5610         kvm_apic_update_ppr(vcpu);
5611         return 1;
5612 }
5613
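/*
 * The guest opened an interrupt window: stop requesting interrupt-window
 * exits and raise KVM_REQ_EVENT so that the pending interrupt gets injected.
 */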
5614 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5615 {
5616         exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5617
5618         kvm_make_request(KVM_REQ_EVENT, vcpu);
5619
5620         ++vcpu->stat.irq_window_exits;
5621         return 1;
5622 }
5623
5624 static int handle_invlpg(struct kvm_vcpu *vcpu)
5625 {
5626         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5627
5628         kvm_mmu_invlpg(vcpu, exit_qualification);
5629         return kvm_skip_emulated_instruction(vcpu);
5630 }
5631
5632 static int handle_apic_access(struct kvm_vcpu *vcpu)
5633 {
5634         if (likely(fasteoi)) {
5635                 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5636                 int access_type, offset;
5637
5638                 access_type = exit_qualification & APIC_ACCESS_TYPE;
5639                 offset = exit_qualification & APIC_ACCESS_OFFSET;
5640                 /*
5641                  * A sane guest uses MOV to write the EOI register, and the
5642                  * written value is ignored.  Short-circuit that case here
5643                  * to avoid heavy instruction emulation.
5644                  */
5645                 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5646                     (offset == APIC_EOI)) {
5647                         kvm_lapic_set_eoi(vcpu);
5648                         return kvm_skip_emulated_instruction(vcpu);
5649                 }
5650         }
5651         return kvm_emulate_instruction(vcpu, 0);
5652 }
5653
5654 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5655 {
5656         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5657         int vector = exit_qualification & 0xff;
5658
5659         /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5660         kvm_apic_set_eoi_accelerated(vcpu, vector);
5661         return 1;
5662 }
5663
5664 static int handle_apic_write(struct kvm_vcpu *vcpu)
5665 {
5666         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5667
5668         /*
5669          * APIC-write VM-Exit is trap-like, so KVM doesn't need to advance RIP,
5670          * and hardware has done any necessary aliasing, offset adjustments,
5671          * etc... for the access.  I.e. the correct value has already been
5672          * written to the vAPIC page for the correct 16-byte chunk.  KVM needs
5673          * only to retrieve the register value and emulate the access.
5674          */
5675         u32 offset = exit_qualification & 0xff0;
5676
5677         kvm_apic_write_nodecode(vcpu, offset);
5678         return 1;
5679 }
5680
5681 static int handle_task_switch(struct kvm_vcpu *vcpu)
5682 {
5683         struct vcpu_vmx *vmx = to_vmx(vcpu);
5684         unsigned long exit_qualification;
5685         bool has_error_code = false;
5686         u32 error_code = 0;
5687         u16 tss_selector;
5688         int reason, type, idt_v, idt_index;
5689
5690         idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5691         idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5692         type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5693
5694         exit_qualification = vmx_get_exit_qual(vcpu);
5695
5696         reason = (u32)exit_qualification >> 30;
5697         if (reason == TASK_SWITCH_GATE && idt_v) {
5698                 switch (type) {
5699                 case INTR_TYPE_NMI_INTR:
5700                         vcpu->arch.nmi_injected = false;
5701                         vmx_set_nmi_mask(vcpu, true);
5702                         break;
5703                 case INTR_TYPE_EXT_INTR:
5704                 case INTR_TYPE_SOFT_INTR:
5705                         kvm_clear_interrupt_queue(vcpu);
5706                         break;
5707                 case INTR_TYPE_HARD_EXCEPTION:
5708                         if (vmx->idt_vectoring_info &
5709                             VECTORING_INFO_DELIVER_CODE_MASK) {
5710                                 has_error_code = true;
5711                                 error_code =
5712                                         vmcs_read32(IDT_VECTORING_ERROR_CODE);
5713                         }
5714                         fallthrough;
5715                 case INTR_TYPE_SOFT_EXCEPTION:
5716                         kvm_clear_exception_queue(vcpu);
5717                         break;
5718                 default:
5719                         break;
5720                 }
5721         }
5722         tss_selector = exit_qualification;
5723
5724         if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5725                        type != INTR_TYPE_EXT_INTR &&
5726                        type != INTR_TYPE_NMI_INTR))
5727                 WARN_ON(!skip_emulated_instruction(vcpu));
5728
5729         /*
5730          * TODO: What about debug traps on tss switch?
5731          *       Are we supposed to inject them and update dr6?
5732          */
5733         return kvm_task_switch(vcpu, tss_selector,
5734                                type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
5735                                reason, has_error_code, error_code);
5736 }
5737
5738 static int handle_ept_violation(struct kvm_vcpu *vcpu)
5739 {
5740         unsigned long exit_qualification;
5741         gpa_t gpa;
5742         u64 error_code;
5743
5744         exit_qualification = vmx_get_exit_qual(vcpu);
5745
5746         /*
5747          * If the EPT violation happened while executing IRET from an NMI,
5748          * the "blocked by NMI" bit has to be set before the next VM-Entry.
5749          * There are errata that may cause this bit to not be set:
5750          * AAK134, BY25.
5751          */
5752         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5753                         enable_vnmi &&
5754                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5755                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5756
5757         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5758         trace_kvm_page_fault(vcpu, gpa, exit_qualification);
5759
5760         /* Is it a read fault? */
5761         error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
5762                      ? PFERR_USER_MASK : 0;
5763         /* Is it a write fault? */
5764         error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
5765                       ? PFERR_WRITE_MASK : 0;
5766         /* Is it a fetch fault? */
5767         error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
5768                       ? PFERR_FETCH_MASK : 0;
5769         /* Is the EPT page-table entry present? */
5770         error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
5771                       ? PFERR_PRESENT_MASK : 0;
5772
5773         error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
5774                PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
5775
5776         vcpu->arch.exit_qualification = exit_qualification;
5777
5778         /*
5779          * Check that the GPA doesn't exceed physical memory limits, as that is
5780          * a guest page fault.  We have to emulate the instruction here, because
5781          * if the illegal address is that of a paging structure, then the
5782          * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
5783          * would also use advanced VM-exit information for EPT violations to
5784          * reconstruct the page fault error code.
5785          */
5786         if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
5787                 return kvm_emulate_instruction(vcpu, 0);
5788
5789         return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5790 }
5791
5792 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5793 {
5794         gpa_t gpa;
5795
5796         if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
5797                 return 1;
5798
5799         /*
5800          * A nested guest cannot optimize MMIO vmexits, because we have an
5801          * nGPA here instead of the required GPA.
5802          */
5803         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5804         if (!is_guest_mode(vcpu) &&
5805             !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5806                 trace_kvm_fast_mmio(gpa);
5807                 return kvm_skip_emulated_instruction(vcpu);
5808         }
5809
5810         return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
5811 }
5812
5813 static int handle_nmi_window(struct kvm_vcpu *vcpu)
5814 {
5815         if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
5816                 return -EIO;
5817
5818         exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5819         ++vcpu->stat.nmi_window_exits;
5820         kvm_make_request(KVM_REQ_EVENT, vcpu);
5821
5822         return 1;
5823 }
5824
5825 static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
5826 {
5827         struct vcpu_vmx *vmx = to_vmx(vcpu);
5828
5829         return vmx->emulation_required && !vmx->rmode.vm86_active &&
5830                (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
5831 }
5832
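/*
 * Emulate the guest one instruction at a time while its state cannot be
 * represented by VMX (emulation_required), periodically bailing out so that
 * interrupt windows, pending events, emulation failures, HLT and pending
 * work can be serviced by the normal run loop.
 */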
5833 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5834 {
5835         struct vcpu_vmx *vmx = to_vmx(vcpu);
5836         bool intr_window_requested;
5837         unsigned count = 130;
5838
5839         intr_window_requested = exec_controls_get(vmx) &
5840                                 CPU_BASED_INTR_WINDOW_EXITING;
5841
5842         while (vmx->emulation_required && count-- != 0) {
5843                 if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
5844                         return handle_interrupt_window(&vmx->vcpu);
5845
5846                 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
5847                         return 1;
5848
5849                 if (!kvm_emulate_instruction(vcpu, 0))
5850                         return 0;
5851
5852                 if (vmx_emulation_required_with_pending_exception(vcpu)) {
5853                         kvm_prepare_emulation_failure_exit(vcpu);
5854                         return 0;
5855                 }
5856
5857                 if (vcpu->arch.halt_request) {
5858                         vcpu->arch.halt_request = 0;
5859                         return kvm_emulate_halt_noskip(vcpu);
5860                 }
5861
5862                 /*
5863                  * Note, return 1 and not 0; vcpu_run() will invoke
5864                  * xfer_to_guest_mode(), which will create a proper return
5865                  * code.
5866                  */
5867                 if (__xfer_to_guest_mode_work_pending())
5868                         return 1;
5869         }
5870
5871         return 1;
5872 }
5873
5874 static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
5875 {
5876         if (vmx_emulation_required_with_pending_exception(vcpu)) {
5877                 kvm_prepare_emulation_failure_exit(vcpu);
5878                 return 0;
5879         }
5880
5881         return 1;
5882 }
5883
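/*
 * Adaptive PLE window: grow the window when a vCPU takes a PAUSE-loop exit so
 * that spinning guests exit less often.  ple_window_dirty tells the next
 * VM-Entry to propagate the new value to the VMCS.
 */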
5884 static void grow_ple_window(struct kvm_vcpu *vcpu)
5885 {
5886         struct vcpu_vmx *vmx = to_vmx(vcpu);
5887         unsigned int old = vmx->ple_window;
5888
5889         vmx->ple_window = __grow_ple_window(old, ple_window,
5890                                             ple_window_grow,
5891                                             ple_window_max);
5892
5893         if (vmx->ple_window != old) {
5894                 vmx->ple_window_dirty = true;
5895                 trace_kvm_ple_window_update(vcpu->vcpu_id,
5896                                             vmx->ple_window, old);
5897         }
5898 }
5899
5900 static void shrink_ple_window(struct kvm_vcpu *vcpu)
5901 {
5902         struct vcpu_vmx *vmx = to_vmx(vcpu);
5903         unsigned int old = vmx->ple_window;
5904
5905         vmx->ple_window = __shrink_ple_window(old, ple_window,
5906                                               ple_window_shrink,
5907                                               ple_window);
5908
5909         if (vmx->ple_window != old) {
5910                 vmx->ple_window_dirty = true;
5911                 trace_kvm_ple_window_update(vcpu->vcpu_id,
5912                                             vmx->ple_window, old);
5913         }
5914 }
5915
5916 /*
5917  * The vCPU is busy-waiting on a spinlock.  KVM never enables plain PAUSE
5918  * exiting, so this handler is only reached on CPUs with PAUSE-loop exiting.
5919  */
5920 static int handle_pause(struct kvm_vcpu *vcpu)
5921 {
5922         if (!kvm_pause_in_guest(vcpu->kvm))
5923                 grow_ple_window(vcpu);
5924
5925         /*
5926          * Intel SDM vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
5927          * VM-execution control is ignored if CPL > 0.  OTOH, KVM
5928          * never sets PAUSE_EXITING and only sets PLE if supported,
5929          * so the vCPU must be at CPL=0 if it gets a PAUSE exit.
5930          */
5931         kvm_vcpu_on_spin(vcpu, true);
5932         return kvm_skip_emulated_instruction(vcpu);
5933 }
5934
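/* A Monitor Trap Flag VM-Exit needs no handling here; simply resume the guest. */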
5935 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
5936 {
5937         return 1;
5938 }
5939
5940 static int handle_invpcid(struct kvm_vcpu *vcpu)
5941 {
5942         u32 vmx_instruction_info;
5943         unsigned long type;
5944         gva_t gva;
5945         struct {
5946                 u64 pcid;
5947                 u64 gla;
5948         } operand;
5949         int gpr_index;
5950
5951         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
5952                 kvm_queue_exception(vcpu, UD_VECTOR);
5953                 return 1;
5954         }
5955
5956         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5957         gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5958         type = kvm_register_read(vcpu, gpr_index);
5959
5960         /* According to the Intel instruction reference, the memory operand
5961          * is read even if it isn't needed (e.g., for type==all)
5962          */
5963         if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5964                                 vmx_instruction_info, false,
5965                                 sizeof(operand), &gva))
5966                 return 1;
5967
5968         return kvm_handle_invpcid(vcpu, type, gva);
5969 }
5970
5971 static int handle_pml_full(struct kvm_vcpu *vcpu)
5972 {
5973         unsigned long exit_qualification;
5974
5975         trace_kvm_pml_full(vcpu->vcpu_id);
5976
5977         exit_qualification = vmx_get_exit_qual(vcpu);
5978
5979         /*
5980          * If the PML-buffer-full exit happened while executing IRET from an
5981          * NMI, the "blocked by NMI" bit has to be set before the next VM-Entry.
5982          */
5983         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5984                         enable_vnmi &&
5985                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5986                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5987                                 GUEST_INTR_STATE_NMI);
5988
5989         /*
5990          * The PML buffer was already flushed at the beginning of the VM-Exit.
5991          * Nothing to do here, and there's no userspace involvement needed for PML.
5992          */
5993         return 1;
5994 }
5995
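/*
 * The VMX preemption timer fired: unless the timer was armed only to force an
 * immediate exit, or has been soft-disabled, forward the expiry to the lapic
 * hv-timer logic and re-enter the guest via the fastpath.
 */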
5996 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
5997 {
5998         struct vcpu_vmx *vmx = to_vmx(vcpu);
5999
6000         if (!vmx->req_immediate_exit &&
6001             !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
6002                 kvm_lapic_expired_hv_timer(vcpu);
6003                 return EXIT_FASTPATH_REENTER_GUEST;
6004         }
6005
6006         return EXIT_FASTPATH_NONE;
6007 }
6008
6009 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
6010 {
6011         handle_fastpath_preemption_timer(vcpu);
6012         return 1;
6013 }
6014
6015 /*
6016  * When nested=0, all VMX instruction VM-Exits land here.  The handlers
6017  * are overwritten by nested_vmx_setup() when nested=1.
6018  */
6019 static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
6020 {
6021         kvm_queue_exception(vcpu, UD_VECTOR);
6022         return 1;
6023 }
6024
6025 #ifndef CONFIG_X86_SGX_KVM
6026 static int handle_encls(struct kvm_vcpu *vcpu)
6027 {
6028         /*
6029          * SGX virtualization is disabled.  There is no software enable bit for
6030          * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
6031          * the guest from executing ENCLS (when SGX is supported by hardware).
6032          */
6033         kvm_queue_exception(vcpu, UD_VECTOR);
6034         return 1;
6035 }
6036 #endif /* CONFIG_X86_SGX_KVM */
6037
6038 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
6039 {
6040         /*
6041          * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
6042          * VM-Exits. Unconditionally set the flag here and leave the handling to
6043          * vmx_handle_exit().
6044          */
6045         to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
6046         return 1;
6047 }
6048
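/*
 * A Notify VM-Exit fires when the CPU detects that the guest has gone an
 * excessively long time without opening an event window.  Exit to userspace
 * if the hardware reports that the VM context was corrupted, or if userspace
 * asked to be notified via KVM_X86_NOTIFY_VMEXIT_USER; otherwise resume.
 */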
6049 static int handle_notify(struct kvm_vcpu *vcpu)
6050 {
6051         unsigned long exit_qual = vmx_get_exit_qual(vcpu);
6052         bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
6053
6054         ++vcpu->stat.notify_window_exits;
6055
6056         /*
6057          * If the Notify VM-Exit happened while executing IRET from an NMI,
6058          * the "blocked by NMI" bit has to be set before the next VM-Entry.
6059          */
6060         if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
6061                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6062                               GUEST_INTR_STATE_NMI);
6063
6064         if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
6065             context_invalid) {
6066                 vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
6067                 vcpu->run->notify.flags = context_invalid ?
6068                                           KVM_NOTIFY_CONTEXT_INVALID : 0;
6069                 return 0;
6070         }
6071
6072         return 1;
6073 }
6074
6075 /*
6076  * The exit handlers return 1 if the exit was handled fully and guest execution
6077  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
6078  * to be done to userspace and return 0.
6079  */
6080 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6081         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
6082         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
6083         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
6084         [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
6085         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
6086         [EXIT_REASON_CR_ACCESS]               = handle_cr,
6087         [EXIT_REASON_DR_ACCESS]               = handle_dr,
6088         [EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
6089         [EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
6090         [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
6091         [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
6092         [EXIT_REASON_HLT]                     = kvm_emulate_halt,
6093         [EXIT_REASON_INVD]                    = kvm_emulate_invd,
6094         [EXIT_REASON_INVLPG]                  = handle_invlpg,
6095         [EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
6096         [EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
6097         [EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
6098         [EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
6099         [EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
6100         [EXIT_REASON_VMPTRST]                 = handle_vmx_instruction,
6101         [EXIT_REASON_VMREAD]                  = handle_vmx_instruction,
6102         [EXIT_REASON_VMRESUME]                = handle_vmx_instruction,
6103         [EXIT_REASON_VMWRITE]                 = handle_vmx_instruction,
6104         [EXIT_REASON_VMOFF]                   = handle_vmx_instruction,
6105         [EXIT_REASON_VMON]                    = handle_vmx_instruction,
6106         [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
6107         [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
6108         [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
6109         [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
6110         [EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
6111         [EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
6112         [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
6113         [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
6114         [EXIT_REASON_GDTR_IDTR]               = handle_desc,
6115         [EXIT_REASON_LDTR_TR]                 = handle_desc,
6116         [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
6117         [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
6118         [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
6119         [EXIT_REASON_MWAIT_INSTRUCTION]       = kvm_emulate_mwait,
6120         [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
6121         [EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
6122         [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
6123         [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
6124         [EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
6125         [EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
6126         [EXIT_REASON_PML_FULL]                = handle_pml_full,
6127         [EXIT_REASON_INVPCID]                 = handle_invpcid,
6128         [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
6129         [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
6130         [EXIT_REASON_ENCLS]                   = handle_encls,
6131         [EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
6132         [EXIT_REASON_NOTIFY]                  = handle_notify,
6133 };
6134
6135 static const int kvm_vmx_max_exit_handlers =
6136         ARRAY_SIZE(kvm_vmx_exit_handlers);
6137
6138 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
6139                               u64 *info1, u64 *info2,
6140                               u32 *intr_info, u32 *error_code)
6141 {
6142         struct vcpu_vmx *vmx = to_vmx(vcpu);
6143
6144         *reason = vmx->exit_reason.full;
6145         *info1 = vmx_get_exit_qual(vcpu);
6146         if (!(vmx->exit_reason.failed_vmentry)) {
6147                 *info2 = vmx->idt_vectoring_info;
6148                 *intr_info = vmx_get_intr_info(vcpu);
6149                 if (is_exception_with_error_code(*intr_info))
6150                         *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6151                 else
6152                         *error_code = 0;
6153         } else {
6154                 *info2 = 0;
6155                 *intr_info = 0;
6156                 *error_code = 0;
6157         }
6158 }
6159
6160 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
6161 {
6162         if (vmx->pml_pg) {
6163                 __free_page(vmx->pml_pg);
6164                 vmx->pml_pg = NULL;
6165         }
6166 }
6167
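/*
 * Drain the Page Modification Log: mark every GPA logged by hardware as dirty
 * and reset the PML index.  Hardware fills the buffer from the last entry
 * toward index 0, so an index of PML_ENTITY_NUM - 1 means the buffer is empty.
 */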
6168 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
6169 {
6170         struct vcpu_vmx *vmx = to_vmx(vcpu);
6171         u64 *pml_buf;
6172         u16 pml_idx;
6173
6174         pml_idx = vmcs_read16(GUEST_PML_INDEX);
6175
6176         /* Do nothing if PML buffer is empty */
6177         if (pml_idx == (PML_ENTITY_NUM - 1))
6178                 return;
6179
6180         /* PML index always points to next available PML buffer entity */
6181         if (pml_idx >= PML_ENTITY_NUM)
6182                 pml_idx = 0;
6183         else
6184                 pml_idx++;
6185
6186         pml_buf = page_address(vmx->pml_pg);
6187         for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
6188                 u64 gpa;
6189
6190                 gpa = pml_buf[pml_idx];
6191                 WARN_ON(gpa & (PAGE_SIZE - 1));
6192                 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
6193         }
6194
6195         /* reset PML index */
6196         vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
6197 }
6198
6199 static void vmx_dump_sel(char *name, uint32_t sel)
6200 {
6201         pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
6202                name, vmcs_read16(sel),
6203                vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
6204                vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
6205                vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
6206 }
6207
6208 static void vmx_dump_dtsel(char *name, uint32_t limit)
6209 {
6210         pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
6211                name, vmcs_read32(limit),
6212                vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
6213 }
6214
6215 static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
6216 {
6217         unsigned int i;
6218         struct vmx_msr_entry *e;
6219
6220         pr_err("MSR %s:\n", name);
6221         for (i = 0, e = m->val; i < m->nr; ++i, ++e)
6222                 pr_err("  %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
6223 }
6224
6225 void dump_vmcs(struct kvm_vcpu *vcpu)
6226 {
6227         struct vcpu_vmx *vmx = to_vmx(vcpu);
6228         u32 vmentry_ctl, vmexit_ctl;
6229         u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
6230         u64 tertiary_exec_control;
6231         unsigned long cr4;
6232         int efer_slot;
6233
6234         if (!dump_invalid_vmcs) {
6235                 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
6236                 return;
6237         }
6238
6239         vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
6240         vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
6241         cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6242         pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
6243         cr4 = vmcs_readl(GUEST_CR4);
6244
6245         if (cpu_has_secondary_exec_ctrls())
6246                 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6247         else
6248                 secondary_exec_control = 0;
6249
6250         if (cpu_has_tertiary_exec_ctrls())
6251                 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
6252         else
6253                 tertiary_exec_control = 0;
6254
6255         pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
6256                vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
6257         pr_err("*** Guest State ***\n");
6258         pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6259                vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
6260                vmcs_readl(CR0_GUEST_HOST_MASK));
6261         pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6262                cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
6263         pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
6264         if (cpu_has_vmx_ept()) {
6265                 pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
6266                        vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
6267                 pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
6268                        vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
6269         }
6270         pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
6271                vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
6272         pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
6273                vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
6274         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6275                vmcs_readl(GUEST_SYSENTER_ESP),
6276                vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
6277         vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
6278         vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
6279         vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
6280         vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
6281         vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
6282         vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
6283         vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
6284         vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
6285         vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
6286         vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
6287         efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
6288         if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
6289                 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
6290         else if (efer_slot >= 0)
6291                 pr_err("EFER= 0x%016llx (autoload)\n",
6292                        vmx->msr_autoload.guest.val[efer_slot].value);
6293         else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
6294                 pr_err("EFER= 0x%016llx (effective)\n",
6295                        vcpu->arch.efer | (EFER_LMA | EFER_LME));
6296         else
6297                 pr_err("EFER= 0x%016llx (effective)\n",
6298                        vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
6299         if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
6300                 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
6301         pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
6302                vmcs_read64(GUEST_IA32_DEBUGCTL),
6303                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
6304         if (cpu_has_load_perf_global_ctrl() &&
6305             vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
6306                 pr_err("PerfGlobCtl = 0x%016llx\n",
6307                        vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
6308         if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
6309                 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
6310         pr_err("Interruptibility = %08x  ActivityState = %08x\n",
6311                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
6312                vmcs_read32(GUEST_ACTIVITY_STATE));
6313         if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
6314                 pr_err("InterruptStatus = %04x\n",
6315                        vmcs_read16(GUEST_INTR_STATUS));
6316         if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
6317                 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
6318         if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
6319                 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
6320
6321         pr_err("*** Host State ***\n");
6322         pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
6323                vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6324         pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6325                vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6326                vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6327                vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6328                vmcs_read16(HOST_TR_SELECTOR));
6329         pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6330                vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6331                vmcs_readl(HOST_TR_BASE));
6332         pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6333                vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6334         pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6335                vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6336                vmcs_readl(HOST_CR4));
6337         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6338                vmcs_readl(HOST_IA32_SYSENTER_ESP),
6339                vmcs_read32(HOST_IA32_SYSENTER_CS),
6340                vmcs_readl(HOST_IA32_SYSENTER_EIP));
6341         if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
6342                 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
6343         if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
6344                 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
6345         if (cpu_has_load_perf_global_ctrl() &&
6346             vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6347                 pr_err("PerfGlobCtl = 0x%016llx\n",
6348                        vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
6349         if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
6350                 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
6351
6352         pr_err("*** Control State ***\n");
6353         pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
6354                cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
6355         pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
6356                pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
6357         pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6358                vmcs_read32(EXCEPTION_BITMAP),
6359                vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6360                vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6361         pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6362                vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6363                vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6364                vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6365         pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6366                vmcs_read32(VM_EXIT_INTR_INFO),
6367                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6368                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6369         pr_err("        reason=%08x qualification=%016lx\n",
6370                vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6371         pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6372                vmcs_read32(IDT_VECTORING_INFO_FIELD),
6373                vmcs_read32(IDT_VECTORING_ERROR_CODE));
6374         pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6375         if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6376                 pr_err("TSC Multiplier = 0x%016llx\n",
6377                        vmcs_read64(TSC_MULTIPLIER));
6378         if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6379                 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6380                         u16 status = vmcs_read16(GUEST_INTR_STATUS);
6381                         pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
6382                 }
6383                 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
6384                 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6385                         pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
6386                 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
6387         }
6388         if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6389                 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6390         if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6391                 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
6392         if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6393                 pr_err("PLE Gap=%08x Window=%08x\n",
6394                        vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6395         if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6396                 pr_err("Virtual processor ID = 0x%04x\n",
6397                        vmcs_read16(VIRTUAL_PROCESSOR_ID));
6398 }
6399
6400 /*
6401  * The guest has exited.  See if we can fix it or if we need userspace
6402  * assistance.
6403  */
6404 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6405 {
6406         struct vcpu_vmx *vmx = to_vmx(vcpu);
6407         union vmx_exit_reason exit_reason = vmx->exit_reason;
6408         u32 vectoring_info = vmx->idt_vectoring_info;
6409         u16 exit_handler_index;
6410
6411         /*
6412          * Flush the PML buffer of logged GPAs so that dirty_bitmap stays up to
6413          * date.  Another benefit: kvm_vm_ioctl_get_dirty_log() only needs to
6414          * kick all vCPUs out of guest mode before querying dirty_bitmap,
6415          * because once a vCPU is in root mode its PML buffer must already have
6416          * been flushed.  Note, PML is never enabled in hardware while
6417          * running L2.
6418          */
6419         if (enable_pml && !is_guest_mode(vcpu))
6420                 vmx_flush_pml_buffer(vcpu);
6421
6422         /*
6423          * KVM should never reach this point with a pending nested VM-Enter.
6424          * More specifically, short-circuiting VM-Entry to emulate L2 due to
6425          * invalid guest state should never happen as that means KVM knowingly
6426          * allowed a nested VM-Enter with an invalid vmcs12.  More below.
6427          */
6428         if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
6429                 return -EIO;
6430
6431         if (is_guest_mode(vcpu)) {
6432                 /*
6433                  * PML is never enabled when running L2, bail immediately if a
6434                  * PML full exit occurs as something is horribly wrong.
6435                  */
6436                 if (exit_reason.basic == EXIT_REASON_PML_FULL)
6437                         goto unexpected_vmexit;
6438
6439                 /*
6440                  * The host physical addresses of some pages of guest memory
6441                  * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6442                  * Page). The CPU may write to these pages via their host
6443                  * physical address while L2 is running, bypassing any
6444                  * address-translation-based dirty tracking (e.g. EPT write
6445                  * protection).
6446                  *
6447                  * Mark them dirty on every exit from L2 to prevent them from
6448                  * getting out of sync with dirty tracking.
6449                  */
6450                 nested_mark_vmcs12_pages_dirty(vcpu);
6451
6452                 /*
6453                  * Synthesize a triple fault if L2 state is invalid.  In normal
6454                  * operation, nested VM-Enter rejects any attempt to enter L2
6455                  * with invalid state.  However, those checks are skipped if
6456                  * state is being stuffed via RSM or KVM_SET_NESTED_STATE.  If
6457                  * L2 state is invalid, it means either L1 modified SMRAM state
6458                  * or userspace provided bad state.  Synthesize TRIPLE_FAULT as
6459                  * doing so is architecturally allowed in the RSM case, and is
6460                  * the least awful solution for the userspace case without
6461                  * risking false positives.
6462                  */
6463                 if (vmx->emulation_required) {
6464                         nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
6465                         return 1;
6466                 }
6467
6468                 if (nested_vmx_reflect_vmexit(vcpu))
6469                         return 1;
6470         }
6471
6472         /* If guest state is invalid, start emulating.  L2 is handled above. */
6473         if (vmx->emulation_required)
6474                 return handle_invalid_guest_state(vcpu);
6475
6476         if (exit_reason.failed_vmentry) {
6477                 dump_vmcs(vcpu);
6478                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6479                 vcpu->run->fail_entry.hardware_entry_failure_reason
6480                         = exit_reason.full;
6481                 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6482                 return 0;
6483         }
6484
6485         if (unlikely(vmx->fail)) {
6486                 dump_vmcs(vcpu);
6487                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6488                 vcpu->run->fail_entry.hardware_entry_failure_reason
6489                         = vmcs_read32(VM_INSTRUCTION_ERROR);
6490                 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6491                 return 0;
6492         }
6493
6494         /*
6495          * Note:
6496          * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
6497          * event delivery, as that indicates the guest is accessing MMIO.
6498          * The VM-Exit would simply be triggered again after returning to the
6499          * guest, causing an infinite loop.
6500          */
6501         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6502             (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6503              exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6504              exit_reason.basic != EXIT_REASON_PML_FULL &&
6505              exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6506              exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6507              exit_reason.basic != EXIT_REASON_NOTIFY)) {
6508                 int ndata = 3;
6509
6510                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6511                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6512                 vcpu->run->internal.data[0] = vectoring_info;
6513                 vcpu->run->internal.data[1] = exit_reason.full;
6514                 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
6515                 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
6516                         vcpu->run->internal.data[ndata++] =
6517                                 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6518                 }
6519                 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6520                 vcpu->run->internal.ndata = ndata;
6521                 return 0;
6522         }
6523
6524         if (unlikely(!enable_vnmi &&
6525                      vmx->loaded_vmcs->soft_vnmi_blocked)) {
6526                 if (!vmx_interrupt_blocked(vcpu)) {
6527                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6528                 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6529                            vcpu->arch.nmi_pending) {
6530                         /*
6531                          * This CPU doesn't help us find the end of an
6532                          * NMI-blocked window if the guest runs with IRQs
6533                          * disabled.  So we pull the trigger after 1 s of
6534                          * futile waiting, but inform the user about this.
6535                          */
6536                         printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6537                                "state on VCPU %d after 1 s timeout\n",
6538                                __func__, vcpu->vcpu_id);
6539                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6540                 }
6541         }
6542
6543         if (exit_fastpath != EXIT_FASTPATH_NONE)
6544                 return 1;
6545
6546         if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
6547                 goto unexpected_vmexit;
6548 #ifdef CONFIG_RETPOLINE
6549         if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6550                 return kvm_emulate_wrmsr(vcpu);
6551         else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
6552                 return handle_preemption_timer(vcpu);
6553         else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
6554                 return handle_interrupt_window(vcpu);
6555         else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6556                 return handle_external_interrupt(vcpu);
6557         else if (exit_reason.basic == EXIT_REASON_HLT)
6558                 return kvm_emulate_halt(vcpu);
6559         else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
6560                 return handle_ept_misconfig(vcpu);
6561 #endif
6562
6563         exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6564                                                 kvm_vmx_max_exit_handlers);
6565         if (!kvm_vmx_exit_handlers[exit_handler_index])
6566                 goto unexpected_vmexit;
6567
6568         return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
6569
6570 unexpected_vmexit:
6571         vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6572                     exit_reason.full);
6573         dump_vmcs(vcpu);
6574         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6575         vcpu->run->internal.suberror =
6576                         KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
6577         vcpu->run->internal.ndata = 2;
6578         vcpu->run->internal.data[0] = exit_reason.full;
6579         vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
6580         return 0;
6581 }
6582
6583 static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6584 {
6585         int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6586
6587         /*
6588          * Exit to user space when a bus lock is detected, to inform userspace
6589          * that a bus lock occurred in the guest.
6590          */
6591         if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
6592                 if (ret > 0)
6593                         vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
6594
6595                 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
6596                 return 0;
6597         }
6598         return ret;
6599 }
6600
6601 /*
6602  * Software-based L1D cache flush, used when the microcode providing
6603  * the cache control MSR is not loaded.
6604  *
6605  * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
6606  * flushing it requires reading in 64 KiB because the replacement algorithm
6607  * is not exactly LRU.  This could be sized at runtime via topology
6608  * information, but as all relevant affected CPUs have a 32 KiB L1D cache
6609  * there is no point in doing so.
6610  */
6611 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
6612 {
6613         int size = PAGE_SIZE << L1D_CACHE_ORDER;
6614
6615         /*
6616          * This code is only executed when the flush mode is 'cond' or
6617          * 'always'.
6618          */
6619         if (static_branch_likely(&vmx_l1d_flush_cond)) {
6620                 bool flush_l1d;
6621
6622                 /*
6623                  * Clear the per-vcpu flush bit, it gets set again
6624                  * either from vcpu_run() or from one of the unsafe
6625                  * VMEXIT handlers.
6626                  */
6627                 flush_l1d = vcpu->arch.l1tf_flush_l1d;
6628                 vcpu->arch.l1tf_flush_l1d = false;
6629
6630                 /*
6631                  * Clear the per-cpu flush bit, it gets set again from
6632                  * the interrupt handlers.
6633                  */
6634                 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
6635                 kvm_clear_cpu_l1tf_flush_l1d();
6636
6637                 if (!flush_l1d)
6638                         return;
6639         }
6640
6641         vcpu->stat.l1d_flush++;
6642
6643         if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
6644                 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
6645                 return;
6646         }
6647
6648         asm volatile(
6649                 /* First ensure the pages are in the TLB */
6650                 "xorl   %%eax, %%eax\n"
6651                 ".Lpopulate_tlb:\n\t"
6652                 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6653                 "addl   $4096, %%eax\n\t"
6654                 "cmpl   %%eax, %[size]\n\t"
6655                 "jne    .Lpopulate_tlb\n\t"
6656                 "xorl   %%eax, %%eax\n\t"
6657                 "cpuid\n\t"
6658                 /* Now fill the cache */
6659                 "xorl   %%eax, %%eax\n"
6660                 ".Lfill_cache:\n"
6661                 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6662                 "addl   $64, %%eax\n\t"
6663                 "cmpl   %%eax, %[size]\n\t"
6664                 "jne    .Lfill_cache\n\t"
6665                 "lfence\n"
6666                 :: [flush_pages] "r" (vmx_l1d_flush_pages),
6667                     [size] "r" (size)
6668                 : "eax", "ebx", "ecx", "edx");
6669 }
6670
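/*
 * Program the TPR threshold that triggers TPR-below-threshold VM-Exits: if a
 * pending interrupt (irr) is currently blocked by the guest's TPR, use its
 * priority as the threshold so KVM is notified when the TPR drops; otherwise
 * clear the threshold.  When L2 is active the value is either irrelevant (L1
 * uses its own TPR shadow) or saved for vmcs01 in l1_tpr_threshold.
 */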
6671 static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6672 {
6673         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6674         int tpr_threshold;
6675
6676         if (is_guest_mode(vcpu) &&
6677                 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6678                 return;
6679
6680         tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6681         if (is_guest_mode(vcpu))
6682                 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6683         else
6684                 vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6685 }
6686
6687 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6688 {
6689         struct vcpu_vmx *vmx = to_vmx(vcpu);
6690         u32 sec_exec_control;
6691
6692         if (!lapic_in_kernel(vcpu))
6693                 return;
6694
6695         if (!flexpriority_enabled &&
6696             !cpu_has_vmx_virtualize_x2apic_mode())
6697                 return;
6698
6699         /* Postpone execution until vmcs01 is the current VMCS. */
6700         if (is_guest_mode(vcpu)) {
6701                 vmx->nested.change_vmcs01_virtual_apic_mode = true;
6702                 return;
6703         }
6704
6705         sec_exec_control = secondary_exec_controls_get(vmx);
6706         sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6707                               SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6708
6709         switch (kvm_get_apic_mode(vcpu)) {
6710         case LAPIC_MODE_INVALID:
6711                 WARN_ONCE(true, "Invalid local APIC state");
6712                 break;
6713         case LAPIC_MODE_DISABLED:
6714                 break;
6715         case LAPIC_MODE_XAPIC:
6716                 if (flexpriority_enabled) {
6717                         sec_exec_control |=
6718                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6719                         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6720
6721                         /*
6722                          * Flush the TLB, reloading the APIC access page will
6723                          * only do so if its physical address has changed, but
6724                          * the guest may have inserted a non-APIC mapping into
6725                          * the TLB while the APIC access page was disabled.
6726                          */
6727                         kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
6728                 }
6729                 break;
6730         case LAPIC_MODE_X2APIC:
6731                 if (cpu_has_vmx_virtualize_x2apic_mode())
6732                         sec_exec_control |=
6733                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6734                 break;
6735         }
6736         secondary_exec_controls_set(vmx, sec_exec_control);
6737
6738         vmx_update_msr_bitmap_x2apic(vcpu);
6739 }
6740
6741 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
6742 {
6743         const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
6744         struct kvm *kvm = vcpu->kvm;
6745         struct kvm_memslots *slots = kvm_memslots(kvm);
6746         struct kvm_memory_slot *slot;
6747         unsigned long mmu_seq;
6748         kvm_pfn_t pfn;
6749
6750         /* Defer reload until vmcs01 is the current VMCS. */
6751         if (is_guest_mode(vcpu)) {
6752                 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
6753                 return;
6754         }
6755
6756         if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
6757             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6758                 return;
6759
6760         /*
6761          * Grab the memslot so that the hva lookup for the mmu_notifier retry
6762          * is guaranteed to use the same memslot as the pfn lookup, i.e. rely
6763          * on the pfn lookup's validation of the memslot to ensure a valid hva
6764          * is used for the retry check.
6765          */
6766         slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
6767         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
6768                 return;
6769
6770         /*
6771          * Ensure that the mmu_notifier sequence count is read before KVM
6772          * retrieves the pfn from the primary MMU.  Note, the memslot is
6773          * protected by SRCU, not the mmu_notifier.  Pairs with the smp_wmb()
6774          * in kvm_mmu_invalidate_end().
6775          */
6776         mmu_seq = kvm->mmu_invalidate_seq;
6777         smp_rmb();
6778
6779         /*
6780          * No need to retry if the memslot does not exist or is invalid.  KVM
6781          * controls the APIC-access page memslot, and only deletes the memslot
6782          * if APICv is permanently inhibited, i.e. the memslot won't reappear.
6783          */
6784         pfn = gfn_to_pfn_memslot(slot, gfn);
6785         if (is_error_noslot_pfn(pfn))
6786                 return;
6787
6788         read_lock(&vcpu->kvm->mmu_lock);
6789         if (mmu_invalidate_retry_hva(kvm, mmu_seq,
6790                                      gfn_to_hva_memslot(slot, gfn))) {
6791                 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6792                 read_unlock(&vcpu->kvm->mmu_lock);
6793                 goto out;
6794         }
6795
6796         vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
6797         read_unlock(&vcpu->kvm->mmu_lock);
6798
6799         vmx_flush_tlb_current(vcpu);
6800
6801 out:
6802         /*
6803          * Do not pin the APIC-access page in memory; the MMU notifier
6804          * will call us again if it is migrated or swapped out.
6805          */
6806         kvm_release_pfn_clean(pfn);
6807 }
6808
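/*
 * The guest interrupt status field holds RVI (requesting vector) in the low
 * byte and SVI (in-service vector) in the high byte.  The two helpers below
 * keep SVI and RVI in sync with KVM's view of the vAPIC.
 */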
6809 static void vmx_hwapic_isr_update(int max_isr)
6810 {
6811         u16 status;
6812         u8 old;
6813
6814         if (max_isr == -1)
6815                 max_isr = 0;
6816
6817         status = vmcs_read16(GUEST_INTR_STATUS);
6818         old = status >> 8;
6819         if (max_isr != old) {
6820                 status &= 0xff;
6821                 status |= max_isr << 8;
6822                 vmcs_write16(GUEST_INTR_STATUS, status);
6823         }
6824 }
6825
6826 static void vmx_set_rvi(int vector)
6827 {
6828         u16 status;
6829         u8 old;
6830
6831         if (vector == -1)
6832                 vector = 0;
6833
6834         status = vmcs_read16(GUEST_INTR_STATUS);
6835         old = (u8)status & 0xff;
6836         if ((u8)vector != old) {
6837                 status &= ~0xff;
6838                 status |= (u8)vector;
6839                 vmcs_write16(GUEST_INTR_STATUS, status);
6840         }
6841 }
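
/*
 * Illustrative sketch, not part of KVM: the 16-bit guest interrupt status
 * updated by vmx_hwapic_isr_update() and vmx_set_rvi() above packs RVI
 * (requesting virtual interrupt) in bits 7:0 and SVI (servicing virtual
 * interrupt) in bits 15:8; each helper rewrites just one of the two bytes.
 * The hypothetical helper below only demonstrates the packing and is not
 * used anywhere.
 */
static inline u16 example_pack_guest_intr_status(u8 rvi, u8 svi)
{
        return (u16)svi << 8 | rvi;
}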
6842
6843 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6844 {
6845         /*
6846          * When running L2, updating RVI is only relevant when
6847          * vmcs12 virtual-interrupt-delivery is enabled.
6848          * However, it can be enabled only when L1 also
6849          * intercepts external interrupts, and in that case
6850          * we should not update vmcs02 RVI but instead intercept
6851          * the interrupt. Therefore, do nothing when running L2.
6852          */
6853         if (!is_guest_mode(vcpu))
6854                 vmx_set_rvi(max_irr);
6855 }
6856
6857 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6858 {
6859         struct vcpu_vmx *vmx = to_vmx(vcpu);
6860         int max_irr;
6861         bool got_posted_interrupt;
6862
6863         if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
6864                 return -EIO;
6865
6866         if (pi_test_on(&vmx->pi_desc)) {
6867                 pi_clear_on(&vmx->pi_desc);
6868                 /*
6869                  * IOMMU can write to PID.ON, so the barrier matters even on UP.
6870                  * But on x86 this is just a compiler barrier anyway.
6871                  */
6872                 smp_mb__after_atomic();
6873                 got_posted_interrupt =
6874                         kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
6875         } else {
6876                 max_irr = kvm_lapic_find_highest_irr(vcpu);
6877                 got_posted_interrupt = false;
6878         }
6879
6880         /*
6881          * Newly recognized interrupts are injected via either virtual interrupt
6882          * delivery (RVI) or KVM_REQ_EVENT.  Virtual interrupt delivery is
6883          * disabled in two cases:
6884          *
6885          * 1) If L2 is running and the vCPU has a new pending interrupt.  If L1
6886          * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
6887          * VM-Exit to L1.  If L1 doesn't want to exit, the interrupt is injected
6888          * into L2, but KVM doesn't use virtual interrupt delivery to inject
6889          * interrupts into L2, and so KVM_REQ_EVENT is again needed.
6890          *
6891          * 2) If APICv is disabled for this vCPU, assigned devices may still
6892          * attempt to post interrupts.  The posted interrupt vector will cause
6893          * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
6894          */
6895         if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
6896                 vmx_set_rvi(max_irr);
6897         else if (got_posted_interrupt)
6898                 kvm_make_request(KVM_REQ_EVENT, vcpu);
6899
6900         return max_irr;
6901 }
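
/*
 * Illustrative sketch, not part of KVM: conceptually, syncing the PIR into
 * the IRR is a scan of a 256-bit bitmap (one bit per vector), which
 * kvm_apic_update_irr() performs above while also merging the bits into the
 * virtual APIC page.  The hypothetical helper below shows only the "find the
 * highest pending vector" half, assuming the PIR is viewed as eight 32-bit
 * words.
 */
static inline int example_highest_pir_vector(const u32 pir[8])
{
        int word;

        for (word = 7; word >= 0; word--) {
                if (pir[word])
                        return word * 32 + fls(pir[word]) - 1;
        }

        return -1;      /* no vector pending */
}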
6902
6903 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6904 {
6905         if (!kvm_vcpu_apicv_active(vcpu))
6906                 return;
6907
6908         vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6909         vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6910         vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6911         vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6912 }
6913
6914 static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
6915 {
6916         struct vcpu_vmx *vmx = to_vmx(vcpu);
6917
6918         pi_clear_on(&vmx->pi_desc);
6919         memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
6920 }
6921
6922 void vmx_do_interrupt_irqoff(unsigned long entry);
6923 void vmx_do_nmi_irqoff(void);
6924
6925 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
6926 {
6927         /*
6928          * Save xfd_err to guest_fpu before interrupts are enabled, so the
6929          * MSR value is not clobbered by host activity before the guest
6930          * has a chance to consume it.
6931          *
6932          * Do not blindly read xfd_err here, since this exception might
6933          * be caused by L1 interception on a platform which doesn't
6934          * support xfd at all.
6935          *
6936          * Do it conditionally upon guest_fpu::xfd. xfd_err matters
6937          * only when xfd contains a non-zero value.
6938          *
6939          * Queuing the exception is done in vmx_handle_exit. See the comment there.
6940          */
6941         if (vcpu->arch.guest_fpu.fpstate->xfd)
6942                 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
6943 }
6944
6945 static void handle_exception_irqoff(struct vcpu_vmx *vmx)
6946 {
6947         u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
6948
6949         /* if exit due to PF, check for async PF */
6950         if (is_page_fault(intr_info))
6951                 vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
6952         /* if exit due to NM, handle before interrupts are enabled */
6953         else if (is_nm_fault(intr_info))
6954                 handle_nm_fault_irqoff(&vmx->vcpu);
6955         /* Handle machine checks before interrupts are enabled */
6956         else if (is_machine_check(intr_info))
6957                 kvm_machine_check();
6958 }
6959
6960 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
6961 {
6962         u32 intr_info = vmx_get_intr_info(vcpu);
6963         unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
6964         gate_desc *desc = (gate_desc *)host_idt_base + vector;
6965
6966         if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
6967             "unexpected VM-Exit interrupt info: 0x%x", intr_info))
6968                 return;
6969
6970         kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
6971         vmx_do_interrupt_irqoff(gate_offset(desc));
6972         kvm_after_interrupt(vcpu);
6973
6974         vcpu->arch.at_instruction_boundary = true;
6975 }
6976
6977 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
6978 {
6979         struct vcpu_vmx *vmx = to_vmx(vcpu);
6980
6981         if (vmx->emulation_required)
6982                 return;
6983
6984         if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6985                 handle_external_interrupt_irqoff(vcpu);
6986         else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
6987                 handle_exception_irqoff(vmx);
6988 }
6989
6990 /*
6991  * The kvm parameter can be NULL (module initialization, or invocation before
6992  * VM creation). Be sure to check the kvm parameter before using it.
6993  */
6994 static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
6995 {
6996         switch (index) {
6997         case MSR_IA32_SMBASE:
6998                 if (!IS_ENABLED(CONFIG_KVM_SMM))
6999                         return false;
7000                 /*
7001                  * We cannot do SMM unless we can run the guest in big
7002                  * real mode.
7003                  */
7004                 return enable_unrestricted_guest || emulate_invalid_guest_state;
7005         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
7006                 return nested;
7007         case MSR_AMD64_VIRT_SPEC_CTRL:
7008         case MSR_AMD64_TSC_RATIO:
7009                 /* This is AMD only.  */
7010                 return false;
7011         default:
7012                 return true;
7013         }
7014 }
7015
7016 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
7017 {
7018         u32 exit_intr_info;
7019         bool unblock_nmi;
7020         u8 vector;
7021         bool idtv_info_valid;
7022
7023         idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7024
7025         if (enable_vnmi) {
7026                 if (vmx->loaded_vmcs->nmi_known_unmasked)
7027                         return;
7028
7029                 exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
7030                 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
7031                 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
7032                 /*
7033                  * SDM 3: 27.7.1.2 (September 2008)
7034                  * Re-set bit "block by NMI" before VM entry if vmexit caused by
7035                  * a guest IRET fault.
7036                  * SDM 3: 23.2.2 (September 2008)
7037                  * Bit 12 is undefined in any of the following cases:
7038                  *  If the VM exit sets the valid bit in the IDT-vectoring
7039                  *   information field.
7040                  *  If the VM exit is due to a double fault.
7041                  */
7042                 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
7043                     vector != DF_VECTOR && !idtv_info_valid)
7044                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7045                                       GUEST_INTR_STATE_NMI);
7046                 else
7047                         vmx->loaded_vmcs->nmi_known_unmasked =
7048                                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
7049                                   & GUEST_INTR_STATE_NMI);
7050         } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7051                 vmx->loaded_vmcs->vnmi_blocked_time +=
7052                         ktime_to_ns(ktime_sub(ktime_get(),
7053                                               vmx->loaded_vmcs->entry_time));
7054 }
7055
7056 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7057                                       u32 idt_vectoring_info,
7058                                       int instr_len_field,
7059                                       int error_code_field)
7060 {
7061         u8 vector;
7062         int type;
7063         bool idtv_info_valid;
7064
7065         idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7066
7067         vcpu->arch.nmi_injected = false;
7068         kvm_clear_exception_queue(vcpu);
7069         kvm_clear_interrupt_queue(vcpu);
7070
7071         if (!idtv_info_valid)
7072                 return;
7073
7074         kvm_make_request(KVM_REQ_EVENT, vcpu);
7075
7076         vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7077         type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
7078
7079         switch (type) {
7080         case INTR_TYPE_NMI_INTR:
7081                 vcpu->arch.nmi_injected = true;
7082                 /*
7083                  * SDM 3: 27.7.1.2 (September 2008)
7084                  * Clear bit "block by NMI" before VM entry if an NMI
7085                  * delivery faulted.
7086                  */
7087                 vmx_set_nmi_mask(vcpu, false);
7088                 break;
7089         case INTR_TYPE_SOFT_EXCEPTION:
7090                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7091                 fallthrough;
7092         case INTR_TYPE_HARD_EXCEPTION:
7093                 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
7094                         u32 err = vmcs_read32(error_code_field);
7095                         kvm_requeue_exception_e(vcpu, vector, err);
7096                 } else
7097                         kvm_requeue_exception(vcpu, vector);
7098                 break;
7099         case INTR_TYPE_SOFT_INTR:
7100                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7101                 fallthrough;
7102         case INTR_TYPE_EXT_INTR:
7103                 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
7104                 break;
7105         default:
7106                 break;
7107         }
7108 }
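
/*
 * Illustrative sketch, not part of KVM: the IDT-vectoring info consumed by
 * __vmx_complete_interrupts() is a single 32-bit field with the vector in
 * bits 7:0, the event type in bits 10:8, a "deliver error code" flag in
 * bit 11 and a valid bit in bit 31.  The hypothetical helper below merely
 * pulls the field apart with the same masks used above.
 */
struct example_idt_vectoring {
        bool valid;
        bool has_error_code;
        u32 type;
        u8 vector;
};

static inline void example_decode_idt_vectoring(u32 info,
                                                struct example_idt_vectoring *out)
{
        out->valid = info & VECTORING_INFO_VALID_MASK;
        out->has_error_code = info & VECTORING_INFO_DELIVER_CODE_MASK;
        /* Kept shifted, i.e. directly comparable against INTR_TYPE_*. */
        out->type = info & VECTORING_INFO_TYPE_MASK;
        out->vector = info & VECTORING_INFO_VECTOR_MASK;
}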
7109
7110 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
7111 {
7112         __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
7113                                   VM_EXIT_INSTRUCTION_LEN,
7114                                   IDT_VECTORING_ERROR_CODE);
7115 }
7116
7117 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
7118 {
7119         __vmx_complete_interrupts(vcpu,
7120                                   vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7121                                   VM_ENTRY_INSTRUCTION_LEN,
7122                                   VM_ENTRY_EXCEPTION_ERROR_CODE);
7123
7124         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
7125 }
7126
7127 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
7128 {
7129         int i, nr_msrs;
7130         struct perf_guest_switch_msr *msrs;
7131         struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
7132
7133         pmu->host_cross_mapped_mask = 0;
7134         if (pmu->pebs_enable & pmu->global_ctrl)
7135                 intel_pmu_cross_mapped_check(pmu);
7136
7137         /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
7138         msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
7139         if (!msrs)
7140                 return;
7141
7142         for (i = 0; i < nr_msrs; i++)
7143                 if (msrs[i].host == msrs[i].guest)
7144                         clear_atomic_switch_msr(vmx, msrs[i].msr);
7145                 else
7146                         add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
7147                                         msrs[i].host, false);
7148 }
7149
7150 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
7151 {
7152         struct vcpu_vmx *vmx = to_vmx(vcpu);
7153         u64 tscl;
7154         u32 delta_tsc;
7155
7156         if (vmx->req_immediate_exit) {
7157                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
7158                 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7159         } else if (vmx->hv_deadline_tsc != -1) {
7160                 tscl = rdtsc();
7161                 if (vmx->hv_deadline_tsc > tscl)
7162                         /* set_hv_timer ensures the delta fits in 32 bits */
7163                         delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
7164                                 cpu_preemption_timer_multi);
7165                 else
7166                         delta_tsc = 0;
7167
7168                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
7169                 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7170         } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
7171                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
7172                 vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7173         }
7174 }
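
/*
 * Illustrative sketch, not part of KVM: the VMX preemption timer counts down
 * at the TSC rate divided by 2^N, where N is the host-reported rate cached in
 * cpu_preemption_timer_multi and used above.  Programming it from a TSC
 * deadline is therefore "remaining TSC cycles, shifted right by N", clamped
 * to zero if the deadline has already passed.  Hypothetical helper:
 */
static inline u32 example_tsc_deadline_to_timer(u64 deadline_tsc, u64 now_tsc,
                                                int shift)
{
        if (deadline_tsc <= now_tsc)
                return 0;       /* already expired, request an immediate exit */

        return (u32)((deadline_tsc - now_tsc) >> shift);
}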
7175
7176 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
7177 {
7178         if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
7179                 vmx->loaded_vmcs->host_state.rsp = host_rsp;
7180                 vmcs_writel(HOST_RSP, host_rsp);
7181         }
7182 }
7183
7184 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
7185                                         unsigned int flags)
7186 {
7187         u64 hostval = this_cpu_read(x86_spec_ctrl_current);
7188
7189         if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
7190                 return;
7191
7192         if (flags & VMX_RUN_SAVE_SPEC_CTRL)
7193                 vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
7194
7195         /*
7196          * If the guest/host SPEC_CTRL values differ, restore the host value.
7197          *
7198          * For legacy IBRS, the IBRS bit always needs to be written after
7199          * transitioning from a less privileged predictor mode, regardless of
7200          * whether the guest/host values differ.
7201          */
7202         if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
7203             vmx->spec_ctrl != hostval)
7204                 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
7205
7206         barrier_nospec();
7207 }
7208
7209 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
7210 {
7211         switch (to_vmx(vcpu)->exit_reason.basic) {
7212         case EXIT_REASON_MSR_WRITE:
7213                 return handle_fastpath_set_msr_irqoff(vcpu);
7214         case EXIT_REASON_PREEMPTION_TIMER:
7215                 return handle_fastpath_preemption_timer(vcpu);
7216         default:
7217                 return EXIT_FASTPATH_NONE;
7218         }
7219 }
7220
7221 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
7222                                         unsigned int flags)
7223 {
7224         struct vcpu_vmx *vmx = to_vmx(vcpu);
7225
7226         guest_state_enter_irqoff();
7227
7228         /* L1D Flush includes CPU buffer clear to mitigate MDS */
7229         if (static_branch_unlikely(&vmx_l1d_should_flush))
7230                 vmx_l1d_flush(vcpu);
7231         else if (static_branch_unlikely(&mds_user_clear))
7232                 mds_clear_cpu_buffers();
7233         else if (static_branch_unlikely(&mmio_stale_data_clear) &&
7234                  kvm_arch_has_assigned_device(vcpu->kvm))
7235                 mds_clear_cpu_buffers();
7236
7237         vmx_disable_fb_clear(vmx);
7238
7239         if (vcpu->arch.cr2 != native_read_cr2())
7240                 native_write_cr2(vcpu->arch.cr2);
7241
7242         vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
7243                                    flags);
7244
7245         vcpu->arch.cr2 = native_read_cr2();
7246
7247         vmx_enable_fb_clear(vmx);
7248
7249         if (unlikely(vmx->fail))
7250                 vmx->exit_reason.full = 0xdead;
7251         else
7252                 vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
7253
7254         if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
7255             is_nmi(vmx_get_intr_info(vcpu))) {
7256                 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
7257                 vmx_do_nmi_irqoff();
7258                 kvm_after_interrupt(vcpu);
7259         }
7260
7261         guest_state_exit_irqoff();
7262 }
7263
7264 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
7265 {
7266         struct vcpu_vmx *vmx = to_vmx(vcpu);
7267         unsigned long cr3, cr4;
7268
7269         /* Record the guest's net vcpu time for enforced NMI injections. */
7270         if (unlikely(!enable_vnmi &&
7271                      vmx->loaded_vmcs->soft_vnmi_blocked))
7272                 vmx->loaded_vmcs->entry_time = ktime_get();
7273
7274         /*
7275          * Don't enter VMX if guest state is invalid; let the exit handler
7276          * start emulation until we arrive back at a valid state.  Synthesize a
7277          * consistency check VM-Exit due to invalid guest state and bail.
7278          */
7279         if (unlikely(vmx->emulation_required)) {
7280                 vmx->fail = 0;
7281
7282                 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
7283                 vmx->exit_reason.failed_vmentry = 1;
7284                 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
7285                 vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
7286                 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
7287                 vmx->exit_intr_info = 0;
7288                 return EXIT_FASTPATH_NONE;
7289         }
7290
7291         trace_kvm_entry(vcpu);
7292
7293         if (vmx->ple_window_dirty) {
7294                 vmx->ple_window_dirty = false;
7295                 vmcs_write32(PLE_WINDOW, vmx->ple_window);
7296         }
7297
7298         /*
7299          * We did this in prepare_switch_to_guest, because it needs to
7300          * be within srcu_read_lock.
7301          */
7302         WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
7303
7304         if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
7305                 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
7306         if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
7307                 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
7308         vcpu->arch.regs_dirty = 0;
7309
7310         /*
7311          * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
7312          * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
7313          * it switches back to the current->mm, which can occur in KVM context
7314          * when switching to a temporary mm to patch kernel code, e.g. if KVM
7315          * toggles a static key while handling a VM-Exit.
7316          */
7317         cr3 = __get_current_cr3_fast();
7318         if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7319                 vmcs_writel(HOST_CR3, cr3);
7320                 vmx->loaded_vmcs->host_state.cr3 = cr3;
7321         }
7322
7323         cr4 = cr4_read_shadow();
7324         if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7325                 vmcs_writel(HOST_CR4, cr4);
7326                 vmx->loaded_vmcs->host_state.cr4 = cr4;
7327         }
7328
7329         /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
7330         if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
7331                 set_debugreg(vcpu->arch.dr6, 6);
7332
7333         /* When single-stepping over STI and MOV SS, we must clear the
7334          * corresponding interruptibility bits in the guest state. Otherwise
7335          * vmentry fails as it then expects bit 14 (BS) in pending debug
7336          * exceptions to be set, but that's not correct for the guest debugging
7337          * case. */
7338         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7339                 vmx_set_interrupt_shadow(vcpu, 0);
7340
7341         kvm_load_guest_xsave_state(vcpu);
7342
7343         pt_guest_enter(vmx);
7344
7345         atomic_switch_perf_msrs(vmx);
7346         if (intel_pmu_lbr_is_enabled(vcpu))
7347                 vmx_passthrough_lbr_msrs(vcpu);
7348
7349         if (enable_preemption_timer)
7350                 vmx_update_hv_timer(vcpu);
7351
7352         kvm_wait_lapic_expire(vcpu);
7353
7354         /* The actual VMENTER/EXIT is in the .noinstr.text section. */
7355         vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
7356
7357         /* All fields are clean at this point */
7358         if (kvm_is_using_evmcs()) {
7359                 current_evmcs->hv_clean_fields |=
7360                         HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
7361
7362                 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
7363         }
7364
7365         /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7366         if (vmx->host_debugctlmsr)
7367                 update_debugctlmsr(vmx->host_debugctlmsr);
7368
7369 #ifndef CONFIG_X86_64
7370         /*
7371          * The sysexit path does not restore ds/es, so we must set them to
7372          * a reasonable value ourselves.
7373          *
7374          * We can't defer this to vmx_prepare_switch_to_host() since that
7375          * function may be executed in interrupt context, which saves and
7376          * restores segments around it, nullifying its effect.
7377          */
7378         loadsegment(ds, __USER_DS);
7379         loadsegment(es, __USER_DS);
7380 #endif
7381
7382         vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
7383
7384         pt_guest_exit(vmx);
7385
7386         kvm_load_host_xsave_state(vcpu);
7387
7388         if (is_guest_mode(vcpu)) {
7389                 /*
7390                  * Track VMLAUNCH/VMRESUME that have made it past guest state
7391                  * checking.
7392                  */
7393                 if (vmx->nested.nested_run_pending &&
7394                     !vmx->exit_reason.failed_vmentry)
7395                         ++vcpu->stat.nested_run;
7396
7397                 vmx->nested.nested_run_pending = 0;
7398         }
7399
7400         vmx->idt_vectoring_info = 0;
7401
7402         if (unlikely(vmx->fail))
7403                 return EXIT_FASTPATH_NONE;
7404
7405         if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
7406                 kvm_machine_check();
7407
7408         if (likely(!vmx->exit_reason.failed_vmentry))
7409                 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
7410
7411         trace_kvm_exit(vcpu, KVM_ISA_VMX);
7412
7413         if (unlikely(vmx->exit_reason.failed_vmentry))
7414                 return EXIT_FASTPATH_NONE;
7415
7416         vmx->loaded_vmcs->launched = 1;
7417
7418         vmx_recover_nmi_blocking(vmx);
7419         vmx_complete_interrupts(vmx);
7420
7421         if (is_guest_mode(vcpu))
7422                 return EXIT_FASTPATH_NONE;
7423
7424         return vmx_exit_handlers_fastpath(vcpu);
7425 }
7426
7427 static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
7428 {
7429         struct vcpu_vmx *vmx = to_vmx(vcpu);
7430
7431         if (enable_pml)
7432                 vmx_destroy_pml_buffer(vmx);
7433         free_vpid(vmx->vpid);
7434         nested_vmx_free_vcpu(vcpu);
7435         free_loaded_vmcs(vmx->loaded_vmcs);
7436 }
7437
7438 static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
7439 {
7440         struct vmx_uret_msr *tsx_ctrl;
7441         struct vcpu_vmx *vmx;
7442         int i, err;
7443
7444         BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
7445         vmx = to_vmx(vcpu);
7446
7447         INIT_LIST_HEAD(&vmx->pi_wakeup_list);
7448
7449         err = -ENOMEM;
7450
7451         vmx->vpid = allocate_vpid();
7452
7453         /*
7454          * If PML is turned on, a failure to enable PML simply results in
7455          * failure to create the vCPU.  This lets us simplify the PML logic,
7456          * e.g. we never have to deal with PML being enabled on only some of
7457          * a guest's vCPUs.
7458          */
7459         if (enable_pml) {
7460                 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7461                 if (!vmx->pml_pg)
7462                         goto free_vpid;
7463         }
7464
7465         for (i = 0; i < kvm_nr_uret_msrs; ++i)
7466                 vmx->guest_uret_msrs[i].mask = -1ull;
7467         if (boot_cpu_has(X86_FEATURE_RTM)) {
7468                 /*
7469                  * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
7470                  * Keep the host value unchanged to avoid changing CPUID bits
7471                  * under the host kernel's feet.
7472                  */
7473                 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7474                 if (tsx_ctrl)
7475                         tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7476         }
7477
7478         err = alloc_loaded_vmcs(&vmx->vmcs01);
7479         if (err < 0)
7480                 goto free_pml;
7481
7482         /*
7483          * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7484          * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
7485          * feature only for vmcs01; KVM currently isn't equipped to realize any
7486          * performance benefits from enabling it for vmcs02.
7487          */
7488         if (kvm_is_using_evmcs() &&
7489             (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
7490                 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
7491
7492                 evmcs->hv_enlightenments_control.msr_bitmap = 1;
7493         }
7494
7495         /* The MSR bitmap starts with all ones */
7496         bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7497         bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7498
7499         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
7500 #ifdef CONFIG_X86_64
7501         vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
7502         vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
7503         vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
7504 #endif
7505         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
7506         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
7507         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
7508         if (kvm_cstate_in_guest(vcpu->kvm)) {
7509                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
7510                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
7511                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
7512                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
7513         }
7514
7515         vmx->loaded_vmcs = &vmx->vmcs01;
7516
7517         if (cpu_need_virtualize_apic_accesses(vcpu)) {
7518                 err = kvm_alloc_apic_access_page(vcpu->kvm);
7519                 if (err)
7520                         goto free_vmcs;
7521         }
7522
7523         if (enable_ept && !enable_unrestricted_guest) {
7524                 err = init_rmode_identity_map(vcpu->kvm);
7525                 if (err)
7526                         goto free_vmcs;
7527         }
7528
7529         if (vmx_can_use_ipiv(vcpu))
7530                 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
7531                            __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
7532
7533         return 0;
7534
7535 free_vmcs:
7536         free_loaded_vmcs(vmx->loaded_vmcs);
7537 free_pml:
7538         vmx_destroy_pml_buffer(vmx);
7539 free_vpid:
7540         free_vpid(vmx->vpid);
7541         return err;
7542 }
7543
7544 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7545 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7546
7547 static int vmx_vm_init(struct kvm *kvm)
7548 {
7549         if (!ple_gap)
7550                 kvm->arch.pause_in_guest = true;
7551
7552         if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7553                 switch (l1tf_mitigation) {
7554                 case L1TF_MITIGATION_OFF:
7555                 case L1TF_MITIGATION_FLUSH_NOWARN:
7556                         /* 'I explicitly don't care' is set */
7557                         break;
7558                 case L1TF_MITIGATION_FLUSH:
7559                 case L1TF_MITIGATION_FLUSH_NOSMT:
7560                 case L1TF_MITIGATION_FULL:
7561                         /*
7562                          * Warn upon starting the first VM in a potentially
7563                          * insecure environment.
7564                          */
7565                         if (sched_smt_active())
7566                                 pr_warn_once(L1TF_MSG_SMT);
7567                         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7568                                 pr_warn_once(L1TF_MSG_L1D);
7569                         break;
7570                 case L1TF_MITIGATION_FULL_FORCE:
7571                         /* Flush is enforced */
7572                         break;
7573                 }
7574         }
7575         return 0;
7576 }
7577
7578 static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7579 {
7580         u8 cache;
7581
7582         /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
7583          * memory aliases with conflicting memory types and sometimes MCEs.
7584          * We have to be careful as to what is honored and when.
7585          *
7586          * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
7587          * UC.  The effective memory type is UC or WC depending on guest PAT.
7588          * This was historically the source of MCEs and we want to be
7589          * conservative.
7590          *
7591          * When there is no need to deal with noncoherent DMA (e.g., no VT-d
7592          * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
7593          * EPT memory type is set to WB.  The effective memory type is forced
7594          * WB.
7595          *
7596          * Otherwise, we trust the guest.  Guest CD/MTRR/PAT are all honored.  The
7597          * EPT memory type is used to emulate guest CD/MTRR.
7598          */
7599
7600         if (is_mmio)
7601                 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
7602
7603         if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
7604                 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7605
7606         if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
7607                 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
7608                         cache = MTRR_TYPE_WRBACK;
7609                 else
7610                         cache = MTRR_TYPE_UNCACHABLE;
7611
7612                 return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7613         }
7614
7615         return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
7616 }
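
/*
 * Illustrative sketch, not part of KVM: the value built by vmx_get_mt_mask()
 * lands in the EPT leaf entry, with the memory type placed at
 * VMX_EPT_MT_EPTE_SHIFT and the "ignore PAT" bit (VMX_EPT_IPAT_BIT) forcing
 * that type regardless of the guest's PAT.  Hypothetical helper showing how
 * such a value is composed:
 */
static inline u8 example_ept_memtype(u8 mtrr_type, bool ignore_guest_pat)
{
        u8 val = mtrr_type << VMX_EPT_MT_EPTE_SHIFT;

        if (ignore_guest_pat)
                val |= VMX_EPT_IPAT_BIT;

        return val;
}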
7617
7618 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
7619 {
7620         /*
7621          * These bits in the secondary execution controls field
7622          * are dynamic; the others are mostly based on the hypervisor
7623          * architecture and the guest's CPUID.  Do not touch the
7624          * dynamic bits.
7625          */
7626         u32 mask =
7627                 SECONDARY_EXEC_SHADOW_VMCS |
7628                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7629                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7630                 SECONDARY_EXEC_DESC;
7631
7632         u32 cur_ctl = secondary_exec_controls_get(vmx);
7633
7634         secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
7635 }
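
/*
 * Illustrative sketch, not part of KVM: the update above is the classic
 * "merge under a mask" idiom -- keep the masked (dynamic) bits from the
 * current value and take everything else from the new value.  Hypothetical
 * helper spelling out the bit manipulation:
 */
static inline u32 example_merge_under_mask(u32 cur, u32 new_val, u32 keep_mask)
{
        return (new_val & ~keep_mask) | (cur & keep_mask);
}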
7636
7637 /*
7638  * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7639  * (indicating "allowed-1") if they are supported in the guest's CPUID.
7640  */
7641 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7642 {
7643         struct vcpu_vmx *vmx = to_vmx(vcpu);
7644         struct kvm_cpuid_entry2 *entry;
7645
7646         vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7647         vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
7648
7649 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {            \
7650         if (entry && (entry->_reg & (_cpuid_mask)))                     \
7651                 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);     \
7652 } while (0)
7653
7654         entry = kvm_find_cpuid_entry(vcpu, 0x1);
7655         cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
7656         cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
7657         cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
7658         cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
7659         cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
7660         cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
7661         cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
7662         cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
7663         cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
7664         cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7665         cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
7666         cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
7667         cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
7668         cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));
7669
7670         entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
7671         cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
7672         cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
7673         cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
7674         cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
7675         cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
7676         cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
7677
7678 #undef cr4_fixed1_update
7679 }
7680
7681 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7682 {
7683         struct vcpu_vmx *vmx = to_vmx(vcpu);
7684         struct kvm_cpuid_entry2 *best = NULL;
7685         int i;
7686
7687         for (i = 0; i < PT_CPUID_LEAVES; i++) {
7688                 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
7689                 if (!best)
7690                         return;
7691                 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7692                 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7693                 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7694                 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7695         }
7696
7697         /* Get the number of configurable Address Ranges for filtering */
7698         vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
7699                                                 PT_CAP_num_address_ranges);
7700
7701         /* Initialize and clear the no dependency bits */
7702         vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7703                         RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
7704                         RTIT_CTL_BRANCH_EN);
7705
7706         /*
7707          * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
7708          * setting it will inject a #GP.
7709          */
7710         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7711                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7712
7713         /*
7714          * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7715          * PSBFreq can be set
7716          */
7717         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7718                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7719                                 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7720
7721         /*
7722          * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
7723          */
7724         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7725                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7726                                               RTIT_CTL_MTC_RANGE);
7727
7728         /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7729         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7730                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7731                                                         RTIT_CTL_PTW_EN);
7732
7733         /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7734         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7735                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7736
7737         /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7738         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7739                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7740
7741         /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
7742         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7743                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7744
7745         /* Unmask the address range configuration area */
7746         for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
7747                 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
7748 }
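
/*
 * Illustrative sketch, not part of KVM: each supported trace address range
 * owns a 4-bit ADDRn_CFG field in RTIT_CTL, with range n occupying bits
 * [32 + n*4 + 3 : 32 + n*4]; the loop above simply clears those nibbles from
 * the reserved-bit mask.  Hypothetical helper extracting one such field:
 */
static inline u32 example_rtit_addr_cfg(u64 rtit_ctl, int range)
{
        return (rtit_ctl >> (32 + range * 4)) & 0xf;
}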
7749
7750 static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
7751 {
7752         struct vcpu_vmx *vmx = to_vmx(vcpu);
7753
7754         /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
7755         vcpu->arch.xsaves_enabled = false;
7756
7757         vmx_setup_uret_msrs(vmx);
7758
7759         if (cpu_has_secondary_exec_ctrls())
7760                 vmcs_set_secondary_exec_control(vmx,
7761                                                 vmx_secondary_exec_control(vmx));
7762
7763         if (nested_vmx_allowed(vcpu))
7764                 vmx->msr_ia32_feature_control_valid_bits |=
7765                         FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7766                         FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
7767         else
7768                 vmx->msr_ia32_feature_control_valid_bits &=
7769                         ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7770                           FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
7771
7772         if (nested_vmx_allowed(vcpu))
7773                 nested_vmx_cr_fixed1_bits_update(vcpu);
7774
7775         if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7776                         guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7777                 update_intel_pt_cfg(vcpu);
7778
7779         if (boot_cpu_has(X86_FEATURE_RTM)) {
7780                 struct vmx_uret_msr *msr;
7781                 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7782                 if (msr) {
7783                         bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7784                         vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
7785                 }
7786         }
7787
7788         if (kvm_cpu_cap_has(X86_FEATURE_XFD))
7789                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
7790                                           !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
7791
7792         if (boot_cpu_has(X86_FEATURE_IBPB))
7793                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
7794                                           !guest_has_pred_cmd_msr(vcpu));
7795
7796         if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
7797                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
7798                                           !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
7799
7800         set_cr4_guest_host_mask(vmx);
7801
7802         vmx_write_encls_bitmap(vcpu, NULL);
7803         if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
7804                 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
7805         else
7806                 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
7807
7808         if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
7809                 vmx->msr_ia32_feature_control_valid_bits |=
7810                         FEAT_CTL_SGX_LC_ENABLED;
7811         else
7812                 vmx->msr_ia32_feature_control_valid_bits &=
7813                         ~FEAT_CTL_SGX_LC_ENABLED;
7814
7815         /* Refresh #PF interception to account for MAXPHYADDR changes. */
7816         vmx_update_exception_bitmap(vcpu);
7817 }
7818
7819 static u64 vmx_get_perf_capabilities(void)
7820 {
7821         u64 perf_cap = PMU_CAP_FW_WRITES;
7822         struct x86_pmu_lbr lbr;
7823         u64 host_perf_cap = 0;
7824
7825         if (!enable_pmu)
7826                 return 0;
7827
7828         if (boot_cpu_has(X86_FEATURE_PDCM))
7829                 rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
7830
7831         if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
7832                 x86_perf_get_lbr(&lbr);
7833                 if (lbr.nr)
7834                         perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
7835         }
7836
7837         if (vmx_pebs_supported()) {
7838                 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
7839                 if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
7840                         perf_cap &= ~PERF_CAP_PEBS_BASELINE;
7841         }
7842
7843         return perf_cap;
7844 }
7845
7846 static __init void vmx_set_cpu_caps(void)
7847 {
7848         kvm_set_cpu_caps();
7849
7850         /* CPUID 0x1 */
7851         if (nested)
7852                 kvm_cpu_cap_set(X86_FEATURE_VMX);
7853
7854         /* CPUID 0x7 */
7855         if (kvm_mpx_supported())
7856                 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
7857         if (!cpu_has_vmx_invpcid())
7858                 kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
7859         if (vmx_pt_mode_is_host_guest())
7860                 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
7861         if (vmx_pebs_supported()) {
7862                 kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
7863                 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
7864         }
7865
7866         if (!enable_pmu)
7867                 kvm_cpu_cap_clear(X86_FEATURE_PDCM);
7868         kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
7869
7870         if (!enable_sgx) {
7871                 kvm_cpu_cap_clear(X86_FEATURE_SGX);
7872                 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
7873                 kvm_cpu_cap_clear(X86_FEATURE_SGX1);
7874                 kvm_cpu_cap_clear(X86_FEATURE_SGX2);
7875         }
7876
7877         if (vmx_umip_emulated())
7878                 kvm_cpu_cap_set(X86_FEATURE_UMIP);
7879
7880         /* CPUID 0xD.1 */
7881         kvm_caps.supported_xss = 0;
7882         if (!cpu_has_vmx_xsaves())
7883                 kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
7884
7885         /* CPUID 0x80000001 and 0x7 (RDPID) */
7886         if (!cpu_has_vmx_rdtscp()) {
7887                 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
7888                 kvm_cpu_cap_clear(X86_FEATURE_RDPID);
7889         }
7890
7891         if (cpu_has_vmx_waitpkg())
7892                 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
7893 }
7894
7895 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
7896 {
7897         to_vmx(vcpu)->req_immediate_exit = true;
7898 }
7899
7900 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
7901                                   struct x86_instruction_info *info)
7902 {
7903         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7904         unsigned short port;
7905         bool intercept;
7906         int size;
7907
7908         if (info->intercept == x86_intercept_in ||
7909             info->intercept == x86_intercept_ins) {
7910                 port = info->src_val;
7911                 size = info->dst_bytes;
7912         } else {
7913                 port = info->dst_val;
7914                 size = info->src_bytes;
7915         }
7916
7917         /*
7918          * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
7919          * VM-exits depend on the 'unconditional IO exiting' VM-execution
7920          * control.
7921          *
7922          * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
7923          */
7924         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7925                 intercept = nested_cpu_has(vmcs12,
7926                                            CPU_BASED_UNCOND_IO_EXITING);
7927         else
7928                 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
7929
7930         /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
7931         return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
7932 }
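
/*
 * Illustrative sketch, not part of KVM: architecturally, the VMX I/O bitmaps
 * are two 4-KiB pages with one bit per port -- bitmap A covers ports
 * 0x0000-0x7fff and bitmap B covers 0x8000-0xffff -- and a multi-byte access
 * is intercepted if the bit of *any* touched port is set.
 * nested_vmx_check_io_bitmaps() performs this check against the vmcs12 bitmap
 * pages; the hypothetical helper below shows the same logic against two plain
 * in-memory bitmaps.
 */
static inline bool example_io_bitmap_intercept(const u8 *bitmap_a,
                                               const u8 *bitmap_b,
                                               u16 port, int size)
{
        int i;

        for (i = 0; i < size; i++) {
                u16 p = port + i;
                const u8 *bitmap = p < 0x8000 ? bitmap_a : bitmap_b;
                u16 bit = p & 0x7fff;

                if (bitmap[bit / 8] & (1 << (bit % 8)))
                        return true;
        }

        return false;
}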
7933
7934 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7935                                struct x86_instruction_info *info,
7936                                enum x86_intercept_stage stage,
7937                                struct x86_exception *exception)
7938 {
7939         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7940
7941         switch (info->intercept) {
7942         /*
7943          * RDPID causes #UD if disabled through secondary execution controls.
7944          * Because it is marked as EmulateOnUD, we need to intercept it here.
7945          * Note, RDPID is hidden behind ENABLE_RDTSCP.
7946          */
7947         case x86_intercept_rdpid:
7948                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
7949                         exception->vector = UD_VECTOR;
7950                         exception->error_code_valid = false;
7951                         return X86EMUL_PROPAGATE_FAULT;
7952                 }
7953                 break;
7954
7955         case x86_intercept_in:
7956         case x86_intercept_ins:
7957         case x86_intercept_out:
7958         case x86_intercept_outs:
7959                 return vmx_check_intercept_io(vcpu, info);
7960
7961         case x86_intercept_lgdt:
7962         case x86_intercept_lidt:
7963         case x86_intercept_lldt:
7964         case x86_intercept_ltr:
7965         case x86_intercept_sgdt:
7966         case x86_intercept_sidt:
7967         case x86_intercept_sldt:
7968         case x86_intercept_str:
7969                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
7970                         return X86EMUL_CONTINUE;
7971
7972                 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
7973                 break;
7974
7975         case x86_intercept_pause:
7976                 /*
7977                  * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
7978                  * with vanilla NOPs in the emulator.  Apply the interception
7979                  * check only to actual PAUSE instructions.  Don't check
7980                  * PAUSE-loop-exiting, software can't expect a given PAUSE to
7981                  * PAUSE-loop-exiting; software can't expect a given PAUSE to
7982                  * the PAUSE.
7983                  */
7984                 if ((info->rep_prefix != REPE_PREFIX) ||
7985                     !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING))
7986                         return X86EMUL_CONTINUE;
7987
7988                 break;
7989
7990         /* TODO: check more intercepts... */
7991         default:
7992                 break;
7993         }
7994
7995         return X86EMUL_UNHANDLEABLE;
7996 }
7997
7998 #ifdef CONFIG_X86_64
7999 /* (a << shift) / divisor; returns 1 on overflow, otherwise 0 */
8000 static inline int u64_shl_div_u64(u64 a, unsigned int shift,
8001                                   u64 divisor, u64 *result)
8002 {
8003         u64 low = a << shift, high = a >> (64 - shift);
8004
8005         /* To avoid the overflow on divq */
8006         if (high >= divisor)
8007                 return 1;
8008
8009         /* Low holds the result, high holds the remainder, which is discarded */
8010         asm("divq %2\n\t" : "=a" (low), "=d" (high) :
8011             "rm" (divisor), "0" (low), "1" (high));
8012         *result = low;
8013
8014         return 0;
8015 }
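
/*
 * Illustrative sketch, not part of KVM: on 64-bit builds the same
 * "(a << shift) / divisor with overflow check" can be written without inline
 * assembly, assuming a compiler that provides unsigned __int128 (gcc and
 * clang do on x86-64).  Hypothetical alternative; like the helper above, it
 * expects a non-zero divisor.
 */
static inline int example_u64_shl_div_u64(u64 a, unsigned int shift,
                                          u64 divisor, u64 *result)
{
        unsigned __int128 quotient = ((unsigned __int128)a << shift) / divisor;

        if (quotient > U64_MAX)
                return 1;       /* the result would not fit in 64 bits */

        *result = (u64)quotient;
        return 0;
}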
8016
8017 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
8018                             bool *expired)
8019 {
8020         struct vcpu_vmx *vmx;
8021         u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
8022         struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
8023
8024         vmx = to_vmx(vcpu);
8025         tscl = rdtsc();
8026         guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
8027         delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
8028         lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
8029                                                     ktimer->timer_advance_ns);
8030
8031         if (delta_tsc > lapic_timer_advance_cycles)
8032                 delta_tsc -= lapic_timer_advance_cycles;
8033         else
8034                 delta_tsc = 0;
8035
8036         /* Convert to host delta tsc if tsc scaling is enabled */
8037         if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
8038             delta_tsc && u64_shl_div_u64(delta_tsc,
8039                                 kvm_caps.tsc_scaling_ratio_frac_bits,
8040                                 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
8041                 return -ERANGE;
8042
8043         /*
8044          * If the delta tsc can't fit in 32 bits after the multiplier shift,
8045          * we can't use the preemption timer.
8046          * It's possible that it fits on later vmentries, but checking
8047          * on every vmentry is costly, so we just use an hrtimer.
8048          */
8049         if (delta_tsc >> (cpu_preemption_timer_multi + 32))
8050                 return -ERANGE;
8051
8052         vmx->hv_deadline_tsc = tscl + delta_tsc;
8053         *expired = !delta_tsc;
8054         return 0;
8055 }
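
/*
 * Illustrative sketch, not part of KVM: the -ERANGE check above rejects any
 * delta that would overflow the 32-bit timer once it is shifted right by the
 * preemption timer multiplier N.  For example, with a hypothetical N = 5 and
 * a 3 GHz TSC, the timer can cover at most 2^(32+5) TSC cycles, i.e. roughly
 * 45 seconds; longer deadlines fall back to the hrtimer.  Hypothetical helper
 * expressing the same range check:
 */
static inline bool example_fits_preemption_timer(u64 delta_tsc, int multi)
{
        return (delta_tsc >> (multi + 32)) == 0;
}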
8056
8057 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8058 {
8059         to_vmx(vcpu)->hv_deadline_tsc = -1;
8060 }
8061 #endif
8062
8063 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
8064 {
8065         if (!kvm_pause_in_guest(vcpu->kvm))
8066                 shrink_ple_window(vcpu);
8067 }
8068
8069 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
8070 {
8071         struct vcpu_vmx *vmx = to_vmx(vcpu);
8072
8073         if (WARN_ON_ONCE(!enable_pml))
8074                 return;
8075
8076         if (is_guest_mode(vcpu)) {
8077                 vmx->nested.update_vmcs01_cpu_dirty_logging = true;
8078                 return;
8079         }
8080
8081         /*
8082          * Note, nr_memslots_dirty_logging can be changed concurrently with this
8083          * code, but in that case another update request will be made and so
8084          * the guest will never run with a stale PML value.
8085          */
8086         if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
8087                 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8088         else
8089                 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8090 }
8091
8092 static void vmx_setup_mce(struct kvm_vcpu *vcpu)
8093 {
8094         if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8095                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
8096                         FEAT_CTL_LMCE_ENABLED;
8097         else
8098                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
8099                         ~FEAT_CTL_LMCE_ENABLED;
8100 }
8101
8102 #ifdef CONFIG_KVM_SMM
8103 static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
8104 {
8105         /* we need a nested vmexit to enter SMM, postpone if run is pending */
8106         if (to_vmx(vcpu)->nested.nested_run_pending)
8107                 return -EBUSY;
8108         return !is_smm(vcpu);
8109 }
8110
8111 static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
8112 {
8113         struct vcpu_vmx *vmx = to_vmx(vcpu);
8114
8115         /*
8116          * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
8117          * SMI and RSM.  Using the common VM-Exit + VM-Enter routines is wrong;
8118          * SMI and RSM only modify state that is saved and restored via SMRAM.
8119          * E.g. most MSRs are left untouched, but many are modified by VM-Exit
8120          * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
8121          */
8122         vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8123         if (vmx->nested.smm.guest_mode)
8124                 nested_vmx_vmexit(vcpu, -1, 0, 0);
8125
8126         vmx->nested.smm.vmxon = vmx->nested.vmxon;
8127         vmx->nested.vmxon = false;
8128         vmx_clear_hlt(vcpu);
8129         return 0;
8130 }
8131
8132 static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
8133 {
8134         struct vcpu_vmx *vmx = to_vmx(vcpu);
8135         int ret;
8136
8137         if (vmx->nested.smm.vmxon) {
8138                 vmx->nested.vmxon = true;
8139                 vmx->nested.smm.vmxon = false;
8140         }
8141
8142         if (vmx->nested.smm.guest_mode) {
8143                 ret = nested_vmx_enter_non_root_mode(vcpu, false);
8144                 if (ret)
8145                         return ret;
8146
8147                 vmx->nested.nested_run_pending = 1;
8148                 vmx->nested.smm.guest_mode = false;
8149         }
8150         return 0;
8151 }
8152
8153 static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
8154 {
8155         /* RSM will cause a vmexit anyway.  */
8156 }
8157 #endif
8158
8159 static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8160 {
8161         return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
8162 }
8163
8164 static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
8165 {
8166         if (is_guest_mode(vcpu)) {
8167                 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
8168
8169                 if (hrtimer_try_to_cancel(timer) == 1)
8170                         hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
8171         }
8172 }
8173
8174 static void vmx_hardware_unsetup(void)
8175 {
8176         kvm_set_posted_intr_wakeup_handler(NULL);
8177
8178         if (nested)
8179                 nested_vmx_hardware_unsetup();
8180
8181         free_kvm_area();
8182 }
8183
8184 #define VMX_REQUIRED_APICV_INHIBITS                     \
8185 (                                                       \
8186         BIT(APICV_INHIBIT_REASON_DISABLE) |             \
8187         BIT(APICV_INHIBIT_REASON_ABSENT) |              \
8188         BIT(APICV_INHIBIT_REASON_HYPERV) |              \
8189         BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |            \
8190         BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
8191         BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |    \
8192         BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED)    \
8193 )
8194
8195 static void vmx_vm_destroy(struct kvm *kvm)
8196 {
8197         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
8198
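             /* Free the PID (posted-interrupt descriptor) table used for IPI virtualization. */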
8199         free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
8200 }
8201
8202 static struct kvm_x86_ops vmx_x86_ops __initdata = {
8203         .name = KBUILD_MODNAME,
8204
8205         .check_processor_compatibility = vmx_check_processor_compat,
8206
8207         .hardware_unsetup = vmx_hardware_unsetup,
8208
8209         .hardware_enable = vmx_hardware_enable,
8210         .hardware_disable = vmx_hardware_disable,
8211         .has_emulated_msr = vmx_has_emulated_msr,
8212
8213         .vm_size = sizeof(struct kvm_vmx),
8214         .vm_init = vmx_vm_init,
8215         .vm_destroy = vmx_vm_destroy,
8216
8217         .vcpu_precreate = vmx_vcpu_precreate,
8218         .vcpu_create = vmx_vcpu_create,
8219         .vcpu_free = vmx_vcpu_free,
8220         .vcpu_reset = vmx_vcpu_reset,
8221
8222         .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
8223         .vcpu_load = vmx_vcpu_load,
8224         .vcpu_put = vmx_vcpu_put,
8225
8226         .update_exception_bitmap = vmx_update_exception_bitmap,
8227         .get_msr_feature = vmx_get_msr_feature,
8228         .get_msr = vmx_get_msr,
8229         .set_msr = vmx_set_msr,
8230         .get_segment_base = vmx_get_segment_base,
8231         .get_segment = vmx_get_segment,
8232         .set_segment = vmx_set_segment,
8233         .get_cpl = vmx_get_cpl,
8234         .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
8235         .is_valid_cr0 = vmx_is_valid_cr0,
8236         .set_cr0 = vmx_set_cr0,
8237         .is_valid_cr4 = vmx_is_valid_cr4,
8238         .set_cr4 = vmx_set_cr4,
8239         .set_efer = vmx_set_efer,
8240         .get_idt = vmx_get_idt,
8241         .set_idt = vmx_set_idt,
8242         .get_gdt = vmx_get_gdt,
8243         .set_gdt = vmx_set_gdt,
8244         .set_dr7 = vmx_set_dr7,
8245         .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
8246         .cache_reg = vmx_cache_reg,
8247         .get_rflags = vmx_get_rflags,
8248         .set_rflags = vmx_set_rflags,
8249         .get_if_flag = vmx_get_if_flag,
8250
8251         .flush_tlb_all = vmx_flush_tlb_all,
8252         .flush_tlb_current = vmx_flush_tlb_current,
8253         .flush_tlb_gva = vmx_flush_tlb_gva,
8254         .flush_tlb_guest = vmx_flush_tlb_guest,
8255
8256         .vcpu_pre_run = vmx_vcpu_pre_run,
8257         .vcpu_run = vmx_vcpu_run,
8258         .handle_exit = vmx_handle_exit,
8259         .skip_emulated_instruction = vmx_skip_emulated_instruction,
8260         .update_emulated_instruction = vmx_update_emulated_instruction,
8261         .set_interrupt_shadow = vmx_set_interrupt_shadow,
8262         .get_interrupt_shadow = vmx_get_interrupt_shadow,
8263         .patch_hypercall = vmx_patch_hypercall,
8264         .inject_irq = vmx_inject_irq,
8265         .inject_nmi = vmx_inject_nmi,
8266         .inject_exception = vmx_inject_exception,
8267         .cancel_injection = vmx_cancel_injection,
8268         .interrupt_allowed = vmx_interrupt_allowed,
8269         .nmi_allowed = vmx_nmi_allowed,
8270         .get_nmi_mask = vmx_get_nmi_mask,
8271         .set_nmi_mask = vmx_set_nmi_mask,
8272         .enable_nmi_window = vmx_enable_nmi_window,
8273         .enable_irq_window = vmx_enable_irq_window,
8274         .update_cr8_intercept = vmx_update_cr8_intercept,
8275         .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
8276         .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
8277         .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
8278         .load_eoi_exitmap = vmx_load_eoi_exitmap,
8279         .apicv_post_state_restore = vmx_apicv_post_state_restore,
8280         .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
8281         .hwapic_irr_update = vmx_hwapic_irr_update,
8282         .hwapic_isr_update = vmx_hwapic_isr_update,
8283         .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
8284         .sync_pir_to_irr = vmx_sync_pir_to_irr,
8285         .deliver_interrupt = vmx_deliver_interrupt,
8286         .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
8287
8288         .set_tss_addr = vmx_set_tss_addr,
8289         .set_identity_map_addr = vmx_set_identity_map_addr,
8290         .get_mt_mask = vmx_get_mt_mask,
8291
8292         .get_exit_info = vmx_get_exit_info,
8293
8294         .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
8295
8296         .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
8297
8298         .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
8299         .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
8300         .write_tsc_offset = vmx_write_tsc_offset,
8301         .write_tsc_multiplier = vmx_write_tsc_multiplier,
8302
8303         .load_mmu_pgd = vmx_load_mmu_pgd,
8304
8305         .check_intercept = vmx_check_intercept,
8306         .handle_exit_irqoff = vmx_handle_exit_irqoff,
8307
8308         .request_immediate_exit = vmx_request_immediate_exit,
8309
8310         .sched_in = vmx_sched_in,
8311
8312         .cpu_dirty_log_size = PML_ENTITY_NUM,
8313         .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
8314
8315         .nested_ops = &vmx_nested_ops,
8316
8317         .pi_update_irte = vmx_pi_update_irte,
8318         .pi_start_assignment = vmx_pi_start_assignment,
8319
8320 #ifdef CONFIG_X86_64
8321         .set_hv_timer = vmx_set_hv_timer,
8322         .cancel_hv_timer = vmx_cancel_hv_timer,
8323 #endif
8324
8325         .setup_mce = vmx_setup_mce,
8326
8327 #ifdef CONFIG_KVM_SMM
8328         .smi_allowed = vmx_smi_allowed,
8329         .enter_smm = vmx_enter_smm,
8330         .leave_smm = vmx_leave_smm,
8331         .enable_smi_window = vmx_enable_smi_window,
8332 #endif
8333
8334         .can_emulate_instruction = vmx_can_emulate_instruction,
8335         .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
8336         .migrate_timers = vmx_migrate_timers,
8337
8338         .msr_filter_changed = vmx_msr_filter_changed,
8339         .complete_emulated_msr = kvm_complete_insn_gp,
8340
8341         .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
8342 };
8343
8344 static unsigned int vmx_handle_intel_pt_intr(void)
8345 {
8346         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
8347
8348         /* '0' on failure so that the !PT case can use a RET0 static call. */
8349         if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
8350                 return 0;
8351
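             /*
              * The ToPA PMI fired while the guest owned Intel PT; forward it by
              * injecting a PMI and latching the ToPA PMI status bit in the
              * guest's IA32_PERF_GLOBAL_STATUS.
              */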
8352         kvm_make_request(KVM_REQ_PMI, vcpu);
8353         __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8354                   (unsigned long *)&vcpu->arch.pmu.global_status);
8355         return 1;
8356 }
8357
8358 static __init void vmx_setup_user_return_msrs(void)
8359 {
8360
8361         /*
8362          * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
8363          * will emulate SYSCALL in legacy mode if the vendor string in guest
8364          * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!".  To
8365          * support this emulation, MSR_STAR is included in the list for i386,
8366          * but is never loaded into hardware.  MSR_CSTAR is also never loaded
8367          * into hardware and is here purely for emulation purposes.
8368          */
8369         const u32 vmx_uret_msrs_list[] = {
8370         #ifdef CONFIG_X86_64
8371                 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
8372         #endif
8373                 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
8374                 MSR_IA32_TSX_CTRL,
8375         };
8376         int i;
8377
8378         BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
8379
8380         for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
8381                 kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
8382 }
8383
8384 static void __init vmx_setup_me_spte_mask(void)
8385 {
8386         u64 me_mask = 0;
8387
8388         /*
8389          * kvm_get_shadow_phys_bits() returns shadow_phys_bits.  Use
8390          * the former to avoid exposing shadow_phys_bits.
8391          *
8392          * On pre-MKTME systems, boot_cpu_data.x86_phys_bits is equal to
8393          * shadow_phys_bits.  On MKTME and/or TDX capable systems,
8394          * boot_cpu_data.x86_phys_bits holds the number of physical address
8395          * bits excluding the KeyID bits, while shadow_phys_bits equals the
8396          * MAXPHYADDR reported by CPUID.  The bits in between are the KeyID bits.
8397          */
8398         if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
8399                 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
8400                         kvm_get_shadow_phys_bits() - 1);
8401         /*
8402          * Unlike SME, the host kernel doesn't support setting up any
8403          * MKTME KeyID on Intel platforms.  No memory encryption
8404          * bits should be included in the SPTE.
8405          */
8406         kvm_mmu_set_me_spte_mask(0, me_mask);
8407 }
8408
8409 static struct kvm_x86_init_ops vmx_init_ops __initdata;
8410
8411 static __init int hardware_setup(void)
8412 {
8413         unsigned long host_bndcfgs;
8414         struct desc_ptr dt;
8415         int r;
8416
8417         store_idt(&dt);
8418         host_idt_base = dt.address;
8419
8420         vmx_setup_user_return_msrs();
8421
8422         if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
8423                 return -EIO;
8424
8425         if (cpu_has_perf_global_ctrl_bug())
8426                 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
8427                              "does not work properly. Using workaround\n");
8428
8429         if (boot_cpu_has(X86_FEATURE_NX))
8430                 kvm_enable_efer_bits(EFER_NX);
8431
8432         if (boot_cpu_has(X86_FEATURE_MPX)) {
8433                 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
8434                 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
8435         }
8436
8437         if (!cpu_has_vmx_mpx())
8438                 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
8439                                              XFEATURE_MASK_BNDCSR);
8440
8441         if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
8442             !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
8443                 enable_vpid = 0;
8444
8445         if (!cpu_has_vmx_ept() ||
8446             !cpu_has_vmx_ept_4levels() ||
8447             !cpu_has_vmx_ept_mt_wb() ||
8448             !cpu_has_vmx_invept_global())
8449                 enable_ept = 0;
8450
8451         /* NX support is required for shadow paging. */
8452         if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
8453                 pr_err_ratelimited("NX (Execute Disable) not supported\n");
8454                 return -EOPNOTSUPP;
8455         }
8456
8457         if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
8458                 enable_ept_ad_bits = 0;
8459
8460         if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
8461                 enable_unrestricted_guest = 0;
8462
8463         if (!cpu_has_vmx_flexpriority())
8464                 flexpriority_enabled = 0;
8465
8466         if (!cpu_has_virtual_nmis())
8467                 enable_vnmi = 0;
8468
8469 #ifdef CONFIG_X86_SGX_KVM
8470         if (!cpu_has_vmx_encls_vmexit())
8471                 enable_sgx = false;
8472 #endif
8473
8474         /*
8475          * set_apic_access_page_addr() is used to reload the APIC access
8476          * page upon invalidation.  No need to do anything if not
8477          * using the APIC_ACCESS_ADDR VMCS field.
8478          */
8479         if (!flexpriority_enabled)
8480                 vmx_x86_ops.set_apic_access_page_addr = NULL;
8481
8482         if (!cpu_has_vmx_tpr_shadow())
8483                 vmx_x86_ops.update_cr8_intercept = NULL;
8484
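             /*
              * When KVM itself runs on Hyper-V, prefer the hypervisor's guest
              * physical address space flush hypercalls for remote (EPT) TLB
              * flushes, if they are advertised and EPT is in use.
              */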
8485 #if IS_ENABLED(CONFIG_HYPERV)
8486         if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
8487             && enable_ept) {
8488                 vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
8489                 vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
8490         }
8491 #endif
8492
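             /*
              * Without PAUSE-loop exiting support, zero all the PLE tunables;
              * ple_gap == 0 disables PLE entirely.
              */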
8493         if (!cpu_has_vmx_ple()) {
8494                 ple_gap = 0;
8495                 ple_window = 0;
8496                 ple_window_grow = 0;
8497                 ple_window_max = 0;
8498                 ple_window_shrink = 0;
8499         }
8500
8501         if (!cpu_has_vmx_apicv())
8502                 enable_apicv = 0;
8503         if (!enable_apicv)
8504                 vmx_x86_ops.sync_pir_to_irr = NULL;
8505
8506         if (!enable_apicv || !cpu_has_vmx_ipiv())
8507                 enable_ipiv = false;
8508
8509         if (cpu_has_vmx_tsc_scaling())
8510                 kvm_caps.has_tsc_control = true;
8511
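             /*
              * The VMX TSC multiplier is an unsigned 64-bit fixed-point value
              * with 48 fractional bits.
              */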
8512         kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8513         kvm_caps.tsc_scaling_ratio_frac_bits = 48;
8514         kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
8515         kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
8516
8517         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8518
8519         if (enable_ept)
8520                 kvm_mmu_set_ept_masks(enable_ept_ad_bits,
8521                                       cpu_has_vmx_ept_execute_only());
8522
8523         /*
8524          * Set up shadow_me_value/shadow_me_mask so that the MKTME KeyID
8525          * bits are added to shadow_zero_check.
8526          */
8527         vmx_setup_me_spte_mask();
8528
8529         kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
8530                           ept_caps_to_lpage_level(vmx_capability.ept));
8531
8532         /*
8533          * Only enable PML when hardware supports the PML feature, and both EPT
8534          * and EPT A/D bit features are enabled -- PML depends on them to work.
8535          */
8536         if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8537                 enable_pml = 0;
8538
8539         if (!enable_pml)
8540                 vmx_x86_ops.cpu_dirty_log_size = 0;
8541
8542         if (!cpu_has_vmx_preemption_timer())
8543                 enable_preemption_timer = false;
8544
8545         if (enable_preemption_timer) {
8546                 u64 use_timer_freq = 5000ULL * 1000 * 1000;
8547
8548                 cpu_preemption_timer_multi =
8549                         vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
8550
8551                 if (tsc_khz)
8552                         use_timer_freq = (u64)tsc_khz * 1000;
8553                 use_timer_freq >>= cpu_preemption_timer_multi;
8554
8555                 /*
8556                  * KVM "disables" the preemption timer by setting it to its max
8557                  * value.  Don't use the timer if it might cause spurious exits
8558                  * at a rate faster than 0.1 Hz (of uninterrupted guest time).
8559                  */
8560                 if (use_timer_freq > 0xffffffffu / 10)
8561                         enable_preemption_timer = false;
8562         }
8563
8564         if (!enable_preemption_timer) {
8565                 vmx_x86_ops.set_hv_timer = NULL;
8566                 vmx_x86_ops.cancel_hv_timer = NULL;
8567                 vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
8568         }
8569
8570         kvm_caps.supported_mce_cap |= MCG_LMCE_P;
8571         kvm_caps.supported_mce_cap |= MCG_CMCI_P;
8572
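             /*
              * Host/guest Intel PT virtualization requires EPT, an enabled vPMU
              * and VMX PT support; otherwise fall back to system mode, where PT
              * remains owned by the host and is not exposed to the guest.
              */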
8573         if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8574                 return -EINVAL;
8575         if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
8576                 pt_mode = PT_MODE_SYSTEM;
8577         if (pt_mode == PT_MODE_HOST_GUEST)
8578                 vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
8579         else
8580                 vmx_init_ops.handle_intel_pt_intr = NULL;
8581
8582         setup_default_sgx_lepubkeyhash();
8583
8584         if (nested) {
8585                 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
8586
8587                 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
8588                 if (r)
8589                         return r;
8590         }
8591
8592         vmx_set_cpu_caps();
8593
8594         r = alloc_kvm_area();
8595         if (r && nested)
8596                 nested_vmx_hardware_unsetup();
8597
8598         kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
8599
8600         return r;
8601 }
8602
8603 static struct kvm_x86_init_ops vmx_init_ops __initdata = {
8604         .hardware_setup = hardware_setup,
8605         .handle_intel_pt_intr = NULL,
8606
8607         .runtime_ops = &vmx_x86_ops,
8608         .pmu_ops = &intel_pmu_ops,
8609 };
8610
8611 static void vmx_cleanup_l1d_flush(void)
8612 {
8613         if (vmx_l1d_flush_pages) {
8614                 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
8615                 vmx_l1d_flush_pages = NULL;
8616         }
8617         /* Restore state so sysfs ignores VMX */
8618         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
8619 }
8620
8621 static void __vmx_exit(void)
8622 {
8623         allow_smaller_maxphyaddr = false;
8624
8625 #ifdef CONFIG_KEXEC_CORE
8626         RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
8627         synchronize_rcu();
8628 #endif
8629         vmx_cleanup_l1d_flush();
8630 }
8631
8632 static void vmx_exit(void)
8633 {
8634         kvm_exit();
8635         kvm_x86_vendor_exit();
8636
8637         __vmx_exit();
8638 }
8639 module_exit(vmx_exit);
8640
8641 static int __init vmx_init(void)
8642 {
8643         int r, cpu;
8644
8645         if (!kvm_is_vmx_supported())
8646                 return -EOPNOTSUPP;
8647
8648         /*
8649          * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
8650          * to unwind if a later step fails.
8651          */
8652         hv_init_evmcs();
8653
8654         r = kvm_x86_vendor_init(&vmx_init_ops);
8655         if (r)
8656                 return r;
8657
8658         /*
8659          * Must be called after common x86 init so enable_ept is properly set
8660          * up.  Pass in the mitigation value that was stored by the
8661          * pre-module-init parameter parser.  If no parameter was given, it
8662          * contains 'auto', which is turned into the default 'cond'
8663          * mitigation mode.
8664          */
8665         r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
8666         if (r)
8667                 goto err_l1d_flush;
8668
8669         vmx_setup_fb_clear_ctrl();
8670
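             /*
              * Initialize the per-pCPU list of loaded VMCSs (VMCLEAR'ed when
              * disabling virtualization on a CPU and on kexec/crash) and the
              * posted-interrupt wakeup lists.
              */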
8671         for_each_possible_cpu(cpu) {
8672                 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
8673
8674                 pi_init_cpu(cpu);
8675         }
8676
8677 #ifdef CONFIG_KEXEC_CORE
8678         rcu_assign_pointer(crash_vmclear_loaded_vmcss,
8679                            crash_vmclear_local_loaded_vmcss);
8680 #endif
8681         vmx_check_vmcs12_offsets();
8682
8683         /*
8684          * Shadow paging doesn't have a (further) performance penalty
8685          * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR, so enable it
8686          * by default.
8687          */
8688         if (!enable_ept)
8689                 allow_smaller_maxphyaddr = true;
8690
8691         /*
8692          * Common KVM initialization _must_ come last; after this, /dev/kvm is
8693          * exposed to userspace!
8694          */
8695         r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
8696                      THIS_MODULE);
8697         if (r)
8698                 goto err_kvm_init;
8699
8700         return 0;
8701
8702 err_kvm_init:
8703         __vmx_exit();
8704 err_l1d_flush:
8705         kvm_x86_vendor_exit();
8706         return r;
8707 }
8708 module_init(vmx_init);