1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * This module enables machines with Intel VT-x extensions to run virtual
6  * machines without emulation or binary translation.
7  *
8  * Copyright (C) 2006 Qumranet, Inc.
9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10  *
11  * Authors:
12  *   Avi Kivity   <avi@qumranet.com>
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  */
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17 #include <linux/highmem.h>
18 #include <linux/hrtimer.h>
19 #include <linux/kernel.h>
20 #include <linux/kvm_host.h>
21 #include <linux/module.h>
22 #include <linux/moduleparam.h>
23 #include <linux/mod_devicetable.h>
24 #include <linux/mm.h>
25 #include <linux/objtool.h>
26 #include <linux/sched.h>
27 #include <linux/sched/smt.h>
28 #include <linux/slab.h>
29 #include <linux/tboot.h>
30 #include <linux/trace_events.h>
31 #include <linux/entry-kvm.h>
32
33 #include <asm/apic.h>
34 #include <asm/asm.h>
35 #include <asm/cpu.h>
36 #include <asm/cpu_device_id.h>
37 #include <asm/debugreg.h>
38 #include <asm/desc.h>
39 #include <asm/fpu/api.h>
40 #include <asm/fpu/xstate.h>
41 #include <asm/idtentry.h>
42 #include <asm/io.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/kexec.h>
45 #include <asm/perf_event.h>
46 #include <asm/mmu_context.h>
47 #include <asm/mshyperv.h>
48 #include <asm/mwait.h>
49 #include <asm/spec-ctrl.h>
50 #include <asm/virtext.h>
51 #include <asm/vmx.h>
52
53 #include "capabilities.h"
54 #include "cpuid.h"
55 #include "hyperv.h"
56 #include "kvm_onhyperv.h"
57 #include "irq.h"
58 #include "kvm_cache_regs.h"
59 #include "lapic.h"
60 #include "mmu.h"
61 #include "nested.h"
62 #include "pmu.h"
63 #include "sgx.h"
64 #include "trace.h"
65 #include "vmcs.h"
66 #include "vmcs12.h"
67 #include "vmx.h"
68 #include "x86.h"
69 #include "smm.h"
70
71 MODULE_AUTHOR("Qumranet");
72 MODULE_LICENSE("GPL");
73
74 #ifdef MODULE
75 static const struct x86_cpu_id vmx_cpu_id[] = {
76         X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
77         {}
78 };
79 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
80 #endif
81
82 bool __read_mostly enable_vpid = 1;
83 module_param_named(vpid, enable_vpid, bool, 0444);
84
85 static bool __read_mostly enable_vnmi = 1;
86 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
87
88 bool __read_mostly flexpriority_enabled = 1;
89 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
90
91 bool __read_mostly enable_ept = 1;
92 module_param_named(ept, enable_ept, bool, S_IRUGO);
93
94 bool __read_mostly enable_unrestricted_guest = 1;
95 module_param_named(unrestricted_guest,
96                         enable_unrestricted_guest, bool, S_IRUGO);
97
98 bool __read_mostly enable_ept_ad_bits = 1;
99 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
100
101 static bool __read_mostly emulate_invalid_guest_state = true;
102 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
103
104 static bool __read_mostly fasteoi = 1;
105 module_param(fasteoi, bool, S_IRUGO);
106
107 module_param(enable_apicv, bool, S_IRUGO);
108
109 bool __read_mostly enable_ipiv = true;
110 module_param(enable_ipiv, bool, 0444);
111
112 /*
113  * If nested=1, nested virtualization is supported, i.e., guests may use
114  * VMX and act as hypervisors for their own guests. If nested=0, guests may
115  * not use VMX instructions.
116  */
117 static bool __read_mostly nested = 1;
118 module_param(nested, bool, S_IRUGO);
119
120 bool __read_mostly enable_pml = 1;
121 module_param_named(pml, enable_pml, bool, S_IRUGO);
122
123 static bool __read_mostly error_on_inconsistent_vmcs_config = true;
124 module_param(error_on_inconsistent_vmcs_config, bool, 0444);
125
126 static bool __read_mostly dump_invalid_vmcs = 0;
127 module_param(dump_invalid_vmcs, bool, 0644);
128
129 #define MSR_BITMAP_MODE_X2APIC          1
130 #define MSR_BITMAP_MODE_X2APIC_APICV    2
131
132 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
133
134 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
135 static int __read_mostly cpu_preemption_timer_multi;
136 static bool __read_mostly enable_preemption_timer = 1;
137 #ifdef CONFIG_X86_64
138 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
139 #endif
140
141 extern bool __read_mostly allow_smaller_maxphyaddr;
142 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
143
144 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
145 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
146 #define KVM_VM_CR0_ALWAYS_ON                            \
147         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
148
149 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
150 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
151 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
152
153 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
154
155 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
156         RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
157         RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
158         RTIT_STATUS_BYTECNT))
159
160 /*
161  * List of MSRs that can be directly passed to the guest.
162  * In addition to these, x2APIC and PT MSRs are handled specially.
163  */
164 static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
165         MSR_IA32_SPEC_CTRL,
166         MSR_IA32_PRED_CMD,
167         MSR_IA32_TSC,
168 #ifdef CONFIG_X86_64
169         MSR_FS_BASE,
170         MSR_GS_BASE,
171         MSR_KERNEL_GS_BASE,
172         MSR_IA32_XFD,
173         MSR_IA32_XFD_ERR,
174 #endif
175         MSR_IA32_SYSENTER_CS,
176         MSR_IA32_SYSENTER_ESP,
177         MSR_IA32_SYSENTER_EIP,
178         MSR_CORE_C1_RES,
179         MSR_CORE_C3_RESIDENCY,
180         MSR_CORE_C6_RESIDENCY,
181         MSR_CORE_C7_RESIDENCY,
182 };
183
184 /*
185  * These two parameters are used to configure the controls for Pause-Loop Exiting:
186  * ple_gap:    upper bound on the amount of time between two successive
187  *             executions of PAUSE in a loop. Also indicates whether PLE is
188  *             enabled. According to tests, this time is usually under 128 cycles.
189  * ple_window: upper bound on the amount of time a guest is allowed to execute
190  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
191  *             less than 2^12 cycles.
192  * Time is measured on a counter that runs at the same rate as the TSC; refer to
193  * SDM volume 3b, sections 21.6.13 & 22.1.3 (see also the sketch below).
194  */
195 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
196 module_param(ple_gap, uint, 0444);
197
198 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
199 module_param(ple_window, uint, 0444);
200
201 /* Default doubles per-vcpu window every exit. */
202 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
203 module_param(ple_window_grow, uint, 0444);
204
205 /* Default resets per-vcpu window every exit to ple_window. */
206 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
207 module_param(ple_window_shrink, uint, 0444);
208
209 /* Default is to compute the maximum so we can never overflow. */
210 static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
211 module_param(ple_window_max, uint, 0444);
212
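/*
 * Illustrative only: a standalone sketch of how the five PLE knobs above are
 * typically combined into a per-vCPU dynamic window.  KVM's real grow/shrink
 * helpers live elsewhere; the arithmetic below is a simplified assumption
 * (multiplicative grow, divide-or-reset shrink), not the exact algorithm, and
 * the default values are assumed from the comments above.
 */
#if 0	/* sketch only -- not part of the build */
#include <stdio.h>

static unsigned int demo_ple_window        = 4096;      /* assumed default */
static unsigned int demo_ple_window_grow   = 2;          /* "doubles ... every exit" */
static unsigned int demo_ple_window_shrink = 0;          /* 0: reset to demo_ple_window */
static unsigned int demo_ple_window_max    = 1u << 30;   /* placeholder cap */

/* Grow the window multiplicatively, clamped to demo_ple_window_max. */
static unsigned int demo_grow_ple_window(unsigned int cur)
{
	unsigned long long next = (unsigned long long)cur * demo_ple_window_grow;

	return next > demo_ple_window_max ? demo_ple_window_max : (unsigned int)next;
}

/* Shrink the window, or reset it to demo_ple_window when shrink == 0. */
static unsigned int demo_shrink_ple_window(unsigned int cur)
{
	unsigned int next = demo_ple_window_shrink ?
			    cur / demo_ple_window_shrink : demo_ple_window;

	return next < demo_ple_window ? demo_ple_window : next;
}

int main(void)
{
	unsigned int w = demo_ple_window;

	w = demo_grow_ple_window(w);	/* e.g. after a PAUSE-loop VM-exit */
	printf("grown window  = %u\n", w);
	w = demo_shrink_ple_window(w);	/* e.g. after useful guest progress */
	printf("shrunk window = %u\n", w);
	return 0;
}
#endif
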
213 /* Default is SYSTEM mode, 1 for host-guest mode */
214 int __read_mostly pt_mode = PT_MODE_SYSTEM;
215 module_param(pt_mode, int, S_IRUGO);
216
217 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
218 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
219 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
220
221 /* Storage for pre module init parameter parsing */
222 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
223
224 static const struct {
225         const char *option;
226         bool for_parse;
227 } vmentry_l1d_param[] = {
228         [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
229         [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
230         [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
231         [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
232         [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
233         [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
234 };
235
236 #define L1D_CACHE_ORDER 4
237 static void *vmx_l1d_flush_pages;
238
239 /* Control for disabling CPU Fill buffer clear */
240 static bool __read_mostly vmx_fb_clear_ctrl_available;
241
242 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
243 {
244         struct page *page;
245         unsigned int i;
246
247         if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
248                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
249                 return 0;
250         }
251
252         if (!enable_ept) {
253                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
254                 return 0;
255         }
256
257         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
258                 u64 msr;
259
260                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
261                 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
262                         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
263                         return 0;
264                 }
265         }
266
267         /* If set to auto, use the default L1TF mitigation method */
268         if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
269                 switch (l1tf_mitigation) {
270                 case L1TF_MITIGATION_OFF:
271                         l1tf = VMENTER_L1D_FLUSH_NEVER;
272                         break;
273                 case L1TF_MITIGATION_FLUSH_NOWARN:
274                 case L1TF_MITIGATION_FLUSH:
275                 case L1TF_MITIGATION_FLUSH_NOSMT:
276                         l1tf = VMENTER_L1D_FLUSH_COND;
277                         break;
278                 case L1TF_MITIGATION_FULL:
279                 case L1TF_MITIGATION_FULL_FORCE:
280                         l1tf = VMENTER_L1D_FLUSH_ALWAYS;
281                         break;
282                 }
283         } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
284                 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
285         }
286
287         if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
288             !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
289                 /*
290                  * This allocation for vmx_l1d_flush_pages is not tied to a VM
291                  * lifetime and so should not be charged to a memcg.
292                  */
293                 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
294                 if (!page)
295                         return -ENOMEM;
296                 vmx_l1d_flush_pages = page_address(page);
297
298                 /*
299                  * Initialize each page with a different pattern in
300                  * order to protect against KSM in the nested
301                  * virtualization case.
302                  */
303                 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
304                         memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
305                                PAGE_SIZE);
306                 }
307         }
308
309         l1tf_vmx_mitigation = l1tf;
310
311         if (l1tf != VMENTER_L1D_FLUSH_NEVER)
312                 static_branch_enable(&vmx_l1d_should_flush);
313         else
314                 static_branch_disable(&vmx_l1d_should_flush);
315
316         if (l1tf == VMENTER_L1D_FLUSH_COND)
317                 static_branch_enable(&vmx_l1d_flush_cond);
318         else
319                 static_branch_disable(&vmx_l1d_flush_cond);
320         return 0;
321 }
322
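/*
 * Illustrative only: vmx_setup_l1d_flush() above toggles two static keys so
 * that the hot VM-entry path pays nothing when no flush is needed.  A minimal
 * sketch of that static-key pattern using the generic <linux/jump_label.h>
 * API; the demo_* names are hypothetical and not part of KVM.
 */
#if 0	/* sketch only -- not part of the build */
#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(demo_should_flush);

/* Setup-time decision, analogous to vmx_setup_l1d_flush(). */
static void demo_setup(bool flush_needed)
{
	if (flush_needed)
		static_branch_enable(&demo_should_flush);
	else
		static_branch_disable(&demo_should_flush);
}

/* Hot path: compiles down to a patched jump, not a flag load and test. */
static void demo_hot_path(void)
{
	if (static_branch_unlikely(&demo_should_flush)) {
		/* expensive mitigation work would go here */
	}
}
#endif
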
323 static int vmentry_l1d_flush_parse(const char *s)
324 {
325         unsigned int i;
326
327         if (s) {
328                 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
329                         if (vmentry_l1d_param[i].for_parse &&
330                             sysfs_streq(s, vmentry_l1d_param[i].option))
331                                 return i;
332                 }
333         }
334         return -EINVAL;
335 }
336
337 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
338 {
339         int l1tf, ret;
340
341         l1tf = vmentry_l1d_flush_parse(s);
342         if (l1tf < 0)
343                 return l1tf;
344
345         if (!boot_cpu_has(X86_BUG_L1TF))
346                 return 0;
347
348         /*
349          * Has vmx_init() run already? If not, then this is the pre-init
350          * parameter parsing. In that case just store the value and let
351          * vmx_init() do the proper setup after enable_ept has been
352          * established.
353          */
354         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
355                 vmentry_l1d_flush_param = l1tf;
356                 return 0;
357         }
358
359         mutex_lock(&vmx_l1d_flush_mutex);
360         ret = vmx_setup_l1d_flush(l1tf);
361         mutex_unlock(&vmx_l1d_flush_mutex);
362         return ret;
363 }
364
365 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
366 {
367         if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
368                 return sprintf(s, "???\n");
369
370         return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
371 }
372
373 static void vmx_setup_fb_clear_ctrl(void)
374 {
375         u64 msr;
376
377         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
378             !boot_cpu_has_bug(X86_BUG_MDS) &&
379             !boot_cpu_has_bug(X86_BUG_TAA)) {
380                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
381                 if (msr & ARCH_CAP_FB_CLEAR_CTRL)
382                         vmx_fb_clear_ctrl_available = true;
383         }
384 }
385
386 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
387 {
388         u64 msr;
389
390         if (!vmx->disable_fb_clear)
391                 return;
392
393         msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
394         msr |= FB_CLEAR_DIS;
395         native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
396         /* Cache the MSR value to avoid reading it later */
397         vmx->msr_ia32_mcu_opt_ctrl = msr;
398 }
399
400 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
401 {
402         if (!vmx->disable_fb_clear)
403                 return;
404
405         vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
406         native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
407 }
408
409 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
410 {
411         vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
412
413         /*
414          * If the guest will not execute VERW, there is no need to set
415          * FB_CLEAR_DIS at VM-Entry. Skip the MSR read/write when the guest
416          * has no reason to execute VERW.
417          */
418         if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
419            ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
420             (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
421             (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
422             (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
423             (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
424                 vmx->disable_fb_clear = false;
425 }
426
427 static const struct kernel_param_ops vmentry_l1d_flush_ops = {
428         .set = vmentry_l1d_flush_set,
429         .get = vmentry_l1d_flush_get,
430 };
431 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
432
433 static u32 vmx_segment_access_rights(struct kvm_segment *var);
434
435 void vmx_vmexit(void);
436
437 #define vmx_insn_failed(fmt...)         \
438 do {                                    \
439         WARN_ONCE(1, fmt);              \
440         pr_warn_ratelimited(fmt);       \
441 } while (0)
442
443 void vmread_error(unsigned long field, bool fault)
444 {
445         if (fault)
446                 kvm_spurious_fault();
447         else
448                 vmx_insn_failed("vmread failed: field=%lx\n", field);
449 }
450
451 noinline void vmwrite_error(unsigned long field, unsigned long value)
452 {
453         vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
454                         field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
455 }
456
457 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
458 {
459         vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
460                         vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
461 }
462
463 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
464 {
465         vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
466                         vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
467 }
468
469 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
470 {
471         vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
472                         ext, vpid, gva);
473 }
474
475 noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
476 {
477         vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
478                         ext, eptp, gpa);
479 }
480
481 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
482 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
483 /*
484  * We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This is
485  * needed when a CPU is brought down and we need to VMCLEAR all VMCSs loaded on it.
486  */
487 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
488
489 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
490 static DEFINE_SPINLOCK(vmx_vpid_lock);
491
492 struct vmcs_config vmcs_config __ro_after_init;
493 struct vmx_capability vmx_capability __ro_after_init;
494
495 #define VMX_SEGMENT_FIELD(seg)                                  \
496         [VCPU_SREG_##seg] = {                                   \
497                 .selector = GUEST_##seg##_SELECTOR,             \
498                 .base = GUEST_##seg##_BASE,                     \
499                 .limit = GUEST_##seg##_LIMIT,                   \
500                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
501         }
502
503 static const struct kvm_vmx_segment_field {
504         unsigned selector;
505         unsigned base;
506         unsigned limit;
507         unsigned ar_bytes;
508 } kvm_vmx_segment_fields[] = {
509         VMX_SEGMENT_FIELD(CS),
510         VMX_SEGMENT_FIELD(DS),
511         VMX_SEGMENT_FIELD(ES),
512         VMX_SEGMENT_FIELD(FS),
513         VMX_SEGMENT_FIELD(GS),
514         VMX_SEGMENT_FIELD(SS),
515         VMX_SEGMENT_FIELD(TR),
516         VMX_SEGMENT_FIELD(LDTR),
517 };
518
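/*
 * For reference, VMX_SEGMENT_FIELD(CS) above expands via token pasting to the
 * initializer below (the trailing comma comes from the array entry); the
 * other seven segments follow the same pattern.
 */
#if 0	/* expansion shown for reference only */
	[VCPU_SREG_CS] = {
		.selector = GUEST_CS_SELECTOR,
		.base = GUEST_CS_BASE,
		.limit = GUEST_CS_LIMIT,
		.ar_bytes = GUEST_CS_AR_BYTES,
	},
#endif
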
519 static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
520 {
521         vmx->segment_cache.bitmask = 0;
522 }
523
524 static unsigned long host_idt_base;
525
526 #if IS_ENABLED(CONFIG_HYPERV)
527 static struct kvm_x86_ops vmx_x86_ops __initdata;
528
529 static bool __read_mostly enlightened_vmcs = true;
530 module_param(enlightened_vmcs, bool, 0444);
531
532 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
533 {
534         struct hv_enlightened_vmcs *evmcs;
535         struct hv_partition_assist_pg **p_hv_pa_pg =
536                         &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
537         /*
538          * Synthetic VM-Exit is not enabled in current code, so all eVMCSes
539          * in a single VM share the same assist page.
540          */
541         if (!*p_hv_pa_pg)
542                 *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
543
544         if (!*p_hv_pa_pg)
545                 return -ENOMEM;
546
547         evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
548
549         evmcs->partition_assist_page =
550                 __pa(*p_hv_pa_pg);
551         evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
552         evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
553
554         return 0;
555 }
556
557 static __init void hv_init_evmcs(void)
558 {
559         int cpu;
560
561         if (!enlightened_vmcs)
562                 return;
563
564         /*
565          * Enlightened VMCS usage must be recommended by Hyper-V, and the
566          * host needs to support eVMCS v1 or above.
567          */
568         if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
569             (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
570              KVM_EVMCS_VERSION) {
571
572                 /* Check that we have assist pages on all online CPUs */
573                 for_each_online_cpu(cpu) {
574                         if (!hv_get_vp_assist_page(cpu)) {
575                                 enlightened_vmcs = false;
576                                 break;
577                         }
578                 }
579
580                 if (enlightened_vmcs) {
581                         pr_info("Using Hyper-V Enlightened VMCS\n");
582                         static_branch_enable(&__kvm_is_using_evmcs);
583                 }
584
585                 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
586                         vmx_x86_ops.enable_l2_tlb_flush
587                                 = hv_enable_l2_tlb_flush;
588
589         } else {
590                 enlightened_vmcs = false;
591         }
592 }
593
594 static void hv_reset_evmcs(void)
595 {
596         struct hv_vp_assist_page *vp_ap;
597
598         if (!kvm_is_using_evmcs())
599                 return;
600
601         /*
602          * KVM should enable eVMCS if and only if all CPUs have a VP assist
603          * page, and should reject CPU onlining if eVMCS is enabled but the
604          * CPU doesn't have a VP assist page allocated.
605          */
606         vp_ap = hv_get_vp_assist_page(smp_processor_id());
607         if (WARN_ON_ONCE(!vp_ap))
608                 return;
609
610         /*
611          * Reset everything to support using non-enlightened VMCS access later
612          * (e.g. when we reload the module with enlightened_vmcs=0)
613          */
614         vp_ap->nested_control.features.directhypercall = 0;
615         vp_ap->current_nested_vmcs = 0;
616         vp_ap->enlighten_vmentry = 0;
617 }
618
619 #else /* IS_ENABLED(CONFIG_HYPERV) */
620 static void hv_init_evmcs(void) {}
621 static void hv_reset_evmcs(void) {}
622 #endif /* IS_ENABLED(CONFIG_HYPERV) */
623
624 /*
625  * Comment format: document - errata name - stepping - processor name.
626  * Taken from
627  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
628  */
629 static u32 vmx_preemption_cpu_tfms[] = {
630 /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
631 0x000206E6,
632 /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
633 /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
634 /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
635 0x00020652,
636 /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
637 0x00020655,
638 /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
639 /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
640 /*
641  * 320767.pdf - AAP86  - B1 -
642  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
643  */
644 0x000106E5,
645 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
646 0x000106A0,
647 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
648 0x000106A1,
649 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
650 0x000106A4,
651  /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
652  /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
653  /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
654 0x000106A5,
655  /* Xeon E3-1220 V2 */
656 0x000306A8,
657 };
658
659 static inline bool cpu_has_broken_vmx_preemption_timer(void)
660 {
661         u32 eax = cpuid_eax(0x00000001), i;
662
663         /* Clear the reserved bits */
664         eax &= ~(0x3U << 14 | 0xfU << 28);
665         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
666                 if (eax == vmx_preemption_cpu_tfms[i])
667                         return true;
668
669         return false;
670 }
671
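/*
 * Illustrative only: cpu_has_broken_vmx_preemption_timer() above masks the two
 * reserved fields of CPUID.01H:EAX (bits 15:14 and 31:28) before comparing the
 * family/model/stepping signature against the erratum table.  A standalone
 * sketch of that masking:
 */
#if 0	/* sketch only -- not part of the build */
#include <stdio.h>

/*
 * CPUID.01H:EAX layout: stepping [3:0], model [7:4], family [11:8],
 * processor type [13:12], reserved [15:14], extended model [19:16],
 * extended family [27:20], reserved [31:28].
 */
static unsigned int demo_strip_reserved(unsigned int eax)
{
	return eax & ~(0x3u << 14 | 0xfu << 28);
}

int main(void)
{
	/* 0x000206E6 is the first entry in vmx_preemption_cpu_tfms[] above. */
	printf("%#010x\n", demo_strip_reserved(0x000206E6));
	return 0;
}
#endif
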
672 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
673 {
674         return flexpriority_enabled && lapic_in_kernel(vcpu);
675 }
676
677 static int possible_passthrough_msr_slot(u32 msr)
678 {
679         u32 i;
680
681         for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
682                 if (vmx_possible_passthrough_msrs[i] == msr)
683                         return i;
684
685         return -ENOENT;
686 }
687
688 static bool is_valid_passthrough_msr(u32 msr)
689 {
690         bool r;
691
692         switch (msr) {
693         case 0x800 ... 0x8ff:
694                 /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
695                 return true;
696         case MSR_IA32_RTIT_STATUS:
697         case MSR_IA32_RTIT_OUTPUT_BASE:
698         case MSR_IA32_RTIT_OUTPUT_MASK:
699         case MSR_IA32_RTIT_CR3_MATCH:
700         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
701                 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
702         case MSR_LBR_SELECT:
703         case MSR_LBR_TOS:
704         case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
705         case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
706         case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
707         case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
708         case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
709                 /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
710                 return true;
711         }
712
713         r = possible_passthrough_msr_slot(msr) != -ENOENT;
714
715         WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
716
717         return r;
718 }
719
720 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
721 {
722         int i;
723
724         i = kvm_find_user_return_msr(msr);
725         if (i >= 0)
726                 return &vmx->guest_uret_msrs[i];
727         return NULL;
728 }
729
730 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
731                                   struct vmx_uret_msr *msr, u64 data)
732 {
733         unsigned int slot = msr - vmx->guest_uret_msrs;
734         int ret = 0;
735
736         if (msr->load_into_hardware) {
737                 preempt_disable();
738                 ret = kvm_set_user_return_msr(slot, data, msr->mask);
739                 preempt_enable();
740         }
741         if (!ret)
742                 msr->data = data;
743         return ret;
744 }
745
746 #ifdef CONFIG_KEXEC_CORE
747 static void crash_vmclear_local_loaded_vmcss(void)
748 {
749         int cpu = raw_smp_processor_id();
750         struct loaded_vmcs *v;
751
752         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
753                             loaded_vmcss_on_cpu_link)
754                 vmcs_clear(v->vmcs);
755 }
756 #endif /* CONFIG_KEXEC_CORE */
757
758 static void __loaded_vmcs_clear(void *arg)
759 {
760         struct loaded_vmcs *loaded_vmcs = arg;
761         int cpu = raw_smp_processor_id();
762
763         if (loaded_vmcs->cpu != cpu)
764                 return; /* vcpu migration can race with cpu offline */
765         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
766                 per_cpu(current_vmcs, cpu) = NULL;
767
768         vmcs_clear(loaded_vmcs->vmcs);
769         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
770                 vmcs_clear(loaded_vmcs->shadow_vmcs);
771
772         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
773
774         /*
775          * Ensure all writes to loaded_vmcs, including deleting it from its
776          * current percpu list, complete before setting loaded_vmcs->cpu to
777          * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
778          * and add loaded_vmcs to its percpu list before it's deleted from this
779          * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
780          */
781         smp_wmb();
782
783         loaded_vmcs->cpu = -1;
784         loaded_vmcs->launched = 0;
785 }
786
787 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
788 {
789         int cpu = loaded_vmcs->cpu;
790
791         if (cpu != -1)
792                 smp_call_function_single(cpu,
793                          __loaded_vmcs_clear, loaded_vmcs, 1);
794 }
795
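/*
 * Illustrative only: the smp_wmb() in __loaded_vmcs_clear() above pairs with
 * an smp_rmb() on the reader side (see vmx_vcpu_load_vmcs() further down).  A
 * minimal, generic sketch of that publish/consume ordering pattern, with
 * hypothetical demo_* names:
 */
#if 0	/* sketch only -- not part of the build */
#include <linux/types.h>
#include <asm/barrier.h>

struct demo_vmcs {
	int data;
	int cpu;	/* -1 means "not resident on any CPU" */
};

/* Writer: finish the teardown first, then publish cpu == -1. */
static void demo_clear(struct demo_vmcs *d)
{
	d->data = 0;
	smp_wmb();	/* order the teardown before the publish */
	d->cpu = -1;
}

/* Reader: observe cpu == -1 before trusting the torn-down state. */
static bool demo_is_clear(struct demo_vmcs *d)
{
	if (d->cpu != -1)
		return false;
	smp_rmb();	/* pairs with the smp_wmb() in demo_clear() */
	return d->data == 0;
}
#endif
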
796 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
797                                        unsigned field)
798 {
799         bool ret;
800         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
801
802         if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
803                 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
804                 vmx->segment_cache.bitmask = 0;
805         }
806         ret = vmx->segment_cache.bitmask & mask;
807         vmx->segment_cache.bitmask |= mask;
808         return ret;
809 }
810
811 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
812 {
813         u16 *p = &vmx->segment_cache.seg[seg].selector;
814
815         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
816                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
817         return *p;
818 }
819
820 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
821 {
822         ulong *p = &vmx->segment_cache.seg[seg].base;
823
824         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
825                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
826         return *p;
827 }
828
829 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
830 {
831         u32 *p = &vmx->segment_cache.seg[seg].limit;
832
833         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
834                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
835         return *p;
836 }
837
838 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
839 {
840         u32 *p = &vmx->segment_cache.seg[seg].ar;
841
842         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
843                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
844         return *p;
845 }
846
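/*
 * Illustrative only: the helpers above lazily cache VMREAD results, with one
 * bit per (segment, field) pair in segment_cache.bitmask.  A standalone sketch
 * of that test-and-set caching idiom, with hypothetical demo_* names:
 */
#if 0	/* sketch only -- not part of the build */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_FIELD_NR	4	/* cached fields per segment, cf. SEG_FIELD_NR */

struct demo_cache {
	unsigned int bitmask;
	unsigned int value[8][DEMO_FIELD_NR];
};

/* Returns true if (seg, field) was already cached; marks it cached either way. */
static bool demo_test_set(struct demo_cache *c, unsigned int seg, unsigned int field)
{
	unsigned int mask = 1u << (seg * DEMO_FIELD_NR + field);
	bool hit = c->bitmask & mask;

	c->bitmask |= mask;
	return hit;
}

static unsigned int demo_read(struct demo_cache *c, unsigned int seg, unsigned int field)
{
	if (!demo_test_set(c, seg, field))
		c->value[seg][field] = 42;	/* stand-in for the expensive VMREAD */
	return c->value[seg][field];
}

int main(void)
{
	struct demo_cache c = { 0 };

	printf("%u\n", demo_read(&c, 1, 2));	/* miss: fills the cache */
	printf("%u\n", demo_read(&c, 1, 2));	/* hit: no "VMREAD" this time */
	return 0;
}
#endif
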
847 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
848 {
849         u32 eb;
850
851         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
852              (1u << DB_VECTOR) | (1u << AC_VECTOR);
853         /*
854          * Guest access to VMware backdoor ports could legitimately
855          * trigger #GP because of TSS I/O permission bitmap.
856          * We intercept those #GP and allow access to them anyway
857          * as VMware does.
858          */
859         if (enable_vmware_backdoor)
860                 eb |= (1u << GP_VECTOR);
861         if ((vcpu->guest_debug &
862              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
863             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
864                 eb |= 1u << BP_VECTOR;
865         if (to_vmx(vcpu)->rmode.vm86_active)
866                 eb = ~0;
867         if (!vmx_need_pf_intercept(vcpu))
868                 eb &= ~(1u << PF_VECTOR);
869
870         /* When we are running a nested L2 guest and L1 specified for it a
871          * certain exception bitmap, we must trap the same exceptions and pass
872          * them to L1. When running L2, we will only handle the exceptions
873          * specified above if L1 did not want them.
874          */
875         if (is_guest_mode(vcpu))
876                 eb |= get_vmcs12(vcpu)->exception_bitmap;
877         else {
878                 int mask = 0, match = 0;
879
880                 if (enable_ept && (eb & (1u << PF_VECTOR))) {
881                         /*
882                          * If EPT is enabled, #PF is currently only intercepted
883                          * if MAXPHYADDR is smaller on the guest than on the
884                          * host.  In that case we only care about present,
885                          * non-reserved faults.  For vmcs02, however, PFEC_MASK
886                          * and PFEC_MATCH are set in prepare_vmcs02_rare.
887                          */
888                         mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
889                         match = PFERR_PRESENT_MASK;
890                 }
891                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
892                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
893         }
894
895         /*
896          * Disabling xfd interception indicates that dynamic xfeatures
897          * might be used in the guest. Always trap #NM in this case
898          * so that the guest's xfd_err is saved in a timely manner.
899          */
900         if (vcpu->arch.xfd_no_write_intercept)
901                 eb |= (1u << NM_VECTOR);
902
903         vmcs_write32(EXCEPTION_BITMAP, eb);
904 }
905
906 /*
907  * Check if the MSR is write-intercepted in the currently loaded MSR bitmap.
908  */
909 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
910 {
911         if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
912                 return true;
913
914         return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
915 }
916
917 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
918 {
919         unsigned int flags = 0;
920
921         if (vmx->loaded_vmcs->launched)
922                 flags |= VMX_RUN_VMRESUME;
923
924         /*
925          * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
926          * to change it directly without causing a vmexit.  In that case read
927          * it after vmexit and store it in vmx->spec_ctrl.
928          */
929         if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
930                 flags |= VMX_RUN_SAVE_SPEC_CTRL;
931
932         return flags;
933 }
934
935 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
936                 unsigned long entry, unsigned long exit)
937 {
938         vm_entry_controls_clearbit(vmx, entry);
939         vm_exit_controls_clearbit(vmx, exit);
940 }
941
942 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
943 {
944         unsigned int i;
945
946         for (i = 0; i < m->nr; ++i) {
947                 if (m->val[i].index == msr)
948                         return i;
949         }
950         return -ENOENT;
951 }
952
953 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
954 {
955         int i;
956         struct msr_autoload *m = &vmx->msr_autoload;
957
958         switch (msr) {
959         case MSR_EFER:
960                 if (cpu_has_load_ia32_efer()) {
961                         clear_atomic_switch_msr_special(vmx,
962                                         VM_ENTRY_LOAD_IA32_EFER,
963                                         VM_EXIT_LOAD_IA32_EFER);
964                         return;
965                 }
966                 break;
967         case MSR_CORE_PERF_GLOBAL_CTRL:
968                 if (cpu_has_load_perf_global_ctrl()) {
969                         clear_atomic_switch_msr_special(vmx,
970                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
971                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
972                         return;
973                 }
974                 break;
975         }
976         i = vmx_find_loadstore_msr_slot(&m->guest, msr);
977         if (i < 0)
978                 goto skip_guest;
979         --m->guest.nr;
980         m->guest.val[i] = m->guest.val[m->guest.nr];
981         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
982
983 skip_guest:
984         i = vmx_find_loadstore_msr_slot(&m->host, msr);
985         if (i < 0)
986                 return;
987
988         --m->host.nr;
989         m->host.val[i] = m->host.val[m->host.nr];
990         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
991 }
992
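/*
 * Illustrative only: clear_atomic_switch_msr() above deletes an autoload entry
 * by copying the last entry into the vacated slot and shrinking the count,
 * which works because the list is unordered.  A standalone sketch of that
 * swap-with-last removal idiom:
 */
#if 0	/* sketch only -- not part of the build */
#include <stdio.h>

struct demo_entry {
	unsigned int index;
	unsigned long long value;
};

struct demo_list {
	unsigned int nr;
	struct demo_entry val[8];
};

/* Unordered removal: overwrite slot i with the last entry, then shrink. */
static void demo_remove(struct demo_list *m, unsigned int i)
{
	if (i >= m->nr)
		return;
	--m->nr;
	m->val[i] = m->val[m->nr];
}

int main(void)
{
	struct demo_list m = { .nr = 3, .val = { {1, 10}, {2, 20}, {3, 30} } };

	demo_remove(&m, 0);	/* entry {3, 30} now occupies slot 0 */
	printf("nr=%u first=%u\n", m.nr, m.val[0].index);
	return 0;
}
#endif
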
993 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
994                 unsigned long entry, unsigned long exit,
995                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
996                 u64 guest_val, u64 host_val)
997 {
998         vmcs_write64(guest_val_vmcs, guest_val);
999         if (host_val_vmcs != HOST_IA32_EFER)
1000                 vmcs_write64(host_val_vmcs, host_val);
1001         vm_entry_controls_setbit(vmx, entry);
1002         vm_exit_controls_setbit(vmx, exit);
1003 }
1004
1005 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1006                                   u64 guest_val, u64 host_val, bool entry_only)
1007 {
1008         int i, j = 0;
1009         struct msr_autoload *m = &vmx->msr_autoload;
1010
1011         switch (msr) {
1012         case MSR_EFER:
1013                 if (cpu_has_load_ia32_efer()) {
1014                         add_atomic_switch_msr_special(vmx,
1015                                         VM_ENTRY_LOAD_IA32_EFER,
1016                                         VM_EXIT_LOAD_IA32_EFER,
1017                                         GUEST_IA32_EFER,
1018                                         HOST_IA32_EFER,
1019                                         guest_val, host_val);
1020                         return;
1021                 }
1022                 break;
1023         case MSR_CORE_PERF_GLOBAL_CTRL:
1024                 if (cpu_has_load_perf_global_ctrl()) {
1025                         add_atomic_switch_msr_special(vmx,
1026                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1027                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1028                                         GUEST_IA32_PERF_GLOBAL_CTRL,
1029                                         HOST_IA32_PERF_GLOBAL_CTRL,
1030                                         guest_val, host_val);
1031                         return;
1032                 }
1033                 break;
1034         case MSR_IA32_PEBS_ENABLE:
1035                 /* PEBS needs a quiescent period after being disabled (to write
1036                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
1037                  * provide that period, so a CPU could write host's record into
1038                  * guest's memory.
1039                  */
1040                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
1041         }
1042
1043         i = vmx_find_loadstore_msr_slot(&m->guest, msr);
1044         if (!entry_only)
1045                 j = vmx_find_loadstore_msr_slot(&m->host, msr);
1046
1047         if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
1048             (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
1049                 printk_once(KERN_WARNING "Not enough msr switch entries. "
1050                                 "Can't add msr %x\n", msr);
1051                 return;
1052         }
1053         if (i < 0) {
1054                 i = m->guest.nr++;
1055                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1056         }
1057         m->guest.val[i].index = msr;
1058         m->guest.val[i].value = guest_val;
1059
1060         if (entry_only)
1061                 return;
1062
1063         if (j < 0) {
1064                 j = m->host.nr++;
1065                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1066         }
1067         m->host.val[j].index = msr;
1068         m->host.val[j].value = host_val;
1069 }
1070
1071 static bool update_transition_efer(struct vcpu_vmx *vmx)
1072 {
1073         u64 guest_efer = vmx->vcpu.arch.efer;
1074         u64 ignore_bits = 0;
1075         int i;
1076
1077         /* Shadow paging assumes NX to be available.  */
1078         if (!enable_ept)
1079                 guest_efer |= EFER_NX;
1080
1081         /*
1082          * LMA and LME handled by hardware; SCE meaningless outside long mode.
1083          */
1084         ignore_bits |= EFER_SCE;
1085 #ifdef CONFIG_X86_64
1086         ignore_bits |= EFER_LMA | EFER_LME;
1087         /* SCE is meaningful only in long mode on Intel */
1088         if (guest_efer & EFER_LMA)
1089                 ignore_bits &= ~(u64)EFER_SCE;
1090 #endif
1091
1092         /*
1093          * On EPT, we can't emulate NX, so we must switch EFER atomically.
1094          * On CPUs that support "load IA32_EFER", always switch EFER
1095          * atomically, since it's faster than switching it manually.
1096          */
1097         if (cpu_has_load_ia32_efer() ||
1098             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
1099                 if (!(guest_efer & EFER_LMA))
1100                         guest_efer &= ~EFER_LME;
1101                 if (guest_efer != host_efer)
1102                         add_atomic_switch_msr(vmx, MSR_EFER,
1103                                               guest_efer, host_efer, false);
1104                 else
1105                         clear_atomic_switch_msr(vmx, MSR_EFER);
1106                 return false;
1107         }
1108
1109         i = kvm_find_user_return_msr(MSR_EFER);
1110         if (i < 0)
1111                 return false;
1112
1113         clear_atomic_switch_msr(vmx, MSR_EFER);
1114
1115         guest_efer &= ~ignore_bits;
1116         guest_efer |= host_efer & ignore_bits;
1117
1118         vmx->guest_uret_msrs[i].data = guest_efer;
1119         vmx->guest_uret_msrs[i].mask = ~ignore_bits;
1120
1121         return true;
1122 }
1123
1124 #ifdef CONFIG_X86_32
1125 /*
1126  * On 32-bit kernels, VM exits still load the FS and GS bases from the
1127  * VMCS rather than the segment table.  KVM uses this helper to figure
1128  * out the current bases to poke them into the VMCS before entry.
1129  */
1130 static unsigned long segment_base(u16 selector)
1131 {
1132         struct desc_struct *table;
1133         unsigned long v;
1134
1135         if (!(selector & ~SEGMENT_RPL_MASK))
1136                 return 0;
1137
1138         table = get_current_gdt_ro();
1139
1140         if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1141                 u16 ldt_selector = kvm_read_ldt();
1142
1143                 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1144                         return 0;
1145
1146                 table = (struct desc_struct *)segment_base(ldt_selector);
1147         }
1148         v = get_desc_base(&table[selector >> 3]);
1149         return v;
1150 }
1151 #endif
1152
1153 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1154 {
1155         return vmx_pt_mode_is_host_guest() &&
1156                !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1157 }
1158
1159 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
1160 {
1161         /* The base must be 128-byte aligned and a legal physical address. */
1162         return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
1163 }
1164
1165 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1166 {
1167         u32 i;
1168
1169         wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1170         wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1171         wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1172         wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1173         for (i = 0; i < addr_range; i++) {
1174                 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1175                 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1176         }
1177 }
1178
1179 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1180 {
1181         u32 i;
1182
1183         rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1184         rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1185         rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1186         rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1187         for (i = 0; i < addr_range; i++) {
1188                 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1189                 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1190         }
1191 }
1192
1193 static void pt_guest_enter(struct vcpu_vmx *vmx)
1194 {
1195         if (vmx_pt_mode_is_system())
1196                 return;
1197
1198         /*
1199          * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1200          * Save host state before VM entry.
1201          */
1202         rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1203         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1204                 wrmsrl(MSR_IA32_RTIT_CTL, 0);
1205                 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1206                 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1207         }
1208 }
1209
1210 static void pt_guest_exit(struct vcpu_vmx *vmx)
1211 {
1212         if (vmx_pt_mode_is_system())
1213                 return;
1214
1215         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1216                 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1217                 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1218         }
1219
1220         /*
1221          * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
1222          * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
1223          */
1224         if (vmx->pt_desc.host.ctl)
1225                 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1226 }
1227
1228 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1229                         unsigned long fs_base, unsigned long gs_base)
1230 {
1231         if (unlikely(fs_sel != host->fs_sel)) {
1232                 if (!(fs_sel & 7))
1233                         vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1234                 else
1235                         vmcs_write16(HOST_FS_SELECTOR, 0);
1236                 host->fs_sel = fs_sel;
1237         }
1238         if (unlikely(gs_sel != host->gs_sel)) {
1239                 if (!(gs_sel & 7))
1240                         vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1241                 else
1242                         vmcs_write16(HOST_GS_SELECTOR, 0);
1243                 host->gs_sel = gs_sel;
1244         }
1245         if (unlikely(fs_base != host->fs_base)) {
1246                 vmcs_writel(HOST_FS_BASE, fs_base);
1247                 host->fs_base = fs_base;
1248         }
1249         if (unlikely(gs_base != host->gs_base)) {
1250                 vmcs_writel(HOST_GS_BASE, gs_base);
1251                 host->gs_base = gs_base;
1252         }
1253 }
1254
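/*
 * Illustrative only: the "!(sel & 7)" tests above check that both the RPL
 * (bits 1:0) and the TI bit (bit 2) of a selector are zero, since the host
 * selector fields must not carry a non-zero RPL or reference the LDT (see the
 * 22.2.3 comment below).  A standalone sketch of that selector decoding:
 */
#if 0	/* sketch only -- not part of the build */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* x86 segment selector: RPL in bits 1:0, TI in bit 2, index in bits 15:3. */
static bool demo_valid_host_selector(uint16_t sel)
{
	return (sel & 7) == 0;	/* RPL == 0 and TI == 0 */
}

int main(void)
{
	printf("0x10 -> %d\n", demo_valid_host_selector(0x10));	/* index 2: ok      */
	printf("0x13 -> %d\n", demo_valid_host_selector(0x13));	/* RPL == 3: not ok */
	return 0;
}
#endif
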
1255 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1256 {
1257         struct vcpu_vmx *vmx = to_vmx(vcpu);
1258         struct vmcs_host_state *host_state;
1259 #ifdef CONFIG_X86_64
1260         int cpu = raw_smp_processor_id();
1261 #endif
1262         unsigned long fs_base, gs_base;
1263         u16 fs_sel, gs_sel;
1264         int i;
1265
1266         vmx->req_immediate_exit = false;
1267
1268         /*
1269          * Note that guest MSRs to be saved/restored can also be changed
1270          * when guest state is loaded. This happens when the guest transitions
1271          * to/from long mode by setting MSR_EFER.LMA.
1272          */
1273         if (!vmx->guest_uret_msrs_loaded) {
1274                 vmx->guest_uret_msrs_loaded = true;
1275                 for (i = 0; i < kvm_nr_uret_msrs; ++i) {
1276                         if (!vmx->guest_uret_msrs[i].load_into_hardware)
1277                                 continue;
1278
1279                         kvm_set_user_return_msr(i,
1280                                                 vmx->guest_uret_msrs[i].data,
1281                                                 vmx->guest_uret_msrs[i].mask);
1282                 }
1283         }
1284
1285         if (vmx->nested.need_vmcs12_to_shadow_sync)
1286                 nested_sync_vmcs12_to_shadow(vcpu);
1287
1288         if (vmx->guest_state_loaded)
1289                 return;
1290
1291         host_state = &vmx->loaded_vmcs->host_state;
1292
1293         /*
1294          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1295          * allow segment selectors with cpl > 0 or ti == 1.
1296          */
1297         host_state->ldt_sel = kvm_read_ldt();
1298
1299 #ifdef CONFIG_X86_64
1300         savesegment(ds, host_state->ds_sel);
1301         savesegment(es, host_state->es_sel);
1302
1303         gs_base = cpu_kernelmode_gs_base(cpu);
1304         if (likely(is_64bit_mm(current->mm))) {
1305                 current_save_fsgs();
1306                 fs_sel = current->thread.fsindex;
1307                 gs_sel = current->thread.gsindex;
1308                 fs_base = current->thread.fsbase;
1309                 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
1310         } else {
1311                 savesegment(fs, fs_sel);
1312                 savesegment(gs, gs_sel);
1313                 fs_base = read_msr(MSR_FS_BASE);
1314                 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1315         }
1316
1317         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1318 #else
1319         savesegment(fs, fs_sel);
1320         savesegment(gs, gs_sel);
1321         fs_base = segment_base(fs_sel);
1322         gs_base = segment_base(gs_sel);
1323 #endif
1324
1325         vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
1326         vmx->guest_state_loaded = true;
1327 }
1328
1329 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1330 {
1331         struct vmcs_host_state *host_state;
1332
1333         if (!vmx->guest_state_loaded)
1334                 return;
1335
1336         host_state = &vmx->loaded_vmcs->host_state;
1337
1338         ++vmx->vcpu.stat.host_state_reload;
1339
1340 #ifdef CONFIG_X86_64
1341         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1342 #endif
1343         if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1344                 kvm_load_ldt(host_state->ldt_sel);
1345 #ifdef CONFIG_X86_64
1346                 load_gs_index(host_state->gs_sel);
1347 #else
1348                 loadsegment(gs, host_state->gs_sel);
1349 #endif
1350         }
1351         if (host_state->fs_sel & 7)
1352                 loadsegment(fs, host_state->fs_sel);
1353 #ifdef CONFIG_X86_64
1354         if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1355                 loadsegment(ds, host_state->ds_sel);
1356                 loadsegment(es, host_state->es_sel);
1357         }
1358 #endif
1359         invalidate_tss_limit();
1360 #ifdef CONFIG_X86_64
1361         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1362 #endif
1363         load_fixmap_gdt(raw_smp_processor_id());
1364         vmx->guest_state_loaded = false;
1365         vmx->guest_uret_msrs_loaded = false;
1366 }
1367
1368 #ifdef CONFIG_X86_64
1369 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
1370 {
1371         preempt_disable();
1372         if (vmx->guest_state_loaded)
1373                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1374         preempt_enable();
1375         return vmx->msr_guest_kernel_gs_base;
1376 }
1377
1378 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1379 {
1380         preempt_disable();
1381         if (vmx->guest_state_loaded)
1382                 wrmsrl(MSR_KERNEL_GS_BASE, data);
1383         preempt_enable();
1384         vmx->msr_guest_kernel_gs_base = data;
1385 }
1386 #endif
1387
1388 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
1389                         struct loaded_vmcs *buddy)
1390 {
1391         struct vcpu_vmx *vmx = to_vmx(vcpu);
1392         bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1393         struct vmcs *prev;
1394
1395         if (!already_loaded) {
1396                 loaded_vmcs_clear(vmx->loaded_vmcs);
1397                 local_irq_disable();
1398
1399                 /*
1400                  * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
1401                  * this cpu's percpu list, otherwise it may not yet be deleted
1402                  * from its previous cpu's percpu list.  Pairs with the
1403                  * smp_wmb() in __loaded_vmcs_clear().
1404                  */
1405                 smp_rmb();
1406
1407                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1408                          &per_cpu(loaded_vmcss_on_cpu, cpu));
1409                 local_irq_enable();
1410         }
1411
1412         prev = per_cpu(current_vmcs, cpu);
1413         if (prev != vmx->loaded_vmcs->vmcs) {
1414                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1415                 vmcs_load(vmx->loaded_vmcs->vmcs);
1416
1417                 /*
1418                  * No indirect branch prediction barrier needed when switching
1419                  * the active VMCS within a vCPU, unless IBRS is advertised to
1420                  * the vCPU.  To minimize the number of IBPBs executed, KVM
1421                  * performs IBPB on nested VM-Exit (a single nested transition
1422                  * may switch the active VMCS multiple times).
1423                  */
1424                 if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
1425                         indirect_branch_prediction_barrier();
1426         }
1427
1428         if (!already_loaded) {
1429                 void *gdt = get_current_gdt_ro();
1430
1431                 /*
1432                  * Flush all EPTP/VPID contexts, the new pCPU may have stale
1433                  * TLB entries from its previous association with the vCPU.
1434                  */
1435                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1436
1437                 /*
1438                  * Linux uses per-cpu TSS and GDT, so set these when switching
1439                  * processors.  See 22.2.4.
1440                  */
1441                 vmcs_writel(HOST_TR_BASE,
1442                             (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1443                 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
1444
1445                 if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
1446                         /* 22.2.3 */
1447                         vmcs_writel(HOST_IA32_SYSENTER_ESP,
1448                                     (unsigned long)(cpu_entry_stack(cpu) + 1));
1449                 }
1450
1451                 vmx->loaded_vmcs->cpu = cpu;
1452         }
1453 }
1454
1455 /*
1456  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1457  * vcpu mutex is already taken.
1458  */
1459 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1460 {
1461         struct vcpu_vmx *vmx = to_vmx(vcpu);
1462
1463         vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
1464
1465         vmx_vcpu_pi_load(vcpu, cpu);
1466
1467         vmx->host_debugctlmsr = get_debugctlmsr();
1468 }
1469
1470 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1471 {
1472         vmx_vcpu_pi_put(vcpu);
1473
1474         vmx_prepare_switch_to_host(to_vmx(vcpu));
1475 }
1476
1477 bool vmx_emulation_required(struct kvm_vcpu *vcpu)
1478 {
1479         return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
1480 }
1481
1482 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1483 {
1484         struct vcpu_vmx *vmx = to_vmx(vcpu);
1485         unsigned long rflags, save_rflags;
1486
1487         if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
1488                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1489                 rflags = vmcs_readl(GUEST_RFLAGS);
1490                 if (vmx->rmode.vm86_active) {
1491                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1492                         save_rflags = vmx->rmode.save_rflags;
1493                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1494                 }
1495                 vmx->rflags = rflags;
1496         }
1497         return vmx->rflags;
1498 }
1499
1500 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1501 {
1502         struct vcpu_vmx *vmx = to_vmx(vcpu);
1503         unsigned long old_rflags;
1504
1505         if (is_unrestricted_guest(vcpu)) {
1506                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1507                 vmx->rflags = rflags;
1508                 vmcs_writel(GUEST_RFLAGS, rflags);
1509                 return;
1510         }
1511
1512         old_rflags = vmx_get_rflags(vcpu);
1513         vmx->rflags = rflags;
1514         if (vmx->rmode.vm86_active) {
1515                 vmx->rmode.save_rflags = rflags;
1516                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1517         }
1518         vmcs_writel(GUEST_RFLAGS, rflags);
1519
1520         if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1521                 vmx->emulation_required = vmx_emulation_required(vcpu);
1522 }
1523
1524 static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
1525 {
1526         return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
1527 }
1528
1529 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1530 {
1531         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1532         int ret = 0;
1533
1534         if (interruptibility & GUEST_INTR_STATE_STI)
1535                 ret |= KVM_X86_SHADOW_INT_STI;
1536         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1537                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1538
1539         return ret;
1540 }
1541
1542 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1543 {
1544         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1545         u32 interruptibility = interruptibility_old;
1546
1547         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1548
1549         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1550                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1551         else if (mask & KVM_X86_SHADOW_INT_STI)
1552                 interruptibility |= GUEST_INTR_STATE_STI;
1553
1554         if (interruptibility != interruptibility_old)
1555                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1556 }
1557
1558 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1559 {
1560         struct vcpu_vmx *vmx = to_vmx(vcpu);
1561         unsigned long value;
1562
1563         /*
1564          * Any MSR write that attempts to change bits marked reserved will
1565          * cause a #GP fault.
1566          */
1567         if (data & vmx->pt_desc.ctl_bitmask)
1568                 return 1;
1569
1570         /*
1571          * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1572          * result in a #GP unless the same write also clears TraceEn.
1573          */
1574         if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1575                 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1576                 return 1;
1577
1578         /*
1579          * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears both ToPA
1580          * and FabricEn will cause a #GP if
1581          * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
1582          */
1583         if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1584                 !(data & RTIT_CTL_FABRIC_EN) &&
1585                 !intel_pt_validate_cap(vmx->pt_desc.caps,
1586                                         PT_CAP_single_range_output))
1587                 return 1;
1588
1589         /*
1590          * MTCFreq, CycThresh and PSBFreq encoding checks: any MSR write that
1591          * uses an encoding marked reserved will cause a #GP fault.
1592          */
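        /*
         * Note: for these capabilities, intel_pt_validate_cap() returns a
         * bitmap in which bit N set means encoding N is supported, hence the
         * test_bit() against the requested encoding below.
         */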
1593         value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1594         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1595                         !test_bit((data & RTIT_CTL_MTC_RANGE) >>
1596                         RTIT_CTL_MTC_RANGE_OFFSET, &value))
1597                 return 1;
1598         value = intel_pt_validate_cap(vmx->pt_desc.caps,
1599                                                 PT_CAP_cycle_thresholds);
1600         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1601                         !test_bit((data & RTIT_CTL_CYC_THRESH) >>
1602                         RTIT_CTL_CYC_THRESH_OFFSET, &value))
1603                 return 1;
1604         value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1605         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1606                         !test_bit((data & RTIT_CTL_PSB_FREQ) >>
1607                         RTIT_CTL_PSB_FREQ_OFFSET, &value))
1608                 return 1;
1609
1610         /*
1611          * A write with an ADDRx_CFG encoding that is reserved or greater
1612          * than 2 will cause a #GP fault.
1613          */
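        /*
         * Per the SDM, the defined ADDRn_CFG encodings are 0 (disabled),
         * 1 (filtering) and 2 (TraceStop); larger values, or any non-zero
         * value for an address range the CPU does not report, are reserved.
         */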
1614         value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1615         if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
1616                 return 1;
1617         value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1618         if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
1619                 return 1;
1620         value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1621         if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
1622                 return 1;
1623         value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1624         if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
1625                 return 1;
1626
1627         return 0;
1628 }
1629
1630 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
1631                                         void *insn, int insn_len)
1632 {
1633         /*
1634          * Emulation of instructions in SGX enclaves is impossible as RIP does
1635          * not point at the failing instruction, and even if it did, the code
1636          * stream is inaccessible.  Inject #UD instead of exiting to userspace
1637          * so that guest userspace can't DoS the guest simply by triggering
1638          * emulation (enclaves are CPL3 only).
1639          */
1640         if (to_vmx(vcpu)->exit_reason.enclave_mode) {
1641                 kvm_queue_exception(vcpu, UD_VECTOR);
1642                 return false;
1643         }
1644         return true;
1645 }
1646
1647 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
1648 {
1649         union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
1650         unsigned long rip, orig_rip;
1651         u32 instr_len;
1652
1653         /*
1654          * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1655          * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1656          * set when EPT misconfig occurs.  In practice, real hardware updates
1657          * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1658          * (namely Hyper-V) don't set it due to it being undefined behavior,
1659          * i.e. we end up advancing IP with some random value.
1660          */
1661         if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
1662             exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
1663                 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1664
1665                 /*
1666                  * Emulating an enclave's instructions isn't supported as KVM
1667                  * cannot access the enclave's memory or its true RIP, e.g. the
1668                  * vmcs.GUEST_RIP points at the exit point of the enclave, not
1669                  * the RIP that actually triggered the VM-Exit.  But, because
1670                  * most instructions that cause VM-Exit will #UD in an enclave,
1671                  * most instruction-based VM-Exits simply do not occur.
1672                  *
1673                  * There are a few exceptions, notably the debug instructions
1674                  * INT1ICEBRK and INT3, as they are allowed in debug enclaves
1675                  * and generate #DB/#BP as expected, which KVM might intercept.
1676                  * But again, the CPU does the dirty work and saves an instr
1677                  * length of zero so VMMs don't shoot themselves in the foot.
1678                  * WARN if KVM tries to skip a non-zero length instruction on
1679                  * a VM-Exit from an enclave.
1680                  */
1681                 if (!instr_len)
1682                         goto rip_updated;
1683
1684                 WARN_ONCE(exit_reason.enclave_mode,
1685                           "skipping instruction after SGX enclave VM-Exit");
1686
1687                 orig_rip = kvm_rip_read(vcpu);
1688                 rip = orig_rip + instr_len;
1689 #ifdef CONFIG_X86_64
1690                 /*
1691                  * We need to mask out the high 32 bits of RIP if not in 64-bit
1692                  * mode, but just finding out that we are in 64-bit mode is
1693                  * quite expensive.  Only do it if there was a carry.
1694                  */
1695                 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1696                         rip = (u32)rip;
1697 #endif
1698                 kvm_rip_write(vcpu, rip);
1699         } else {
1700                 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1701                         return 0;
1702         }
1703
1704 rip_updated:
1705         /* skipping an emulated instruction also counts */
1706         vmx_set_interrupt_shadow(vcpu, 0);
1707
1708         return 1;
1709 }
1710
1711 /*
1712  * Recognizes a pending MTF VM-exit and records the nested state for later
1713  * delivery.
1714  */
1715 static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1716 {
1717         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1718         struct vcpu_vmx *vmx = to_vmx(vcpu);
1719
1720         if (!is_guest_mode(vcpu))
1721                 return;
1722
1723         /*
1724          * Per the SDM, MTF takes priority over debug-trap exceptions besides
1725          * TSS T-bit traps and ICEBP (INT1).  KVM doesn't emulate T-bit traps
1726          * or ICEBP (in the emulator proper), and skipping of ICEBP after an
1727          * intercepted #DB deliberately avoids single-step #DB and MTF updates
1728          * as ICEBP is higher priority than both.  As instruction emulation is
1729          * completed at this point (i.e. KVM is at the instruction boundary),
1730          * any #DB exception pending delivery must be a debug-trap of lower
1731          * priority than MTF.  Record the pending MTF state to be delivered in
1732          * vmx_check_nested_events().
1733          */
1734         if (nested_cpu_has_mtf(vmcs12) &&
1735             (!vcpu->arch.exception.pending ||
1736              vcpu->arch.exception.vector == DB_VECTOR) &&
1737             (!vcpu->arch.exception_vmexit.pending ||
1738              vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
1739                 vmx->nested.mtf_pending = true;
1740                 kvm_make_request(KVM_REQ_EVENT, vcpu);
1741         } else {
1742                 vmx->nested.mtf_pending = false;
1743         }
1744 }
1745
1746 static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
1747 {
1748         vmx_update_emulated_instruction(vcpu);
1749         return skip_emulated_instruction(vcpu);
1750 }
1751
1752 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1753 {
1754         /*
1755          * Ensure that we clear the HLT state in the VMCS.  We don't need to
1756          * explicitly skip the instruction because if the HLT state is set,
1757          * then the instruction is already executing and RIP has already been
1758          * advanced.
1759          */
1760         if (kvm_hlt_in_guest(vcpu->kvm) &&
1761                         vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1762                 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1763 }
1764
1765 static void vmx_inject_exception(struct kvm_vcpu *vcpu)
1766 {
1767         struct kvm_queued_exception *ex = &vcpu->arch.exception;
1768         u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
1769         struct vcpu_vmx *vmx = to_vmx(vcpu);
1770
1771         kvm_deliver_exception_payload(vcpu, ex);
1772
1773         if (ex->has_error_code) {
1774                 /*
1775                  * Despite the error code being architecturally defined as 32
1776                  * bits, and the VMCS field being 32 bits, Intel CPUs and thus
1777                  * VMX don't actually support setting bits 31:16.  Hardware
1778                  * will (should) never provide a bogus error code, but AMD CPUs
1779                  * do generate error codes with bits 31:16 set, and so KVM's
1780                  * ABI lets userspace shove in arbitrary 32-bit values.  Drop
1781                  * the upper bits to avoid VM-Fail, losing information that
1782                  * doesn't really exist is preferable to killing the VM.
1783                  */
1784                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
1785                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1786         }
1787
1788         if (vmx->rmode.vm86_active) {
1789                 int inc_eip = 0;
1790                 if (kvm_exception_is_soft(ex->vector))
1791                         inc_eip = vcpu->arch.event_exit_inst_len;
1792                 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
1793                 return;
1794         }
1795
1796         WARN_ON_ONCE(vmx->emulation_required);
1797
1798         if (kvm_exception_is_soft(ex->vector)) {
1799                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1800                              vmx->vcpu.arch.event_exit_inst_len);
1801                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1802         } else
1803                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1804
1805         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1806
1807         vmx_clear_hlt(vcpu);
1808 }
1809
1810 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
1811                                bool load_into_hardware)
1812 {
1813         struct vmx_uret_msr *uret_msr;
1814
1815         uret_msr = vmx_find_uret_msr(vmx, msr);
1816         if (!uret_msr)
1817                 return;
1818
1819         uret_msr->load_into_hardware = load_into_hardware;
1820 }
1821
1822 /*
1823  * Configure user return MSRs to automatically save, load, and restore MSRs
1824  * that need to be shoved into hardware when running the guest.  Note, omitting
1825  * an MSR here does _NOT_ mean it's not emulated, only that it will not be
1826  * loaded into hardware when running the guest.
1827  */
1828 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
1829 {
1830 #ifdef CONFIG_X86_64
1831         bool load_syscall_msrs;
1832
1833         /*
1834          * The SYSCALL MSRs are only needed on long mode guests, and only
1835          * when EFER.SCE is set.
1836          */
1837         load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
1838                             (vmx->vcpu.arch.efer & EFER_SCE);
1839
1840         vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
1841         vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
1842         vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
1843 #endif
1844         vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
1845
1846         vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
1847                            guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
1848                            guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
1849
1850         /*
1851          * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
1852          * kernel and old userspace.  If those guests run on a tsx=off host, do
1853          * allow guests to use TSX_CTRL, but don't change the value in hardware
1854          * so that TSX remains always disabled.
1855          */
1856         vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
1857
1858         /*
1859          * The set of MSRs to load may have changed, reload MSRs before the
1860          * next VM-Enter.
1861          */
1862         vmx->guest_uret_msrs_loaded = false;
1863 }
1864
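/*
 * When L2 is active, the TSC offset and multiplier seen by the guest combine
 * L1's values with vmcs12's; these helpers return only the vmcs12 (L2)
 * components, which common x86 code folds into the final values.
 */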
1865 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1866 {
1867         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1868
1869         if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
1870                 return vmcs12->tsc_offset;
1871
1872         return 0;
1873 }
1874
1875 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1876 {
1877         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1878
1879         if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
1880             nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
1881                 return vmcs12->tsc_multiplier;
1882
1883         return kvm_caps.default_tsc_scaling_ratio;
1884 }
1885
1886 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1887 {
1888         vmcs_write64(TSC_OFFSET, offset);
1889 }
1890
1891 static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1892 {
1893         vmcs_write64(TSC_MULTIPLIER, multiplier);
1894 }
1895
1896 /*
1897  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1898  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1899  * all guests if the "nested" module option is off, and can also be disabled
1900  * for a single guest by disabling its VMX cpuid bit.
1901  */
1902 bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1903 {
1904         return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
1905 }
1906
1907 /*
1908  * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
1909  * guest CPUID.  Note, KVM allows userspace to set "VMX in SMX" to maintain
1910  * backwards compatibility even though KVM doesn't support emulating SMX.  And
1911  * because userspace set "VMX in SMX", the guest must also be allowed to set it,
1912  * e.g. if the MSR is left unlocked and the guest does a RMW operation.
1913  */
1914 #define KVM_SUPPORTED_FEATURE_CONTROL  (FEAT_CTL_LOCKED                  | \
1915                                         FEAT_CTL_VMX_ENABLED_INSIDE_SMX  | \
1916                                         FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
1917                                         FEAT_CTL_SGX_LC_ENABLED          | \
1918                                         FEAT_CTL_SGX_ENABLED             | \
1919                                         FEAT_CTL_LMCE_ENABLED)
1920
1921 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
1922                                                     struct msr_data *msr)
1923 {
1924         uint64_t valid_bits;
1925
1926         /*
1927          * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
1928          * exposed to the guest.
1929          */
1930         WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
1931                      ~KVM_SUPPORTED_FEATURE_CONTROL);
1932
1933         if (!msr->host_initiated &&
1934             (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
1935                 return false;
1936
1937         if (msr->host_initiated)
1938                 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
1939         else
1940                 valid_bits = vmx->msr_ia32_feature_control_valid_bits;
1941
1942         return !(msr->data & ~valid_bits);
1943 }
1944
1945 static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
1946 {
1947         switch (msr->index) {
1948         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1949                 if (!nested)
1950                         return 1;
1951                 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1952         default:
1953                 return KVM_MSR_RET_INVALID;
1954         }
1955 }
1956
1957 /*
1958  * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
1959  * Returns 0 on success, non-0 otherwise.
1960  * Assumes vcpu_load() was already called.
1961  */
1962 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1963 {
1964         struct vcpu_vmx *vmx = to_vmx(vcpu);
1965         struct vmx_uret_msr *msr;
1966         u32 index;
1967
1968         switch (msr_info->index) {
1969 #ifdef CONFIG_X86_64
1970         case MSR_FS_BASE:
1971                 msr_info->data = vmcs_readl(GUEST_FS_BASE);
1972                 break;
1973         case MSR_GS_BASE:
1974                 msr_info->data = vmcs_readl(GUEST_GS_BASE);
1975                 break;
1976         case MSR_KERNEL_GS_BASE:
1977                 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
1978                 break;
1979 #endif
1980         case MSR_EFER:
1981                 return kvm_get_msr_common(vcpu, msr_info);
1982         case MSR_IA32_TSX_CTRL:
1983                 if (!msr_info->host_initiated &&
1984                     !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
1985                         return 1;
1986                 goto find_uret_msr;
1987         case MSR_IA32_UMWAIT_CONTROL:
1988                 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
1989                         return 1;
1990
1991                 msr_info->data = vmx->msr_ia32_umwait_control;
1992                 break;
1993         case MSR_IA32_SPEC_CTRL:
1994                 if (!msr_info->host_initiated &&
1995                     !guest_has_spec_ctrl_msr(vcpu))
1996                         return 1;
1997
1998                 msr_info->data = to_vmx(vcpu)->spec_ctrl;
1999                 break;
2000         case MSR_IA32_SYSENTER_CS:
2001                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2002                 break;
2003         case MSR_IA32_SYSENTER_EIP:
2004                 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
2005                 break;
2006         case MSR_IA32_SYSENTER_ESP:
2007                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
2008                 break;
2009         case MSR_IA32_BNDCFGS:
2010                 if (!kvm_mpx_supported() ||
2011                     (!msr_info->host_initiated &&
2012                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2013                         return 1;
2014                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
2015                 break;
2016         case MSR_IA32_MCG_EXT_CTL:
2017                 if (!msr_info->host_initiated &&
2018                     !(vmx->msr_ia32_feature_control &
2019                       FEAT_CTL_LMCE_ENABLED))
2020                         return 1;
2021                 msr_info->data = vcpu->arch.mcg_ext_ctl;
2022                 break;
2023         case MSR_IA32_FEAT_CTL:
2024                 msr_info->data = vmx->msr_ia32_feature_control;
2025                 break;
2026         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2027                 if (!msr_info->host_initiated &&
2028                     !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
2029                         return 1;
2030                 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
2031                         [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
2032                 break;
2033         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2034                 if (!nested_vmx_allowed(vcpu))
2035                         return 1;
2036                 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
2037                                     &msr_info->data))
2038                         return 1;
2039                 /*
2040                  * Enlightened VMCS v1 doesn't have certain VMCS fields but
2041                  * instead of just ignoring the features, different Hyper-V
2042                  * versions either try to use them and fail, or do some
2043                  * sanity checking and refuse to boot. Filter all unsupported
2044                  * features out.
2045                  */
2046                 if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
2047                         nested_evmcs_filter_control_msr(vcpu, msr_info->index,
2048                                                         &msr_info->data);
2049                 break;
2050         case MSR_IA32_RTIT_CTL:
2051                 if (!vmx_pt_mode_is_host_guest())
2052                         return 1;
2053                 msr_info->data = vmx->pt_desc.guest.ctl;
2054                 break;
2055         case MSR_IA32_RTIT_STATUS:
2056                 if (!vmx_pt_mode_is_host_guest())
2057                         return 1;
2058                 msr_info->data = vmx->pt_desc.guest.status;
2059                 break;
2060         case MSR_IA32_RTIT_CR3_MATCH:
2061                 if (!vmx_pt_mode_is_host_guest() ||
2062                         !intel_pt_validate_cap(vmx->pt_desc.caps,
2063                                                 PT_CAP_cr3_filtering))
2064                         return 1;
2065                 msr_info->data = vmx->pt_desc.guest.cr3_match;
2066                 break;
2067         case MSR_IA32_RTIT_OUTPUT_BASE:
2068                 if (!vmx_pt_mode_is_host_guest() ||
2069                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
2070                                         PT_CAP_topa_output) &&
2071                          !intel_pt_validate_cap(vmx->pt_desc.caps,
2072                                         PT_CAP_single_range_output)))
2073                         return 1;
2074                 msr_info->data = vmx->pt_desc.guest.output_base;
2075                 break;
2076         case MSR_IA32_RTIT_OUTPUT_MASK:
2077                 if (!vmx_pt_mode_is_host_guest() ||
2078                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
2079                                         PT_CAP_topa_output) &&
2080                          !intel_pt_validate_cap(vmx->pt_desc.caps,
2081                                         PT_CAP_single_range_output)))
2082                         return 1;
2083                 msr_info->data = vmx->pt_desc.guest.output_mask;
2084                 break;
2085         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2086                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2087                 if (!vmx_pt_mode_is_host_guest() ||
2088                     (index >= 2 * vmx->pt_desc.num_address_ranges))
2089                         return 1;
2090                 if (index % 2)
2091                         msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
2092                 else
2093                         msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
2094                 break;
2095         case MSR_IA32_DEBUGCTLMSR:
2096                 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
2097                 break;
2098         default:
2099         find_uret_msr:
2100                 msr = vmx_find_uret_msr(vmx, msr_info->index);
2101                 if (msr) {
2102                         msr_info->data = msr->data;
2103                         break;
2104                 }
2105                 return kvm_get_msr_common(vcpu, msr_info);
2106         }
2107
2108         return 0;
2109 }
2110
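/*
 * For a guest without long mode, IA32_SYSENTER_EIP/ESP effectively hold only
 * 32 bits; truncate the written value to mirror that behavior before it is
 * propagated to vmcs12 and the VMCS.
 */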
2111 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
2112                                                     u64 data)
2113 {
2114 #ifdef CONFIG_X86_64
2115         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
2116                 return (u32)data;
2117 #endif
2118         return (unsigned long)data;
2119 }
2120
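/*
 * Returns the set of DEBUGCTL bits the guest (or, for host-initiated writes,
 * userspace) is allowed to set, based on hardware support and the vCPU's
 * CPUID and PMU configuration.
 */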
2121 static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
2122 {
2123         u64 debugctl = 0;
2124
2125         if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
2126             (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
2127                 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
2128
2129         if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
2130             (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
2131                 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
2132
2133         return debugctl;
2134 }
2135
2136 static int vmx_set_msr_ia32_cmd(struct kvm_vcpu *vcpu,
2137                                 struct msr_data *msr_info,
2138                                 bool guest_has_feat, u64 cmd,
2139                                 int x86_feature_bit)
2140 {
2141         if (!msr_info->host_initiated && !guest_has_feat)
2142                 return 1;
2143
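        /* Only the architecturally defined command bit may be set. */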
2144         if (msr_info->data & ~cmd)
2145                 return 1;
2146         if (!boot_cpu_has(x86_feature_bit))
2147                 return 1;
2148         if (!msr_info->data)
2149                 return 0;
2150
2151         wrmsrl(msr_info->index, cmd);
2152
2153         /*
2154          * For non-nested:
2155          * When it's written (to non-zero) for the first time, pass
2156          * it through.
2157          *
2158          * For nested:
2159          * The handling of the MSR bitmap for L2 guests is done in
2160          * nested_vmx_prepare_msr_bitmap. We should not touch the
2161          * vmcs02.msr_bitmap here since it gets completely overwritten
2162          * in the merging.
2163          */
2164         vmx_disable_intercept_for_msr(vcpu, msr_info->index, MSR_TYPE_W);
2165
2166         return 0;
2167 }
2168
2169 /*
2170  * Writes msr value into the appropriate "register".
2171  * Returns 0 on success, non-0 otherwise.
2172  * Assumes vcpu_load() was already called.
2173  */
2174 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2175 {
2176         struct vcpu_vmx *vmx = to_vmx(vcpu);
2177         struct vmx_uret_msr *msr;
2178         int ret = 0;
2179         u32 msr_index = msr_info->index;
2180         u64 data = msr_info->data;
2181         u32 index;
2182
2183         switch (msr_index) {
2184         case MSR_EFER:
2185                 ret = kvm_set_msr_common(vcpu, msr_info);
2186                 break;
2187 #ifdef CONFIG_X86_64
2188         case MSR_FS_BASE:
2189                 vmx_segment_cache_clear(vmx);
2190                 vmcs_writel(GUEST_FS_BASE, data);
2191                 break;
2192         case MSR_GS_BASE:
2193                 vmx_segment_cache_clear(vmx);
2194                 vmcs_writel(GUEST_GS_BASE, data);
2195                 break;
2196         case MSR_KERNEL_GS_BASE:
2197                 vmx_write_guest_kernel_gs_base(vmx, data);
2198                 break;
2199         case MSR_IA32_XFD:
2200                 ret = kvm_set_msr_common(vcpu, msr_info);
2201                 /*
2202                  * Always intercepting WRMSR could incur non-negligible
2203                  * overhead given that XFD may be changed frequently on
2204                  * guest context switches. Disable write interception
2205                  * upon the first write with a non-zero value (indicating
2206                  * potential usage on dynamic xfeatures). Also update
2207                  * exception bitmap to trap #NM for proper virtualization
2208                  * of guest xfd_err.
2209                  */
2210                 if (!ret && data) {
2211                         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
2212                                                       MSR_TYPE_RW);
2213                         vcpu->arch.xfd_no_write_intercept = true;
2214                         vmx_update_exception_bitmap(vcpu);
2215                 }
2216                 break;
2217 #endif
2218         case MSR_IA32_SYSENTER_CS:
2219                 if (is_guest_mode(vcpu))
2220                         get_vmcs12(vcpu)->guest_sysenter_cs = data;
2221                 vmcs_write32(GUEST_SYSENTER_CS, data);
2222                 break;
2223         case MSR_IA32_SYSENTER_EIP:
2224                 if (is_guest_mode(vcpu)) {
2225                         data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2226                         get_vmcs12(vcpu)->guest_sysenter_eip = data;
2227                 }
2228                 vmcs_writel(GUEST_SYSENTER_EIP, data);
2229                 break;
2230         case MSR_IA32_SYSENTER_ESP:
2231                 if (is_guest_mode(vcpu)) {
2232                         data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2233                         get_vmcs12(vcpu)->guest_sysenter_esp = data;
2234                 }
2235                 vmcs_writel(GUEST_SYSENTER_ESP, data);
2236                 break;
2237         case MSR_IA32_DEBUGCTLMSR: {
2238                 u64 invalid;
2239
2240                 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
2241                 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
2242                         kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
2243                         data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2244                         invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2245                 }
2246
2247                 if (invalid)
2248                         return 1;
2249
2250                 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2251                                                 VM_EXIT_SAVE_DEBUG_CONTROLS)
2252                         get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2253
2254                 vmcs_write64(GUEST_IA32_DEBUGCTL, data);
2255                 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
2256                     (data & DEBUGCTLMSR_LBR))
2257                         intel_pmu_create_guest_lbr_event(vcpu);
2258                 return 0;
2259         }
2260         case MSR_IA32_BNDCFGS:
2261                 if (!kvm_mpx_supported() ||
2262                     (!msr_info->host_initiated &&
2263                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2264                         return 1;
2265                 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
2266                     (data & MSR_IA32_BNDCFGS_RSVD))
2267                         return 1;
2268
2269                 if (is_guest_mode(vcpu) &&
2270                     ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
2271                      (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
2272                         get_vmcs12(vcpu)->guest_bndcfgs = data;
2273
2274                 vmcs_write64(GUEST_BNDCFGS, data);
2275                 break;
2276         case MSR_IA32_UMWAIT_CONTROL:
2277                 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2278                         return 1;
2279
2280                 /* Reserved bit 1 and the upper bits 63:32 must be zero. */
2281                 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2282                         return 1;
2283
2284                 vmx->msr_ia32_umwait_control = data;
2285                 break;
2286         case MSR_IA32_SPEC_CTRL:
2287                 if (!msr_info->host_initiated &&
2288                     !guest_has_spec_ctrl_msr(vcpu))
2289                         return 1;
2290
2291                 if (kvm_spec_ctrl_test_value(data))
2292                         return 1;
2293
2294                 vmx->spec_ctrl = data;
2295                 if (!data)
2296                         break;
2297
2298                 /*
2299                  * For non-nested:
2300                  * When it's written (to non-zero) for the first time, pass
2301                  * it through.
2302                  *
2303                  * For nested:
2304                  * The handling of the MSR bitmap for L2 guests is done in
2305                  * nested_vmx_prepare_msr_bitmap. We should not touch the
2306                  * vmcs02.msr_bitmap here since it gets completely overwritten
2307                  * in the merging. We update the vmcs01 here for L1 as well
2308                  * since it will end up touching the MSR anyway now.
2309                  */
2310                 vmx_disable_intercept_for_msr(vcpu,
2311                                               MSR_IA32_SPEC_CTRL,
2312                                               MSR_TYPE_RW);
2313                 break;
2314         case MSR_IA32_TSX_CTRL:
2315                 if (!msr_info->host_initiated &&
2316                     !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2317                         return 1;
2318                 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2319                         return 1;
2320                 goto find_uret_msr;
2321         case MSR_IA32_PRED_CMD:
2322                 ret = vmx_set_msr_ia32_cmd(vcpu, msr_info,
2323                                            guest_has_pred_cmd_msr(vcpu),
2324                                            PRED_CMD_IBPB,
2325                                            X86_FEATURE_IBPB);
2326                 break;
2327         case MSR_IA32_FLUSH_CMD:
2328                 ret = vmx_set_msr_ia32_cmd(vcpu, msr_info,
2329                                            guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D),
2330                                            L1D_FLUSH,
2331                                            X86_FEATURE_FLUSH_L1D);
2332                 break;
2333         case MSR_IA32_CR_PAT:
2334                 if (!kvm_pat_valid(data))
2335                         return 1;
2336
2337                 if (is_guest_mode(vcpu) &&
2338                     get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2339                         get_vmcs12(vcpu)->guest_ia32_pat = data;
2340
2341                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2342                         vmcs_write64(GUEST_IA32_PAT, data);
2343                         vcpu->arch.pat = data;
2344                         break;
2345                 }
2346                 ret = kvm_set_msr_common(vcpu, msr_info);
2347                 break;
2348         case MSR_IA32_MCG_EXT_CTL:
2349                 if ((!msr_info->host_initiated &&
2350                      !(to_vmx(vcpu)->msr_ia32_feature_control &
2351                        FEAT_CTL_LMCE_ENABLED)) ||
2352                     (data & ~MCG_EXT_CTL_LMCE_EN))
2353                         return 1;
2354                 vcpu->arch.mcg_ext_ctl = data;
2355                 break;
2356         case MSR_IA32_FEAT_CTL:
2357                 if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
2358                         return 1;
2359
2360                 vmx->msr_ia32_feature_control = data;
2361                 if (msr_info->host_initiated && data == 0)
2362                         vmx_leave_nested(vcpu);
2363
2364                 /* SGX may be enabled/disabled by guest's firmware */
2365                 vmx_write_encls_bitmap(vcpu, NULL);
2366                 break;
2367         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2368                 /*
2369                  * On real hardware, the LE hash MSRs are writable before
2370                  * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
2371                  * at which point SGX related bits in IA32_FEATURE_CONTROL
2372                  * become writable.
2373                  *
2374                  * KVM does not emulate SGX activation for simplicity, so
2375                  * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
2376                  * is unlocked.  This is technically not architectural
2377                  * behavior, but it's close enough.
2378                  */
2379                 if (!msr_info->host_initiated &&
2380                     (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
2381                     ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
2382                     !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
2383                         return 1;
2384                 vmx->msr_ia32_sgxlepubkeyhash
2385                         [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
2386                 break;
2387         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2388                 if (!msr_info->host_initiated)
2389                         return 1; /* they are read-only */
2390                 if (!nested_vmx_allowed(vcpu))
2391                         return 1;
2392                 return vmx_set_vmx_msr(vcpu, msr_index, data);
2393         case MSR_IA32_RTIT_CTL:
2394                 if (!vmx_pt_mode_is_host_guest() ||
2395                         vmx_rtit_ctl_check(vcpu, data) ||
2396                         vmx->nested.vmxon)
2397                         return 1;
2398                 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2399                 vmx->pt_desc.guest.ctl = data;
2400                 pt_update_intercept_for_msr(vcpu);
2401                 break;
2402         case MSR_IA32_RTIT_STATUS:
2403                 if (!pt_can_write_msr(vmx))
2404                         return 1;
2405                 if (data & MSR_IA32_RTIT_STATUS_MASK)
2406                         return 1;
2407                 vmx->pt_desc.guest.status = data;
2408                 break;
2409         case MSR_IA32_RTIT_CR3_MATCH:
2410                 if (!pt_can_write_msr(vmx))
2411                         return 1;
2412                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2413                                            PT_CAP_cr3_filtering))
2414                         return 1;
2415                 vmx->pt_desc.guest.cr3_match = data;
2416                 break;
2417         case MSR_IA32_RTIT_OUTPUT_BASE:
2418                 if (!pt_can_write_msr(vmx))
2419                         return 1;
2420                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2421                                            PT_CAP_topa_output) &&
2422                     !intel_pt_validate_cap(vmx->pt_desc.caps,
2423                                            PT_CAP_single_range_output))
2424                         return 1;
2425                 if (!pt_output_base_valid(vcpu, data))
2426                         return 1;
2427                 vmx->pt_desc.guest.output_base = data;
2428                 break;
2429         case MSR_IA32_RTIT_OUTPUT_MASK:
2430                 if (!pt_can_write_msr(vmx))
2431                         return 1;
2432                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2433                                            PT_CAP_topa_output) &&
2434                     !intel_pt_validate_cap(vmx->pt_desc.caps,
2435                                            PT_CAP_single_range_output))
2436                         return 1;
2437                 vmx->pt_desc.guest.output_mask = data;
2438                 break;
2439         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2440                 if (!pt_can_write_msr(vmx))
2441                         return 1;
2442                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2443                 if (index >= 2 * vmx->pt_desc.num_address_ranges)
2444                         return 1;
2445                 if (is_noncanonical_address(data, vcpu))
2446                         return 1;
2447                 if (index % 2)
2448                         vmx->pt_desc.guest.addr_b[index / 2] = data;
2449                 else
2450                         vmx->pt_desc.guest.addr_a[index / 2] = data;
2451                 break;
2452         case MSR_IA32_PERF_CAPABILITIES:
2453                 if (data && !vcpu_to_pmu(vcpu)->version)
2454                         return 1;
2455                 if (data & PMU_CAP_LBR_FMT) {
2456                         if ((data & PMU_CAP_LBR_FMT) !=
2457                             (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
2458                                 return 1;
2459                         if (!cpuid_model_is_consistent(vcpu))
2460                                 return 1;
2461                 }
2462                 if (data & PERF_CAP_PEBS_FORMAT) {
2463                         if ((data & PERF_CAP_PEBS_MASK) !=
2464                             (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
2465                                 return 1;
2466                         if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
2467                                 return 1;
2468                         if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
2469                                 return 1;
2470                         if (!cpuid_model_is_consistent(vcpu))
2471                                 return 1;
2472                 }
2473                 ret = kvm_set_msr_common(vcpu, msr_info);
2474                 break;
2475
2476         default:
2477         find_uret_msr:
2478                 msr = vmx_find_uret_msr(vmx, msr_index);
2479                 if (msr)
2480                         ret = vmx_set_guest_uret_msr(vmx, msr, data);
2481                 else
2482                         ret = kvm_set_msr_common(vcpu, msr_info);
2483         }
2484
2485         /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
2486         if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
2487                 vmx_update_fb_clear_dis(vcpu, vmx);
2488
2489         return ret;
2490 }
2491
2492 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2493 {
2494         unsigned long guest_owned_bits;
2495
2496         kvm_register_mark_available(vcpu, reg);
2497
2498         switch (reg) {
2499         case VCPU_REGS_RSP:
2500                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2501                 break;
2502         case VCPU_REGS_RIP:
2503                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2504                 break;
2505         case VCPU_EXREG_PDPTR:
2506                 if (enable_ept)
2507                         ept_save_pdptrs(vcpu);
2508                 break;
2509         case VCPU_EXREG_CR0:
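                /*
                 * Guest-owned bits can be modified by the guest without a
                 * VM-Exit, so refresh them from the VMCS; KVM-owned bits come
                 * from the cached value (here and for CR4 below).
                 */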
2510                 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2511
2512                 vcpu->arch.cr0 &= ~guest_owned_bits;
2513                 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2514                 break;
2515         case VCPU_EXREG_CR3:
2516                 /*
2517                  * When intercepting CR3 loads, e.g. for shadow paging, KVM's
2518                  * CR3 is loaded into hardware, not the guest's CR3.
2519                  */
2520                 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
2521                         vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2522                 break;
2523         case VCPU_EXREG_CR4:
2524                 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2525
2526                 vcpu->arch.cr4 &= ~guest_owned_bits;
2527                 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2528                 break;
2529         default:
2530                 KVM_BUG_ON(1, vcpu->kvm);
2531                 break;
2532         }
2533 }
2534
2535 /*
2536  * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2537  * directly instead of going through cpu_has(), to ensure KVM is trapping
2538  * ENCLS whenever it's supported in hardware.  It does not matter whether
2539  * the host OS supports or has enabled SGX.
2540  */
2541 static bool cpu_has_sgx(void)
2542 {
2543         return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2544 }
2545
2546 /*
2547  * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2548  * can't be used due to errata where VM Exit may incorrectly clear
2549  * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the
2550  * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2551  */
2552 static bool cpu_has_perf_global_ctrl_bug(void)
2553 {
2554         if (boot_cpu_data.x86 == 0x6) {
2555                 switch (boot_cpu_data.x86_model) {
2556                 case INTEL_FAM6_NEHALEM_EP:     /* AAK155 */
2557                 case INTEL_FAM6_NEHALEM:        /* AAP115 */
2558                 case INTEL_FAM6_WESTMERE:       /* AAT100 */
2559                 case INTEL_FAM6_WESTMERE_EP:    /* BC86,AAY89,BD102 */
2560                 case INTEL_FAM6_NEHALEM_EX:     /* BA97 */
2561                         return true;
2562                 default:
2563                         break;
2564                 }
2565         }
2566
2567         return false;
2568 }
2569
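/*
 * Combine the required (ctl_min) and optional (ctl_opt) controls, drop any
 * optional bits the CPU does not support, and fail if a required bit cannot
 * be set per the capability MSR (low word = allowed 0-settings, high word =
 * allowed 1-settings).
 */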
2570 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
2571 {
2572         u32 vmx_msr_low, vmx_msr_high;
2573         u32 ctl = ctl_min | ctl_opt;
2574
2575         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2576
2577         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2578         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
2579
2580         /* Ensure minimum (required) set of control bits are supported. */
2581         if (ctl_min & ~ctl)
2582                 return -EIO;
2583
2584         *result = ctl;
2585         return 0;
2586 }
2587
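/*
 * 64-bit controls (e.g. tertiary processor-based controls) have no required
 * bits; the capability MSR is a pure allowed-1 mask, so simply clear any
 * optional bits the CPU does not support.
 */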
2588 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
2589 {
2590         u64 allowed;
2591
2592         rdmsrl(msr, allowed);
2593
2594         return  ctl_opt & allowed;
2595 }
2596
2597 static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2598                              struct vmx_capability *vmx_cap)
2599 {
2600         u32 vmx_msr_low, vmx_msr_high;
2601         u32 _pin_based_exec_control = 0;
2602         u32 _cpu_based_exec_control = 0;
2603         u32 _cpu_based_2nd_exec_control = 0;
2604         u64 _cpu_based_3rd_exec_control = 0;
2605         u32 _vmexit_control = 0;
2606         u32 _vmentry_control = 0;
2607         u64 misc_msr;
2608         int i;
2609
2610         /*
2611          * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
2612          * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
2613          * intercepts writes to PAT and EFER, i.e. never enables those controls.
2614          */
2615         struct {
2616                 u32 entry_control;
2617                 u32 exit_control;
2618         } const vmcs_entry_exit_pairs[] = {
2619                 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,  VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
2620                 { VM_ENTRY_LOAD_IA32_PAT,               VM_EXIT_LOAD_IA32_PAT },
2621                 { VM_ENTRY_LOAD_IA32_EFER,              VM_EXIT_LOAD_IA32_EFER },
2622                 { VM_ENTRY_LOAD_BNDCFGS,                VM_EXIT_CLEAR_BNDCFGS },
2623                 { VM_ENTRY_LOAD_IA32_RTIT_CTL,          VM_EXIT_CLEAR_IA32_RTIT_CTL },
2624         };
2625
2626         memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2627
2628         if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
2629                                 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
2630                                 MSR_IA32_VMX_PROCBASED_CTLS,
2631                                 &_cpu_based_exec_control))
2632                 return -EIO;
2633         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2634                 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
2635                                         KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
2636                                         MSR_IA32_VMX_PROCBASED_CTLS2,
2637                                         &_cpu_based_2nd_exec_control))
2638                         return -EIO;
2639         }
2640 #ifndef CONFIG_X86_64
2641         if (!(_cpu_based_2nd_exec_control &
2642                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2643                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2644 #endif
2645
2646         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2647                 _cpu_based_2nd_exec_control &= ~(
2648                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2649                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2650                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2651
2652         rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2653                 &vmx_cap->ept, &vmx_cap->vpid);
2654
2655         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
2656             vmx_cap->ept) {
2657                 pr_warn_once("EPT CAP should not exist if not support "
2658                                 "1-setting enable EPT VM-execution control\n");
2659
2660                 if (error_on_inconsistent_vmcs_config)
2661                         return -EIO;
2662
2663                 vmx_cap->ept = 0;
2664         }
2665         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2666             vmx_cap->vpid) {
2667                 pr_warn_once("VPID CAP should not exist if not support "
2668                                 "1-setting enable VPID VM-execution control\n");
2669
2670                 if (error_on_inconsistent_vmcs_config)
2671                         return -EIO;
2672
2673                 vmx_cap->vpid = 0;
2674         }
2675
2676         if (!cpu_has_sgx())
2677                 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
2678
2679         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
2680                 _cpu_based_3rd_exec_control =
2681                         adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
2682                                               MSR_IA32_VMX_PROCBASED_CTLS3);
2683
2684         if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
2685                                 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
2686                                 MSR_IA32_VMX_EXIT_CTLS,
2687                                 &_vmexit_control))
2688                 return -EIO;
2689
2690         if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
2691                                 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
2692                                 MSR_IA32_VMX_PINBASED_CTLS,
2693                                 &_pin_based_exec_control))
2694                 return -EIO;
2695
2696         if (cpu_has_broken_vmx_preemption_timer())
2697                 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2698         if (!(_cpu_based_2nd_exec_control &
2699                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2700                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2701
2702         if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
2703                                 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
2704                                 MSR_IA32_VMX_ENTRY_CTLS,
2705                                 &_vmentry_control))
2706                 return -EIO;
2707
2708         for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
2709                 u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
2710                 u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
2711
2712                 if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
2713                         continue;
2714
2715                 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
2716                              _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
2717
2718                 if (error_on_inconsistent_vmcs_config)
2719                         return -EIO;
2720
2721                 _vmentry_control &= ~n_ctrl;
2722                 _vmexit_control &= ~x_ctrl;
2723         }
2724
2725         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2726
2727         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2728         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2729                 return -EIO;
2730
2731 #ifdef CONFIG_X86_64
2732         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2733         if (vmx_msr_high & (1u<<16))
2734                 return -EIO;
2735 #endif
2736
2737         /* Require Write-Back (WB) memory type for VMCS accesses. */
2738         if (((vmx_msr_high >> 18) & 15) != 6)
2739                 return -EIO;
2740
2741         rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
2742
2743         vmcs_conf->size = vmx_msr_high & 0x1fff;
2744         vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
2745
2746         vmcs_conf->revision_id = vmx_msr_low;
2747
2748         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2749         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2750         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2751         vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
2752         vmcs_conf->vmexit_ctrl         = _vmexit_control;
2753         vmcs_conf->vmentry_ctrl        = _vmentry_control;
2754         vmcs_conf->misc = misc_msr;
2755
2756 #if IS_ENABLED(CONFIG_HYPERV)
2757         if (enlightened_vmcs)
2758                 evmcs_sanitize_exec_ctrls(vmcs_conf);
2759 #endif
2760
2761         return 0;
2762 }
2763
2764 static bool kvm_is_vmx_supported(void)
2765 {
2766         int cpu = raw_smp_processor_id();
2767
2768         if (!cpu_has_vmx()) {
2769                 pr_err("VMX not supported by CPU %d\n", cpu);
2770                 return false;
2771         }
2772
2773         if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2774             !this_cpu_has(X86_FEATURE_VMX)) {
2775                 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
2776                 return false;
2777         }
2778
2779         return true;
2780 }
2781
2782 static int vmx_check_processor_compat(void)
2783 {
2784         int cpu = raw_smp_processor_id();
2785         struct vmcs_config vmcs_conf;
2786         struct vmx_capability vmx_cap;
2787
2788         if (!kvm_is_vmx_supported())
2789                 return -EIO;
2790
2791         if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
2792                 pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
2793                 return -EIO;
2794         }
2795         if (nested)
2796                 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
2797         if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
2798                 pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
2799                 return -EIO;
2800         }
2801         return 0;
2802 }
2803
2804 static int kvm_cpu_vmxon(u64 vmxon_pointer)
2805 {
2806         u64 msr;
2807
2808         cr4_set_bits(X86_CR4_VMXE);
2809
2810         asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
2811                           _ASM_EXTABLE(1b, %l[fault])
2812                           : : [vmxon_pointer] "m"(vmxon_pointer)
2813                           : : fault);
2814         return 0;
2815
2816 fault:
2817         WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2818                   rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
2819         cr4_clear_bits(X86_CR4_VMXE);
2820
2821         return -EFAULT;
2822 }
2823
2824 static int vmx_hardware_enable(void)
2825 {
2826         int cpu = raw_smp_processor_id();
2827         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2828         int r;
2829
2830         if (cr4_read_shadow() & X86_CR4_VMXE)
2831                 return -EBUSY;
2832
2833         /*
2834          * This can happen if we hot-added a CPU but failed to allocate
2835          * the VP assist page for it.
2836          */
2837         if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
2838                 return -EFAULT;
2839
2840         intel_pt_handle_vmx(1);
2841
2842         r = kvm_cpu_vmxon(phys_addr);
2843         if (r) {
2844                 intel_pt_handle_vmx(0);
2845                 return r;
2846         }
2847
2848         if (enable_ept)
2849                 ept_sync_global();
2850
2851         return 0;
2852 }
2853
2854 static void vmclear_local_loaded_vmcss(void)
2855 {
2856         int cpu = raw_smp_processor_id();
2857         struct loaded_vmcs *v, *n;
2858
2859         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2860                                  loaded_vmcss_on_cpu_link)
2861                 __loaded_vmcs_clear(v);
2862 }
2863
2864 static void vmx_hardware_disable(void)
2865 {
2866         vmclear_local_loaded_vmcss();
2867
2868         if (cpu_vmxoff())
2869                 kvm_spurious_fault();
2870
2871         hv_reset_evmcs();
2872
2873         intel_pt_handle_vmx(0);
2874 }
2875
2876 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
2877 {
2878         int node = cpu_to_node(cpu);
2879         struct page *pages;
2880         struct vmcs *vmcs;
2881
2882         pages = __alloc_pages_node(node, flags, 0);
2883         if (!pages)
2884                 return NULL;
2885         vmcs = page_address(pages);
2886         memset(vmcs, 0, vmcs_config.size);
2887
2888         /* KVM supports Enlightened VMCS v1 only */
2889         if (kvm_is_using_evmcs())
2890                 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2891         else
2892                 vmcs->hdr.revision_id = vmcs_config.revision_id;
2893
2894         if (shadow)
2895                 vmcs->hdr.shadow_vmcs = 1;
2896         return vmcs;
2897 }
2898
2899 void free_vmcs(struct vmcs *vmcs)
2900 {
2901         free_page((unsigned long)vmcs);
2902 }
2903
2904 /*
2905  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2906  */
2907 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2908 {
2909         if (!loaded_vmcs->vmcs)
2910                 return;
2911         loaded_vmcs_clear(loaded_vmcs);
2912         free_vmcs(loaded_vmcs->vmcs);
2913         loaded_vmcs->vmcs = NULL;
2914         if (loaded_vmcs->msr_bitmap)
2915                 free_page((unsigned long)loaded_vmcs->msr_bitmap);
2916         WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2917 }
2918
2919 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2920 {
2921         loaded_vmcs->vmcs = alloc_vmcs(false);
2922         if (!loaded_vmcs->vmcs)
2923                 return -ENOMEM;
2924
2925         vmcs_clear(loaded_vmcs->vmcs);
2926
2927         loaded_vmcs->shadow_vmcs = NULL;
2928         loaded_vmcs->hv_timer_soft_disabled = false;
2929         loaded_vmcs->cpu = -1;
2930         loaded_vmcs->launched = 0;
2931
2932         if (cpu_has_vmx_msr_bitmap()) {
2933                 loaded_vmcs->msr_bitmap = (unsigned long *)
2934                                 __get_free_page(GFP_KERNEL_ACCOUNT);
2935                 if (!loaded_vmcs->msr_bitmap)
2936                         goto out_vmcs;
2937                 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2938         }
2939
2940         memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
2941         memset(&loaded_vmcs->controls_shadow, 0,
2942                 sizeof(struct vmcs_controls_shadow));
2943
2944         return 0;
2945
2946 out_vmcs:
2947         free_loaded_vmcs(loaded_vmcs);
2948         return -ENOMEM;
2949 }
2950
2951 static void free_kvm_area(void)
2952 {
2953         int cpu;
2954
2955         for_each_possible_cpu(cpu) {
2956                 free_vmcs(per_cpu(vmxarea, cpu));
2957                 per_cpu(vmxarea, cpu) = NULL;
2958         }
2959 }
2960
2961 static __init int alloc_kvm_area(void)
2962 {
2963         int cpu;
2964
2965         for_each_possible_cpu(cpu) {
2966                 struct vmcs *vmcs;
2967
2968                 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
2969                 if (!vmcs) {
2970                         free_kvm_area();
2971                         return -ENOMEM;
2972                 }
2973
2974                 /*
2975                  * When eVMCS is enabled, alloc_vmcs_cpu() sets
2976                  * vmcs->revision_id to KVM_EVMCS_VERSION instead of
2977                  * revision_id reported by MSR_IA32_VMX_BASIC.
2978                  *
2979                  * However, even though not explicitly documented by
2980                  * the TLFS, the vmxarea passed as the VMXON argument should
2981                  * still be marked with the revision_id reported by the
2982                  * physical CPU.
2983                  */
2984                 if (kvm_is_using_evmcs())
2985                         vmcs->hdr.revision_id = vmcs_config.revision_id;
2986
2987                 per_cpu(vmxarea, cpu) = vmcs;
2988         }
2989         return 0;
2990 }
2991
2992 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
2993                 struct kvm_segment *save)
2994 {
2995         if (!emulate_invalid_guest_state) {
2996                 /*
2997                  * CS and SS RPL should be equal during guest entry according
2998                  * to VMX spec, but in reality it is not always so. Since vcpu
2999                  * is in the middle of the transition from real mode to
3000                  * protected mode it is safe to assume that RPL 0 is a good
3001                  * default value.
3002                  */
3003                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3004                         save->selector &= ~SEGMENT_RPL_MASK;
3005                 save->dpl = save->selector & SEGMENT_RPL_MASK;
3006                 save->s = 1;
3007         }
3008         __vmx_set_segment(vcpu, save, seg);
3009 }
3010
3011 static void enter_pmode(struct kvm_vcpu *vcpu)
3012 {
3013         unsigned long flags;
3014         struct vcpu_vmx *vmx = to_vmx(vcpu);
3015
3016         /*
3017          * Update the real mode segment cache. It may not be up to date if a
3018          * segment register was written while the vcpu was in guest mode.
3019          */
3020         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3021         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3022         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3023         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3024         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3025         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3026
3027         vmx->rmode.vm86_active = 0;
3028
3029         __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3030
3031         flags = vmcs_readl(GUEST_RFLAGS);
3032         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3033         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3034         vmcs_writel(GUEST_RFLAGS, flags);
3035
3036         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3037                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3038
3039         vmx_update_exception_bitmap(vcpu);
3040
3041         fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3042         fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3043         fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3044         fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3045         fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3046         fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3047 }
3048
3049 static void fix_rmode_seg(int seg, struct kvm_segment *save)
3050 {
3051         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3052         struct kvm_segment var = *save;
3053
3054         var.dpl = 0x3;
3055         if (seg == VCPU_SREG_CS)
3056                 var.type = 0x3;
3057
3058         if (!emulate_invalid_guest_state) {
3059                 var.selector = var.base >> 4;
3060                 var.base = var.base & 0xffff0;
3061                 var.limit = 0xffff;
3062                 var.g = 0;
3063                 var.db = 0;
3064                 var.present = 1;
3065                 var.s = 1;
3066                 var.l = 0;
3067                 var.unusable = 0;
3068                 var.type = 0x3;
3069                 var.avl = 0;
3070                 if (save->base & 0xf)
3071                         pr_warn_once("segment base is not paragraph aligned "
3072                                      "when entering protected mode (seg=%d)", seg);
3073         }
3074
3075         vmcs_write16(sf->selector, var.selector);
3076         vmcs_writel(sf->base, var.base);
3077         vmcs_write32(sf->limit, var.limit);
3078         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3079 }
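
/*
 * Editor's note: a tiny, hypothetical sketch of the real-mode paragraph
 * (16-byte) relationship that fix_rmode_seg() relies on when it derives
 * selector = base >> 4: linear = (selector << 4) + offset, so only a
 * paragraph-aligned, sub-1MiB base round-trips exactly.  The helper
 * names are illustration-only assumptions.
 */
#if 0	/* illustration only, never compiled */
static u32 example_rmode_linear(u16 selector, u16 offset)
{
	return ((u32)selector << 4) + offset;
}

static bool example_rmode_base_round_trips(u32 base)
{
	return base < (1u << 20) && !(base & 0xf);
}
#endif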
3080
3081 static void enter_rmode(struct kvm_vcpu *vcpu)
3082 {
3083         unsigned long flags;
3084         struct vcpu_vmx *vmx = to_vmx(vcpu);
3085         struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
3086
3087         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3088         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3089         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3090         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3091         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3092         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3093         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3094
3095         vmx->rmode.vm86_active = 1;
3096
3097         /*
3098          * Very old userspace does not call KVM_SET_TSS_ADDR before running
3099          * the vcpu. Warn the user that an update is overdue.
3100          */
3101         if (!kvm_vmx->tss_addr)
3102                 pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
3103
3104         vmx_segment_cache_clear(vmx);
3105
3106         vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
3107         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3108         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3109
3110         flags = vmcs_readl(GUEST_RFLAGS);
3111         vmx->rmode.save_rflags = flags;
3112
3113         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3114
3115         vmcs_writel(GUEST_RFLAGS, flags);
3116         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3117         vmx_update_exception_bitmap(vcpu);
3118
3119         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3120         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3121         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3122         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3123         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3124         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3125 }
3126
3127 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3128 {
3129         struct vcpu_vmx *vmx = to_vmx(vcpu);
3130
3131         /* Nothing to do if hardware doesn't support EFER. */
3132         if (!vmx_find_uret_msr(vmx, MSR_EFER))
3133                 return 0;
3134
3135         vcpu->arch.efer = efer;
3136 #ifdef CONFIG_X86_64
3137         if (efer & EFER_LMA)
3138                 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
3139         else
3140                 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
3141 #else
3142         if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
3143                 return 1;
3144 #endif
3145
3146         vmx_setup_uret_msrs(vmx);
3147         return 0;
3148 }
3149
3150 #ifdef CONFIG_X86_64
3151
3152 static void enter_lmode(struct kvm_vcpu *vcpu)
3153 {
3154         u32 guest_tr_ar;
3155
3156         vmx_segment_cache_clear(to_vmx(vcpu));
3157
3158         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3159         if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
3160                 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
3161                                      __func__);
3162                 vmcs_write32(GUEST_TR_AR_BYTES,
3163                              (guest_tr_ar & ~VMX_AR_TYPE_MASK)
3164                              | VMX_AR_TYPE_BUSY_64_TSS);
3165         }
3166         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3167 }
3168
3169 static void exit_lmode(struct kvm_vcpu *vcpu)
3170 {
3171         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3172 }
3173
3174 #endif
3175
3176 static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
3177 {
3178         struct vcpu_vmx *vmx = to_vmx(vcpu);
3179
3180         /*
3181          * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
3182          * the CPU is not required to invalidate guest-physical mappings on
3183          * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
3184          * associated with the root EPT structure and not any particular VPID
3185          * (INVVPID also isn't required to invalidate guest-physical mappings).
3186          */
3187         if (enable_ept) {
3188                 ept_sync_global();
3189         } else if (enable_vpid) {
3190                 if (cpu_has_vmx_invvpid_global()) {
3191                         vpid_sync_vcpu_global();
3192                 } else {
3193                         vpid_sync_vcpu_single(vmx->vpid);
3194                         vpid_sync_vcpu_single(vmx->nested.vpid02);
3195                 }
3196         }
3197 }
3198
3199 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
3200 {
3201         if (is_guest_mode(vcpu))
3202                 return nested_get_vpid02(vcpu);
3203         return to_vmx(vcpu)->vpid;
3204 }
3205
3206 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
3207 {
3208         struct kvm_mmu *mmu = vcpu->arch.mmu;
3209         u64 root_hpa = mmu->root.hpa;
3210
3211         /* No flush required if the current context is invalid. */
3212         if (!VALID_PAGE(root_hpa))
3213                 return;
3214
3215         if (enable_ept)
3216                 ept_sync_context(construct_eptp(vcpu, root_hpa,
3217                                                 mmu->root_role.level));
3218         else
3219                 vpid_sync_context(vmx_get_current_vpid(vcpu));
3220 }
3221
3222 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3223 {
3224         /*
3225          * vpid_sync_vcpu_addr() is a nop if vpid==0; see the comment in
3226          * vmx_flush_tlb_guest() for an explanation of why this is ok.
3227          */
3228         vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
3229 }
3230
3231 static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
3232 {
3233         /*
3234          * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
3235          * vpid couldn't be allocated for this vCPU.  VM-Enter and VM-Exit are
3236          * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
3237          * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
3238          * i.e. no explicit INVVPID is necessary.
3239          */
3240         vpid_sync_context(vmx_get_current_vpid(vcpu));
3241 }
3242
3243 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
3244 {
3245         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3246
3247         if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
3248                 return;
3249
3250         if (is_pae_paging(vcpu)) {
3251                 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3252                 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3253                 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3254                 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3255         }
3256 }
3257
3258 void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3259 {
3260         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3261
3262         if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3263                 return;
3264
3265         mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3266         mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3267         mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3268         mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3269
3270         kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
3271 }
3272
3273 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
3274                           CPU_BASED_CR3_STORE_EXITING)
3275
3276 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3277 {
3278         struct vcpu_vmx *vmx = to_vmx(vcpu);
3279         unsigned long hw_cr0, old_cr0_pg;
3280         u32 tmp;
3281
3282         old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
3283
3284         hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3285         if (is_unrestricted_guest(vcpu))
3286                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3287         else {
3288                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3289                 if (!enable_ept)
3290                         hw_cr0 |= X86_CR0_WP;
3291
3292                 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3293                         enter_pmode(vcpu);
3294
3295                 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3296                         enter_rmode(vcpu);
3297         }
3298
3299         vmcs_writel(CR0_READ_SHADOW, cr0);
3300         vmcs_writel(GUEST_CR0, hw_cr0);
3301         vcpu->arch.cr0 = cr0;
3302         kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
3303
3304 #ifdef CONFIG_X86_64
3305         if (vcpu->arch.efer & EFER_LME) {
3306                 if (!old_cr0_pg && (cr0 & X86_CR0_PG))
3307                         enter_lmode(vcpu);
3308                 else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
3309                         exit_lmode(vcpu);
3310         }
3311 #endif
3312
3313         if (enable_ept && !is_unrestricted_guest(vcpu)) {
3314                 /*
3315                  * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
3316                  * the below code _enables_ CR3 exiting, vmx_cache_reg() will
3317                  * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
3318                  * KVM's CR3 is installed.
3319                  */
3320                 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3321                         vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
3322
3323                 /*
3324                  * When running with EPT but not unrestricted guest, KVM must
3325                  * intercept CR3 accesses when paging is _disabled_.  This is
3326                  * necessary because restricted guests can't actually run with
3327                  * paging disabled, and so KVM stuffs its own CR3 in order to
3328                  * run the guest when identity mapped page tables.
3329                  *
3330                  * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
3331                  * update, it may be stale with respect to CR3 interception,
3332                  * e.g. after nested VM-Enter.
3333                  *
3334                  * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
3335                  * stores to forward them to L1, even if KVM does not need to
3336                  * intercept them to preserve its identity mapped page tables.
3337                  */
3338                 if (!(cr0 & X86_CR0_PG)) {
3339                         exec_controls_setbit(vmx, CR3_EXITING_BITS);
3340                 } else if (!is_guest_mode(vcpu)) {
3341                         exec_controls_clearbit(vmx, CR3_EXITING_BITS);
3342                 } else {
3343                         tmp = exec_controls_get(vmx);
3344                         tmp &= ~CR3_EXITING_BITS;
3345                         tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
3346                         exec_controls_set(vmx, tmp);
3347                 }
3348
3349                 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
3350                 if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
3351                         vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3352
3353                 /*
3354                  * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
3355                  * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
3356                  */
3357                 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
3358                         kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
3359         }
3360
3361         /* depends on vcpu->arch.cr0 to be set to a new value */
3362         vmx->emulation_required = vmx_emulation_required(vcpu);
3363 }
3364
3365 static int vmx_get_max_tdp_level(void)
3366 {
3367         if (cpu_has_vmx_ept_5levels())
3368                 return 5;
3369         return 4;
3370 }
3371
3372 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
3373 {
3374         u64 eptp = VMX_EPTP_MT_WB;
3375
3376         eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3377
3378         if (enable_ept_ad_bits &&
3379             (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
3380                 eptp |= VMX_EPTP_AD_ENABLE_BIT;
3381         eptp |= root_hpa;
3382
3383         return eptp;
3384 }
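
/*
 * Editor's note: a hedged example of the value construct_eptp() builds
 * for a common configuration (4-level EPT with A/D bits enabled).  The
 * VMX_EPTP_* constants come from asm/vmx.h; the helper name is an
 * assumption for illustration only.
 */
#if 0	/* illustration only, never compiled */
static u64 example_eptp_4level_ad(hpa_t root_hpa)
{
	/* WB memory type | 4-level page walk | A/D enabled | root PA */
	return VMX_EPTP_MT_WB | VMX_EPTP_PWL_4 | VMX_EPTP_AD_ENABLE_BIT |
	       root_hpa;
}
#endif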
3385
3386 static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
3387                              int root_level)
3388 {
3389         struct kvm *kvm = vcpu->kvm;
3390         bool update_guest_cr3 = true;
3391         unsigned long guest_cr3;
3392         u64 eptp;
3393
3394         if (enable_ept) {
3395                 eptp = construct_eptp(vcpu, root_hpa, root_level);
3396                 vmcs_write64(EPT_POINTER, eptp);
3397
3398                 hv_track_root_tdp(vcpu, root_hpa);
3399
3400                 if (!enable_unrestricted_guest && !is_paging(vcpu))
3401                         guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3402                 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
3403                         guest_cr3 = vcpu->arch.cr3;
3404                 else /* vmcs.GUEST_CR3 is already up-to-date. */
3405                         update_guest_cr3 = false;
3406                 vmx_ept_load_pdptrs(vcpu);
3407         } else {
3408                 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
3409         }
3410
3411         if (update_guest_cr3)
3412                 vmcs_writel(GUEST_CR3, guest_cr3);
3413 }
3414
3415
3416 static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3417 {
3418         /*
3419          * We operate under the default treatment of SMM, so VMX cannot be
3420          * enabled under SMM.  Note, whether or not VMXE is allowed at all,
3421          * i.e. is a reserved bit, is handled by common x86 code.
3422          */
3423         if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3424                 return false;
3425
3426         if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3427                 return false;
3428
3429         return true;
3430 }
3431
3432 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3433 {
3434         unsigned long old_cr4 = vcpu->arch.cr4;
3435         struct vcpu_vmx *vmx = to_vmx(vcpu);
3436         /*
3437          * Pass through the host's Machine Check Enable value to hw_cr4, which
3438          * is in force while we are in guest mode.  Do not let guests control
3439          * this bit, even if the host's CR4.MCE == 0.
3440          */
3441         unsigned long hw_cr4;
3442
3443         hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3444         if (is_unrestricted_guest(vcpu))
3445                 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3446         else if (vmx->rmode.vm86_active)
3447                 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3448         else
3449                 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
3450
3451         if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
3452                 if (cr4 & X86_CR4_UMIP) {
3453                         secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3454                         hw_cr4 &= ~X86_CR4_UMIP;
3455                 } else if (!is_guest_mode(vcpu) ||
3456                         !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3457                         secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3458                 }
3459         }
3460
3461         vcpu->arch.cr4 = cr4;
3462         kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
3463
3464         if (!is_unrestricted_guest(vcpu)) {
3465                 if (enable_ept) {
3466                         if (!is_paging(vcpu)) {
3467                                 hw_cr4 &= ~X86_CR4_PAE;
3468                                 hw_cr4 |= X86_CR4_PSE;
3469                         } else if (!(cr4 & X86_CR4_PAE)) {
3470                                 hw_cr4 &= ~X86_CR4_PAE;
3471                         }
3472                 }
3473
3474                 /*
3475                  * SMEP/SMAP/PKU is disabled by hardware if the CPU is in
3476                  * non-paging mode.  To emulate this behavior, SMEP/SMAP/PKU
3477                  * needs to be manually disabled when the guest switches to
3478                  * non-paging mode.
3479                  *
3480                  * If !enable_unrestricted_guest, the CPU is always running
3481                  * with CR0.PG=1 and CR4 needs to be modified.
3482                  * If enable_unrestricted_guest, the CPU automatically
3483                  * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3484                  */
3485                 if (!is_paging(vcpu))
3486                         hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3487         }
3488
3489         vmcs_writel(CR4_READ_SHADOW, cr4);
3490         vmcs_writel(GUEST_CR4, hw_cr4);
3491
3492         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
3493                 kvm_update_cpuid_runtime(vcpu);
3494 }
3495
3496 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3497 {
3498         struct vcpu_vmx *vmx = to_vmx(vcpu);
3499         u32 ar;
3500
3501         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3502                 *var = vmx->rmode.segs[seg];
3503                 if (seg == VCPU_SREG_TR
3504                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3505                         return;
3506                 var->base = vmx_read_guest_seg_base(vmx, seg);
3507                 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3508                 return;
3509         }
3510         var->base = vmx_read_guest_seg_base(vmx, seg);
3511         var->limit = vmx_read_guest_seg_limit(vmx, seg);
3512         var->selector = vmx_read_guest_seg_selector(vmx, seg);
3513         ar = vmx_read_guest_seg_ar(vmx, seg);
3514         var->unusable = (ar >> 16) & 1;
3515         var->type = ar & 15;
3516         var->s = (ar >> 4) & 1;
3517         var->dpl = (ar >> 5) & 3;
3518         /*
3519          * Some userspaces do not preserve the unusable property. Since a usable
3520          * segment has to be present according to the VMX spec, we can use the
3521          * present property to work around the userspace bug by making an unusable
3522          * segment always nonpresent. vmx_segment_access_rights() already marks a
3523          * nonpresent segment as unusable.
3524          */
3525         var->present = !var->unusable;
3526         var->avl = (ar >> 12) & 1;
3527         var->l = (ar >> 13) & 1;
3528         var->db = (ar >> 14) & 1;
3529         var->g = (ar >> 15) & 1;
3530 }
3531
3532 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3533 {
3534         struct kvm_segment s;
3535
3536         if (to_vmx(vcpu)->rmode.vm86_active) {
3537                 vmx_get_segment(vcpu, &s, seg);
3538                 return s.base;
3539         }
3540         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3541 }
3542
3543 int vmx_get_cpl(struct kvm_vcpu *vcpu)
3544 {
3545         struct vcpu_vmx *vmx = to_vmx(vcpu);
3546
3547         if (unlikely(vmx->rmode.vm86_active))
3548                 return 0;
3549         else {
3550                 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3551                 return VMX_AR_DPL(ar);
3552         }
3553 }
3554
3555 static u32 vmx_segment_access_rights(struct kvm_segment *var)
3556 {
3557         u32 ar;
3558
3559         ar = var->type & 15;
3560         ar |= (var->s & 1) << 4;
3561         ar |= (var->dpl & 3) << 5;
3562         ar |= (var->present & 1) << 7;
3563         ar |= (var->avl & 1) << 12;
3564         ar |= (var->l & 1) << 13;
3565         ar |= (var->db & 1) << 14;
3566         ar |= (var->g & 1) << 15;
3567         ar |= (var->unusable || !var->present) << 16;
3568
3569         return ar;
3570 }
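
/*
 * Editor's note: an illustrative inverse of vmx_segment_access_rights()
 * showing where each segment attribute lives in the VMCS access-rights
 * word; the real unpacking is done in vmx_get_segment() above.  The
 * helper name is a hypothetical example.
 */
#if 0	/* illustration only, never compiled */
static void example_unpack_access_rights(u32 ar, struct kvm_segment *var)
{
	var->type     = ar & 15;
	var->s        = (ar >> 4) & 1;
	var->dpl      = (ar >> 5) & 3;
	var->present  = (ar >> 7) & 1;
	var->avl      = (ar >> 12) & 1;
	var->l        = (ar >> 13) & 1;
	var->db       = (ar >> 14) & 1;
	var->g        = (ar >> 15) & 1;
	var->unusable = (ar >> 16) & 1;
}
#endif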
3571
3572 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3573 {
3574         struct vcpu_vmx *vmx = to_vmx(vcpu);
3575         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3576
3577         vmx_segment_cache_clear(vmx);
3578
3579         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3580                 vmx->rmode.segs[seg] = *var;
3581                 if (seg == VCPU_SREG_TR)
3582                         vmcs_write16(sf->selector, var->selector);
3583                 else if (var->s)
3584                         fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3585                 return;
3586         }
3587
3588         vmcs_writel(sf->base, var->base);
3589         vmcs_write32(sf->limit, var->limit);
3590         vmcs_write16(sf->selector, var->selector);
3591
3592         /*
3593          * Fix the "Accessed" bit in the AR field of segment registers for
3594          * older qemu binaries.
3595          * The IA-32 architecture specifies that at the time of processor
3596          * reset the "Accessed" bit in the AR field of segment registers is 1,
3597          * but qemu sets it to 0 in its userland code. This causes an invalid
3598          * guest state vmexit when "unrestricted guest" mode is turned on.
3599          * A fix for this setup issue in cpu_reset has been pushed to the
3600          * qemu tree. Newer qemu binaries with that fix no longer need this
3601          * kvm hack.
3602          */
3603         if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
3604                 var->type |= 0x1; /* Accessed */
3605
3606         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3607 }
3608
3609 static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3610 {
3611         __vmx_set_segment(vcpu, var, seg);
3612
3613         to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
3614 }
3615
3616 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3617 {
3618         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3619
3620         *db = (ar >> 14) & 1;
3621         *l = (ar >> 13) & 1;
3622 }
3623
3624 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3625 {
3626         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3627         dt->address = vmcs_readl(GUEST_IDTR_BASE);
3628 }
3629
3630 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3631 {
3632         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3633         vmcs_writel(GUEST_IDTR_BASE, dt->address);
3634 }
3635
3636 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3637 {
3638         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3639         dt->address = vmcs_readl(GUEST_GDTR_BASE);
3640 }
3641
3642 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3643 {
3644         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3645         vmcs_writel(GUEST_GDTR_BASE, dt->address);
3646 }
3647
3648 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3649 {
3650         struct kvm_segment var;
3651         u32 ar;
3652
3653         vmx_get_segment(vcpu, &var, seg);
3654         var.dpl = 0x3;
3655         if (seg == VCPU_SREG_CS)
3656                 var.type = 0x3;
3657         ar = vmx_segment_access_rights(&var);
3658
3659         if (var.base != (var.selector << 4))
3660                 return false;
3661         if (var.limit != 0xffff)
3662                 return false;
3663         if (ar != 0xf3)
3664                 return false;
3665
3666         return true;
3667 }
3668
3669 static bool code_segment_valid(struct kvm_vcpu *vcpu)
3670 {
3671         struct kvm_segment cs;
3672         unsigned int cs_rpl;
3673
3674         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3675         cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3676
3677         if (cs.unusable)
3678                 return false;
3679         if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3680                 return false;
3681         if (!cs.s)
3682                 return false;
3683         if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3684                 if (cs.dpl > cs_rpl)
3685                         return false;
3686         } else {
3687                 if (cs.dpl != cs_rpl)
3688                         return false;
3689         }
3690         if (!cs.present)
3691                 return false;
3692
3693         /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3694         return true;
3695 }
3696
3697 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3698 {
3699         struct kvm_segment ss;
3700         unsigned int ss_rpl;
3701
3702         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3703         ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3704
3705         if (ss.unusable)
3706                 return true;
3707         if (ss.type != 3 && ss.type != 7)
3708                 return false;
3709         if (!ss.s)
3710                 return false;
3711         if (ss.dpl != ss_rpl) /* DPL != RPL */
3712                 return false;
3713         if (!ss.present)
3714                 return false;
3715
3716         return true;
3717 }
3718
3719 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3720 {
3721         struct kvm_segment var;
3722         unsigned int rpl;
3723
3724         vmx_get_segment(vcpu, &var, seg);
3725         rpl = var.selector & SEGMENT_RPL_MASK;
3726
3727         if (var.unusable)
3728                 return true;
3729         if (!var.s)
3730                 return false;
3731         if (!var.present)
3732                 return false;
3733         if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3734                 if (var.dpl < rpl) /* DPL < RPL */
3735                         return false;
3736         }
3737
3738         /* TODO: Add other members to kvm_segment_field to allow checking for other access
3739          * rights flags
3740          */
3741         return true;
3742 }
3743
3744 static bool tr_valid(struct kvm_vcpu *vcpu)
3745 {
3746         struct kvm_segment tr;
3747
3748         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3749
3750         if (tr.unusable)
3751                 return false;
3752         if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
3753                 return false;
3754         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3755                 return false;
3756         if (!tr.present)
3757                 return false;
3758
3759         return true;
3760 }
3761
3762 static bool ldtr_valid(struct kvm_vcpu *vcpu)
3763 {
3764         struct kvm_segment ldtr;
3765
3766         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3767
3768         if (ldtr.unusable)
3769                 return true;
3770         if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
3771                 return false;
3772         if (ldtr.type != 2)
3773                 return false;
3774         if (!ldtr.present)
3775                 return false;
3776
3777         return true;
3778 }
3779
3780 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3781 {
3782         struct kvm_segment cs, ss;
3783
3784         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3785         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3786
3787         return ((cs.selector & SEGMENT_RPL_MASK) ==
3788                  (ss.selector & SEGMENT_RPL_MASK));
3789 }
3790
3791 /*
3792  * Check if the guest state is valid. Returns true if valid, false if
3793  * not.
3794  * We assume that registers are always usable.
3795  */
3796 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
3797 {
3798         /* real mode guest state checks */
3799         if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3800                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3801                         return false;
3802                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3803                         return false;
3804                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3805                         return false;
3806                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3807                         return false;
3808                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3809                         return false;
3810                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3811                         return false;
3812         } else {
3813         /* protected mode guest state checks */
3814                 if (!cs_ss_rpl_check(vcpu))
3815                         return false;
3816                 if (!code_segment_valid(vcpu))
3817                         return false;
3818                 if (!stack_segment_valid(vcpu))
3819                         return false;
3820                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3821                         return false;
3822                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3823                         return false;
3824                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3825                         return false;
3826                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3827                         return false;
3828                 if (!tr_valid(vcpu))
3829                         return false;
3830                 if (!ldtr_valid(vcpu))
3831                         return false;
3832         }
3833         /* TODO:
3834          * - Add checks on RIP
3835          * - Add checks on RFLAGS
3836          */
3837
3838         return true;
3839 }
3840
3841 static int init_rmode_tss(struct kvm *kvm, void __user *ua)
3842 {
3843         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3844         u16 data;
3845         int i;
3846
3847         for (i = 0; i < 3; i++) {
3848                 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
3849                         return -EFAULT;
3850         }
3851
3852         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3853         if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
3854                 return -EFAULT;
3855
3856         data = ~0;
3857         if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
3858                 return -EFAULT;
3859
3860         return 0;
3861 }
3862
3863 static int init_rmode_identity_map(struct kvm *kvm)
3864 {
3865         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
3866         int i, r = 0;
3867         void __user *uaddr;
3868         u32 tmp;
3869
3870         /* Protect kvm_vmx->ept_identity_pagetable_done. */
3871         mutex_lock(&kvm->slots_lock);
3872
3873         if (likely(kvm_vmx->ept_identity_pagetable_done))
3874                 goto out;
3875
3876         if (!kvm_vmx->ept_identity_map_addr)
3877                 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3878
3879         uaddr = __x86_set_memory_region(kvm,
3880                                         IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3881                                         kvm_vmx->ept_identity_map_addr,
3882                                         PAGE_SIZE);
3883         if (IS_ERR(uaddr)) {
3884                 r = PTR_ERR(uaddr);
3885                 goto out;
3886         }
3887
3888         /* Set up identity-mapping pagetable for EPT in real mode */
3889         for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
3890                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3891                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3892                 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
3893                         r = -EFAULT;
3894                         goto out;
3895                 }
3896         }
3897         kvm_vmx->ept_identity_pagetable_done = true;
3898
3899 out:
3900         mutex_unlock(&kvm->slots_lock);
3901         return r;
3902 }
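
/*
 * Editor's note: a hedged sketch of the 4 MiB large-page identity PDEs
 * written by init_rmode_identity_map() above; entry i maps the guest
 * physical range [i << 22, (i + 1) << 22) onto itself.  The helper name
 * is an assumption for illustration only.
 */
#if 0	/* illustration only, never compiled */
static u32 example_identity_pde(unsigned int i)
{
	return (i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
	       _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE;
}
#endif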
3903
3904 static void seg_setup(int seg)
3905 {
3906         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3907         unsigned int ar;
3908
3909         vmcs_write16(sf->selector, 0);
3910         vmcs_writel(sf->base, 0);
3911         vmcs_write32(sf->limit, 0xffff);
3912         ar = 0x93;
3913         if (seg == VCPU_SREG_CS)
3914                 ar |= 0x08; /* code segment */
3915
3916         vmcs_write32(sf->ar_bytes, ar);
3917 }
3918
3919 int allocate_vpid(void)
3920 {
3921         int vpid;
3922
3923         if (!enable_vpid)
3924                 return 0;
3925         spin_lock(&vmx_vpid_lock);
3926         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
3927         if (vpid < VMX_NR_VPIDS)
3928                 __set_bit(vpid, vmx_vpid_bitmap);
3929         else
3930                 vpid = 0;
3931         spin_unlock(&vmx_vpid_lock);
3932         return vpid;
3933 }
3934
3935 void free_vpid(int vpid)
3936 {
3937         if (!enable_vpid || vpid == 0)
3938                 return;
3939         spin_lock(&vmx_vpid_lock);
3940         __clear_bit(vpid, vmx_vpid_bitmap);
3941         spin_unlock(&vmx_vpid_lock);
3942 }
3943
3944 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
3945 {
3946         /*
3947          * When KVM is a nested hypervisor on top of Hyper-V and uses the
3948          * 'Enlightened MSR Bitmap' feature, L0 needs to know that the MSR
3949          * bitmap has changed.
3950          */
3951         if (kvm_is_using_evmcs()) {
3952                 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
3953
3954                 if (evmcs->hv_enlightenments_control.msr_bitmap)
3955                         evmcs->hv_clean_fields &=
3956                                 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
3957         }
3958
3959         vmx->nested.force_msr_bitmap_recalc = true;
3960 }
3961
3962 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
3963 {
3964         struct vcpu_vmx *vmx = to_vmx(vcpu);
3965         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3966
3967         if (!cpu_has_vmx_msr_bitmap())
3968                 return;
3969
3970         vmx_msr_bitmap_l01_changed(vmx);
3971
3972         /*
3973          * Mark the desired intercept state in the shadow bitmap; this is
3974          * needed for resync when the MSR filters change.
3975          */
3976         if (is_valid_passthrough_msr(msr)) {
3977                 int idx = possible_passthrough_msr_slot(msr);
3978
3979                 if (idx != -ENOENT) {
3980                         if (type & MSR_TYPE_R)
3981                                 clear_bit(idx, vmx->shadow_msr_intercept.read);
3982                         if (type & MSR_TYPE_W)
3983                                 clear_bit(idx, vmx->shadow_msr_intercept.write);
3984                 }
3985         }
3986
3987         if ((type & MSR_TYPE_R) &&
3988             !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
3989                 vmx_set_msr_bitmap_read(msr_bitmap, msr);
3990                 type &= ~MSR_TYPE_R;
3991         }
3992
3993         if ((type & MSR_TYPE_W) &&
3994             !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
3995                 vmx_set_msr_bitmap_write(msr_bitmap, msr);
3996                 type &= ~MSR_TYPE_W;
3997         }
3998
3999         if (type & MSR_TYPE_R)
4000                 vmx_clear_msr_bitmap_read(msr_bitmap, msr);
4001
4002         if (type & MSR_TYPE_W)
4003                 vmx_clear_msr_bitmap_write(msr_bitmap, msr);
4004 }
4005
4006 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
4007 {
4008         struct vcpu_vmx *vmx = to_vmx(vcpu);
4009         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
4010
4011         if (!cpu_has_vmx_msr_bitmap())
4012                 return;
4013
4014         vmx_msr_bitmap_l01_changed(vmx);
4015
4016         /*
4017          * Mark the desired intercept state in the shadow bitmap; this is
4018          * needed for resync when the MSR filter changes.
4019          */
4020         if (is_valid_passthrough_msr(msr)) {
4021                 int idx = possible_passthrough_msr_slot(msr);
4022
4023                 if (idx != -ENOENT) {
4024                         if (type & MSR_TYPE_R)
4025                                 set_bit(idx, vmx->shadow_msr_intercept.read);
4026                         if (type & MSR_TYPE_W)
4027                                 set_bit(idx, vmx->shadow_msr_intercept.write);
4028                 }
4029         }
4030
4031         if (type & MSR_TYPE_R)
4032                 vmx_set_msr_bitmap_read(msr_bitmap, msr);
4033
4034         if (type & MSR_TYPE_W)
4035                 vmx_set_msr_bitmap_write(msr_bitmap, msr);
4036 }
4037
4038 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
4039 {
4040         /*
4041          * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
4042          * of the MSR bitmap.  KVM emulates APIC registers up through 0x3f0,
4043          * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
4044          */
4045         const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
4046         const int write_idx = read_idx + (0x800 / sizeof(u64));
4047         struct vcpu_vmx *vmx = to_vmx(vcpu);
4048         u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
4049         u8 mode;
4050
4051         if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
4052                 return;
4053
4054         if (cpu_has_secondary_exec_ctrls() &&
4055             (secondary_exec_controls_get(vmx) &
4056              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
4057                 mode = MSR_BITMAP_MODE_X2APIC;
4058                 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
4059                         mode |= MSR_BITMAP_MODE_X2APIC_APICV;
4060         } else {
4061                 mode = 0;
4062         }
4063
4064         if (mode == vmx->x2apic_msr_bitmap_mode)
4065                 return;
4066
4067         vmx->x2apic_msr_bitmap_mode = mode;
4068
4069         /*
4070          * Reset the bitmap for MSRs 0x800 - 0x83f.  Leave AMD's uber-extended
4071          * registers (0x840 and above) intercepted; KVM doesn't support them.
4072          * Intercept all writes by default and poke holes as needed.  Pass
4073          * through reads for all valid registers by default in x2APIC+APICv
4074          * mode; only the current timer count needs on-demand emulation by KVM.
4075          */
4076         if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
4077                 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
4078         else
4079                 msr_bitmap[read_idx] = ~0ull;
4080         msr_bitmap[write_idx] = ~0ull;
4081
4082         /*
4083          * TPR reads and writes can be virtualized even if virtual interrupt
4084          * delivery is not in use.
4085          */
4086         vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
4087                                   !(mode & MSR_BITMAP_MODE_X2APIC));
4088
4089         if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
4090                 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
4091                 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
4092                 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
4093                 if (enable_ipiv)
4094                         vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
4095         }
4096 }
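
/*
 * Editor's note: a hedged sketch of the read_idx/write_idx arithmetic
 * used in vmx_update_msr_bitmap_x2apic().  The VMX MSR bitmap is one
 * 4 KiB page with read bits for MSRs 0x0-0x1fff at byte offset 0x0 and
 * the corresponding write bits at byte offset 0x800, one bit per MSR;
 * for the x2APIC block starting at MSR 0x800 this yields u64 indices 32
 * and 288.  The helper name is an illustration-only assumption.
 */
#if 0	/* illustration only, never compiled */
static void example_x2apic_bitmap_indices(void)
{
	BUILD_BUG_ON(0x800 / 64 != 32);			/* read half:  byte 0x100 */
	BUILD_BUG_ON(0x800 / 64 + 0x800 / 8 != 288);	/* write half: byte 0x900 */
}
#endif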
4097
4098 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
4099 {
4100         struct vcpu_vmx *vmx = to_vmx(vcpu);
4101         bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4102         u32 i;
4103
4104         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
4105         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
4106         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
4107         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
4108         for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
4109                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
4110                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
4111         }
4112 }
4113
4114 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
4115 {
4116         struct vcpu_vmx *vmx = to_vmx(vcpu);
4117         void *vapic_page;
4118         u32 vppr;
4119         int rvi;
4120
4121         if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
4122                 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
4123                 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
4124                 return false;
4125
4126         rvi = vmx_get_rvi();
4127
4128         vapic_page = vmx->nested.virtual_apic_map.hva;
4129         vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
4130
4131         return ((rvi & 0xf0) > (vppr & 0xf0));
4132 }
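
/*
 * Editor's note: a minimal sketch of the final comparison in
 * vmx_guest_apic_has_interrupt(), assuming only the SDM rule that a
 * pending virtual interrupt is deliverable when the priority class of
 * RVI (bits 7:4) exceeds that of the virtual PPR.  The helper name is a
 * hypothetical example.
 */
#if 0	/* illustration only, never compiled */
static bool example_virtual_intr_deliverable(u8 rvi, u8 vppr)
{
	return (rvi & 0xf0) > (vppr & 0xf0);
}
#endif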
4133
4134 static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
4135 {
4136         struct vcpu_vmx *vmx = to_vmx(vcpu);
4137         u32 i;
4138
4139         /*
4140          * Redo intercept permissions for MSRs that KVM is passing through to
4141          * the guest.  Disabling interception will check the new MSR filter and
4142          * ensure that KVM enables interception if userspace wants to filter
4143          * the MSR.  MSRs that KVM is already intercepting don't need to be
4144          * refreshed since KVM is going to intercept them regardless of what
4145          * userspace wants.
4146          */
4147         for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
4148                 u32 msr = vmx_possible_passthrough_msrs[i];
4149
4150                 if (!test_bit(i, vmx->shadow_msr_intercept.read))
4151                         vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
4152
4153                 if (!test_bit(i, vmx->shadow_msr_intercept.write))
4154                         vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
4155         }
4156
4157         /* PT MSRs can be passed through iff PT is exposed to the guest. */
4158         if (vmx_pt_mode_is_host_guest())
4159                 pt_update_intercept_for_msr(vcpu);
4160 }
4161
4162 static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
4163                                                      int pi_vec)
4164 {
4165 #ifdef CONFIG_SMP
4166         if (vcpu->mode == IN_GUEST_MODE) {
4167                 /*
4168                  * The vector of the virtual interrupt has already been set in the PIR.
4169                  * Send a notification event to deliver the virtual interrupt
4170                  * unless the vCPU is the currently running vCPU, i.e. the
4171                  * event is being sent from a fastpath VM-Exit handler, in
4172                  * which case the PIR will be synced to the vIRR before
4173                  * re-entering the guest.
4174                  *
4175                  * When the target is not the running vCPU, the following
4176                  * possibilities emerge:
4177                  *
4178                  * Case 1: vCPU stays in non-root mode. Sending a notification
4179                  * event posts the interrupt to the vCPU.
4180                  *
4181                  * Case 2: vCPU exits to root mode and is still runnable. The
4182                  * PIR will be synced to the vIRR before re-entering the guest.
4183                  * Sending a notification event is ok as the host IRQ handler
4184                  * will ignore the spurious event.
4185                  *
4186                  * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
4187                  * has already synced PIR to vIRR and never blocks the vCPU if
4188                  * the vIRR is not empty. Therefore, a blocked vCPU here does
4189                  * not wait for any requested interrupts in PIR, and sending a
4190                  * notification event also results in a benign, spurious event.
4191                  */
4192
4193                 if (vcpu != kvm_get_running_vcpu())
4194                         apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
4195                 return;
4196         }
4197 #endif
4198         /*
4199          * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
4200          * otherwise do nothing as KVM will grab the highest priority pending
4201          * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
4202          */
4203         kvm_vcpu_wake_up(vcpu);
4204 }
4205
4206 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4207                                                 int vector)
4208 {
4209         struct vcpu_vmx *vmx = to_vmx(vcpu);
4210
4211         if (is_guest_mode(vcpu) &&
4212             vector == vmx->nested.posted_intr_nv) {
4213                 /*
4214                  * If a posted interrupt is not recognized by hardware,
4215                  * we will process it on the next vmentry.
4216                  */
4217                 vmx->nested.pi_pending = true;
4218                 kvm_make_request(KVM_REQ_EVENT, vcpu);
4219
4220                 /*
4221                  * This pairs with the smp_mb_*() after setting vcpu->mode in
4222                  * vcpu_enter_guest() to guarantee the vCPU sees the event
4223                  * request if triggering a posted interrupt "fails" because
4224                  * vcpu->mode != IN_GUEST_MODE.  The extra barrier is needed as
4225                  * the smp_wmb() in kvm_make_request() only ensures everything
4226                  * done before making the request is visible when the request
4227                  * is visible, it doesn't ensure ordering between the store to
4228                  * vcpu->requests and the load from vcpu->mode.
4229                  */
4230                 smp_mb__after_atomic();
4231
4232                 /* the PIR and ON have been set by L1. */
4233                 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
4234                 return 0;
4235         }
4236         return -1;
4237 }
4238 /*
4239  * Send an interrupt to a vcpu via the posted-interrupt mechanism:
4240  * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
4241  * notification and hardware will sync PIR to vIRR atomically.
4242  * 2. If the target vcpu isn't running (root mode), kick it to pick up the
4243  * interrupt from PIR on the next vmentry.
4244  */
4245 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4246 {
4247         struct vcpu_vmx *vmx = to_vmx(vcpu);
4248         int r;
4249
4250         r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4251         if (!r)
4252                 return 0;
4253
4254         /* Note, this is called iff the local APIC is in-kernel. */
4255         if (!vcpu->arch.apic->apicv_active)
4256                 return -1;
4257
4258         if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4259                 return 0;
4260
4261         /* If a previous notification has sent the IPI, nothing to do.  */
4262         if (pi_test_and_set_on(&vmx->pi_desc))
4263                 return 0;
4264
4265         /*
4266          * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
4267          * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
4268          * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
4269          * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
4270          */
4271         kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
4272         return 0;
4273 }
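
/*
 * Condensed sketch (illustrative, not upstream code) of the non-nested
 * delivery protocol implemented above, assuming the target vCPU is in
 * non-root mode:
 *
 *     pi_test_and_set_pir(vector, &vmx->pi_desc);    // publish the vector
 *     if (!pi_test_and_set_on(&vmx->pi_desc))        // first notifier wins
 *             kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
 *
 * If the vCPU is in root mode instead, the notification is skipped or ends
 * up spurious, and KVM syncs PIR to vIRR via ->sync_pir_to_irr() before the
 * next VM-Enter (see kvm_vcpu_trigger_posted_interrupt() above).
 */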
4274
4275 static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
4276                                   int trig_mode, int vector)
4277 {
4278         struct kvm_vcpu *vcpu = apic->vcpu;
4279
4280         if (vmx_deliver_posted_interrupt(vcpu, vector)) {
4281                 kvm_lapic_set_irr(vector, apic);
4282                 kvm_make_request(KVM_REQ_EVENT, vcpu);
4283                 kvm_vcpu_kick(vcpu);
4284         } else {
4285                 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
4286                                            trig_mode, vector);
4287         }
4288 }
4289
4290 /*
4291  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4292  * will not change in the lifetime of the guest.
4293  * Note that host-state that does change is set elsewhere. E.g., host-state
4294  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4295  */
4296 void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4297 {
4298         u32 low32, high32;
4299         unsigned long tmpl;
4300         unsigned long cr0, cr3, cr4;
4301
4302         cr0 = read_cr0();
4303         WARN_ON(cr0 & X86_CR0_TS);
4304         vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
4305
4306         /*
4307          * Save the most likely value for this task's CR3 in the VMCS.
4308          * We can't use __get_current_cr3_fast() because we're not atomic.
4309          */
4310         cr3 = __read_cr3();
4311         vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
4312         vmx->loaded_vmcs->host_state.cr3 = cr3;
4313
4314         /* Save the most likely value for this task's CR4 in the VMCS. */
4315         cr4 = cr4_read_shadow();
4316         vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
4317         vmx->loaded_vmcs->host_state.cr4 = cr4;
4318
4319         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
4320 #ifdef CONFIG_X86_64
4321         /*
4322          * Load null selectors, so we can avoid reloading them in
4323          * vmx_prepare_switch_to_host(), in case userspace uses
4324          * the null selectors too (the expected case).
4325          */
4326         vmcs_write16(HOST_DS_SELECTOR, 0);
4327         vmcs_write16(HOST_ES_SELECTOR, 0);
4328 #else
4329         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4330         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4331 #endif
4332         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4333         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
4334
4335         vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */
4336
4337         vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
4338
4339         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4340         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4341
4342         /*
4343          * SYSENTER is used for 32-bit system calls on either 32-bit or
4344          * 64-bit kernels.  It is always zero if neither is allowed, otherwise
4345          * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
4346          * have already done so!).
4347          */
4348         if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
4349                 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
4350
4351         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4352         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
4353
4354         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4355                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
4356                 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4357         }
4358
4359         if (cpu_has_load_ia32_efer())
4360                 vmcs_write64(HOST_IA32_EFER, host_efer);
4361 }
4362
4363 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4364 {
4365         struct kvm_vcpu *vcpu = &vmx->vcpu;
4366
4367         vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
4368                                           ~vcpu->arch.cr4_guest_rsvd_bits;
4369         if (!enable_ept) {
4370                 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
4371                 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
4372         }
4373         if (is_guest_mode(&vmx->vcpu))
4374                 vcpu->arch.cr4_guest_owned_bits &=
4375                         ~get_vmcs12(vcpu)->cr4_guest_host_mask;
4376         vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
4377 }
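
/*
 * Semantics note (illustrative addition): in CR4_GUEST_HOST_MASK a '1' bit
 * is host-owned, i.e. guest reads of that bit come from CR4_READ_SHADOW and
 * guest writes that would change it cause a VM-Exit, while a '0' bit is
 * guest-owned and accessed directly.  Hence the value written above is the
 * complement of cr4_guest_owned_bits.
 */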
4378
4379 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4380 {
4381         u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4382
4383         if (!kvm_vcpu_apicv_active(&vmx->vcpu))
4384                 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4385
4386         if (!enable_vnmi)
4387                 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4388
4389         if (!enable_preemption_timer)
4390                 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4391
4392         return pin_based_exec_ctrl;
4393 }
4394
4395 static u32 vmx_vmentry_ctrl(void)
4396 {
4397         u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
4398
4399         if (vmx_pt_mode_is_system())
4400                 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
4401                                   VM_ENTRY_LOAD_IA32_RTIT_CTL);
4402         /*
4403          * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
4404          */
4405         vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
4406                           VM_ENTRY_LOAD_IA32_EFER |
4407                           VM_ENTRY_IA32E_MODE);
4408
4409         if (cpu_has_perf_global_ctrl_bug())
4410                 vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4411
4412         return vmentry_ctrl;
4413 }
4414
4415 static u32 vmx_vmexit_ctrl(void)
4416 {
4417         u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
4418
4419         /*
4420          * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
4421          * nested virtualization and thus allowed to be set in vmcs12.
4422          */
4423         vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
4424                          VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
4425
4426         if (vmx_pt_mode_is_system())
4427                 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
4428                                  VM_EXIT_CLEAR_IA32_RTIT_CTL);
4429
4430         if (cpu_has_perf_global_ctrl_bug())
4431                 vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4432
4433         /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
4434         return vmexit_ctrl &
4435                 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
4436 }
4437
4438 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4439 {
4440         struct vcpu_vmx *vmx = to_vmx(vcpu);
4441
4442         if (is_guest_mode(vcpu)) {
4443                 vmx->nested.update_vmcs01_apicv_status = true;
4444                 return;
4445         }
4446
4447         pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4448
4449         if (kvm_vcpu_apicv_active(vcpu)) {
4450                 secondary_exec_controls_setbit(vmx,
4451                                                SECONDARY_EXEC_APIC_REGISTER_VIRT |
4452                                                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4453                 if (enable_ipiv)
4454                         tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4455         } else {
4456                 secondary_exec_controls_clearbit(vmx,
4457                                                  SECONDARY_EXEC_APIC_REGISTER_VIRT |
4458                                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4459                 if (enable_ipiv)
4460                         tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4461         }
4462
4463         vmx_update_msr_bitmap_x2apic(vcpu);
4464 }
4465
4466 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4467 {
4468         u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4469
4470         /*
4471          * Not used by KVM, but fully supported for nesting, i.e. these controls
4472          * are allowed in vmcs12 and propagated to vmcs02 when set in vmcs12.
4473          */
4474         exec_control &= ~(CPU_BASED_RDTSC_EXITING |
4475                           CPU_BASED_USE_IO_BITMAPS |
4476                           CPU_BASED_MONITOR_TRAP_FLAG |
4477                           CPU_BASED_PAUSE_EXITING);
4478
4479         /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
4480         exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
4481                           CPU_BASED_NMI_WINDOW_EXITING);
4482
4483         if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4484                 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4485
4486         if (!cpu_need_tpr_shadow(&vmx->vcpu))
4487                 exec_control &= ~CPU_BASED_TPR_SHADOW;
4488
4489 #ifdef CONFIG_X86_64
4490         if (exec_control & CPU_BASED_TPR_SHADOW)
4491                 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
4492                                   CPU_BASED_CR8_STORE_EXITING);
4493         else
4494                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
4495                                 CPU_BASED_CR8_LOAD_EXITING;
4496 #endif
4497         /* No need to intercept CR3 access or INVLPG when using EPT. */
4498         if (enable_ept)
4499                 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4500                                   CPU_BASED_CR3_STORE_EXITING |
4501                                   CPU_BASED_INVLPG_EXITING);
4502         if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4503                 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4504                                 CPU_BASED_MONITOR_EXITING);
4505         if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4506                 exec_control &= ~CPU_BASED_HLT_EXITING;
4507         return exec_control;
4508 }
4509
4510 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
4511 {
4512         u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
4513
4514         /*
4515          * IPI virtualization relies on APICv. Disable IPI virtualization if
4516          * APICv is inhibited.
4517          */
4518         if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
4519                 exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
4520
4521         return exec_control;
4522 }
4523
4524 /*
4525  * Adjust a single secondary execution control bit to intercept/allow an
4526  * instruction in the guest.  This is usually done based on whether or not a
4527  * feature has been exposed to the guest in order to correctly emulate faults.
4528  */
4529 static inline void
4530 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
4531                                   u32 control, bool enabled, bool exiting)
4532 {
4533         /*
4534          * If the control is for an opt-in feature, clear the control if the
4535          * feature is not exposed to the guest, i.e. not enabled.  If the
4536          * control is opt-out, i.e. an exiting control, clear the control if
4537          * the feature _is_ exposed to the guest, i.e. exiting/interception is
4538          * disabled for the associated instruction.  Note, the caller is
4539          * responsible presetting exec_control to set all supported bits.
4540          * responsible for presetting exec_control to set all supported bits.
4541         if (enabled == exiting)
4542                 *exec_control &= ~control;
4543
4544         /*
4545          * Update the nested MSR settings so that a nested VMM can/can't set
4546          * controls for features that are/aren't exposed to the guest.
4547          */
4548         if (nested) {
4549                 /*
4550                  * All features that can be added to or removed from VMX MSRs must
4551                  * be supported in the first place for nested virtualization.
4552                  */
4553                 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
4554                         enabled = false;
4555
4556                 if (enabled)
4557                         vmx->nested.msrs.secondary_ctls_high |= control;
4558                 else
4559                         vmx->nested.msrs.secondary_ctls_high &= ~control;
4560         }
4561 }
4562
4563 /*
4564  * Wrapper macro for the common case of adjusting a secondary execution control
4565  * based on a single guest CPUID bit, with a dedicated feature bit.  This also
4566  * verifies that the control is actually supported by KVM and hardware.
4567  */
4568 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
4569 ({                                                                       \
4570         bool __enabled;                                                  \
4571                                                                          \
4572         if (cpu_has_vmx_##name()) {                                      \
4573                 __enabled = guest_cpuid_has(&(vmx)->vcpu,                \
4574                                             X86_FEATURE_##feat_name);    \
4575                 vmx_adjust_secondary_exec_control(vmx, exec_control,     \
4576                         SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
4577         }                                                                \
4578 })
4579
4580 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
4581 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
4582         vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
4583
4584 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
4585         vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
4586
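/*
 * Expansion example (illustrative): vmx_adjust_sec_exec_feature(vmx,
 * &exec_control, invpcid, INVPCID) expands to
 *
 *     vmx_adjust_sec_exec_control(vmx, &exec_control, invpcid, INVPCID,
 *                                 ENABLE_INVPCID, false);
 *
 * i.e. if cpu_has_vmx_invpcid(), SECONDARY_EXEC_ENABLE_INVPCID is kept or
 * cleared based on guest_cpuid_has(vcpu, X86_FEATURE_INVPCID), with
 * exiting=false as ENABLE_INVPCID is an opt-in control.
 */
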
4587 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4588 {
4589         struct kvm_vcpu *vcpu = &vmx->vcpu;
4590
4591         u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4592
4593         if (vmx_pt_mode_is_system())
4594                 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
4595         if (!cpu_need_virtualize_apic_accesses(vcpu))
4596                 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4597         if (vmx->vpid == 0)
4598                 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4599         if (!enable_ept) {
4600                 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4601                 enable_unrestricted_guest = 0;
4602         }
4603         if (!enable_unrestricted_guest)
4604                 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4605         if (kvm_pause_in_guest(vmx->vcpu.kvm))
4606                 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4607         if (!kvm_vcpu_apicv_active(vcpu))
4608                 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4609                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4610         exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4611
4612         /*
4613          * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
4614          * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
4615          */
4616         exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
4617
4618         /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4619          * in vmx_set_cr4.  */
4620         exec_control &= ~SECONDARY_EXEC_DESC;
4621
4622         /*
4623          * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4624          * (handle_vmptrld).  Shadow VMCS can NOT be enabled here because
4625          * there is no current VMCS12 yet.
4626          */
4627         exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4628
4629         /*
4630          * PML is enabled/disabled when dirty logging of memslots changes, but
4631          * it needs to be set here when dirty logging is already active, e.g.
4632          * if this vCPU was created after dirty logging was enabled.
4633          */
4634         if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
4635                 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4636
4637         if (cpu_has_vmx_xsaves()) {
4638                 /* Exposing XSAVES only when XSAVE is exposed */
4639                 bool xsaves_enabled =
4640                         boot_cpu_has(X86_FEATURE_XSAVE) &&
4641                         guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4642                         guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
4643
4644                 vcpu->arch.xsaves_enabled = xsaves_enabled;
4645
4646                 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4647                                                   SECONDARY_EXEC_XSAVES,
4648                                                   xsaves_enabled, false);
4649         }
4650
4651         /*
4652          * RDPID is also gated by ENABLE_RDTSCP; turn on the control if either
4653          * feature is exposed to the guest.  This creates a virtualization hole
4654          * if both are supported in hardware but only one is exposed to the
4655          * guest, but letting the guest execute RDTSCP or RDPID when either one
4656          * is advertised is preferable to emulating the advertised instruction
4657          * in KVM on #UD, and obviously better than incorrectly injecting #UD.
4658          */
4659         if (cpu_has_vmx_rdtscp()) {
4660                 bool rdpid_or_rdtscp_enabled =
4661                         guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
4662                         guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
4663
4664                 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4665                                                   SECONDARY_EXEC_ENABLE_RDTSCP,
4666                                                   rdpid_or_rdtscp_enabled, false);
4667         }
4668         vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
4669
4670         vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
4671         vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
4672
4673         vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
4674                                     ENABLE_USR_WAIT_PAUSE, false);
4675
4676         if (!vcpu->kvm->arch.bus_lock_detection_enabled)
4677                 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
4678
4679         if (!kvm_notify_vmexit_enabled(vcpu->kvm))
4680                 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
4681
4682         return exec_control;
4683 }
4684
4685 static inline int vmx_get_pid_table_order(struct kvm *kvm)
4686 {
4687         return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
4688 }
4689
4690 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
4691 {
4692         struct page *pages;
4693         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4694
4695         if (!irqchip_in_kernel(kvm) || !enable_ipiv)
4696                 return 0;
4697
4698         if (kvm_vmx->pid_table)
4699                 return 0;
4700
4701         pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm));
4702         if (!pages)
4703                 return -ENOMEM;
4704
4705         kvm_vmx->pid_table = (void *)page_address(pages);
4706         return 0;
4707 }
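
/*
 * Sizing example (illustrative, assuming 8-byte PID-pointer table entries):
 * with max_vcpu_ids == 256 the table occupies 256 * 8 = 2048 bytes, so
 * vmx_get_pid_table_order() returns get_order(2048) == 0 and a single
 * zeroed page is allocated above.
 */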
4708
4709 static int vmx_vcpu_precreate(struct kvm *kvm)
4710 {
4711         return vmx_alloc_ipiv_pid_table(kvm);
4712 }
4713
4714 #define VMX_XSS_EXIT_BITMAP 0
4715
4716 static void init_vmcs(struct vcpu_vmx *vmx)
4717 {
4718         struct kvm *kvm = vmx->vcpu.kvm;
4719         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4720
4721         if (nested)
4722                 nested_vmx_set_vmcs_shadowing_bitmap();
4723
4724         if (cpu_has_vmx_msr_bitmap())
4725                 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
4726
4727         vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
4728
4729         /* Control */
4730         pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4731
4732         exec_controls_set(vmx, vmx_exec_control(vmx));
4733
4734         if (cpu_has_secondary_exec_ctrls())
4735                 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
4736
4737         if (cpu_has_tertiary_exec_ctrls())
4738                 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
4739
4740         if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
4741                 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4742                 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4743                 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4744                 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4745
4746                 vmcs_write16(GUEST_INTR_STATUS, 0);
4747
4748                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4749                 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4750         }
4751
4752         if (vmx_can_use_ipiv(&vmx->vcpu)) {
4753                 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
4754                 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
4755         }
4756
4757         if (!kvm_pause_in_guest(kvm)) {
4758                 vmcs_write32(PLE_GAP, ple_gap);
4759                 vmx->ple_window = ple_window;
4760                 vmx->ple_window_dirty = true;
4761         }
4762
4763         if (kvm_notify_vmexit_enabled(kvm))
4764                 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
4765
4766         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4767         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
4768         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
4769
4770         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
4771         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
4772         vmx_set_constant_host_state(vmx);
4773         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4774         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
4775
4776         if (cpu_has_vmx_vmfunc())
4777                 vmcs_write64(VM_FUNCTION_CONTROL, 0);
4778
4779         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4780         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
4781         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
4782         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4783         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
4784
4785         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4786                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
4787
4788         vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
4789
4790         /* 22.2.1, 20.8.1 */
4791         vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
4792
4793         vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4794         vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
4795
4796         set_cr4_guest_host_mask(vmx);
4797
4798         if (vmx->vpid != 0)
4799                 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4800
4801         if (cpu_has_vmx_xsaves())
4802                 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4803
4804         if (enable_pml) {
4805                 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
4806                 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
4807         }
4808
4809         vmx_write_encls_bitmap(&vmx->vcpu, NULL);
4810
4811         if (vmx_pt_mode_is_host_guest()) {
4812                 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
4813                 /* Bits[6:0] are forced to 1; writes are ignored. */
4814                 vmx->pt_desc.guest.output_mask = 0x7F;
4815                 vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
4816         }
4817
4818         vmcs_write32(GUEST_SYSENTER_CS, 0);
4819         vmcs_writel(GUEST_SYSENTER_ESP, 0);
4820         vmcs_writel(GUEST_SYSENTER_EIP, 0);
4821         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4822
4823         if (cpu_has_vmx_tpr_shadow()) {
4824                 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4825                 if (cpu_need_tpr_shadow(&vmx->vcpu))
4826                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4827                                      __pa(vmx->vcpu.arch.apic->regs));
4828                 vmcs_write32(TPR_THRESHOLD, 0);
4829         }
4830
4831         vmx_setup_uret_msrs(vmx);
4832 }
4833
4834 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4835 {
4836         struct vcpu_vmx *vmx = to_vmx(vcpu);
4837
4838         init_vmcs(vmx);
4839
4840         if (nested)
4841                 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
4842
4843         vcpu_setup_sgx_lepubkeyhash(vcpu);
4844
4845         vmx->nested.posted_intr_nv = -1;
4846         vmx->nested.vmxon_ptr = INVALID_GPA;
4847         vmx->nested.current_vmptr = INVALID_GPA;
4848         vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
4849
4850         vcpu->arch.microcode_version = 0x100000000ULL;
4851         vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
4852
4853         /*
4854          * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
4855          * or POSTED_INTR_WAKEUP_VECTOR.
4856          */
4857         vmx->pi_desc.nv = POSTED_INTR_VECTOR;
4858         vmx->pi_desc.sn = 1;
4859 }
4860
4861 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4862 {
4863         struct vcpu_vmx *vmx = to_vmx(vcpu);
4864
4865         if (!init_event)
4866                 __vmx_vcpu_reset(vcpu);
4867
4868         vmx->rmode.vm86_active = 0;
4869         vmx->spec_ctrl = 0;
4870
4871         vmx->msr_ia32_umwait_control = 0;
4872
4873         vmx->hv_deadline_tsc = -1;
4874         kvm_set_cr8(vcpu, 0);
4875
4876         vmx_segment_cache_clear(vmx);
4877         kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
4878
4879         seg_setup(VCPU_SREG_CS);
4880         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4881         vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
4882
4883         seg_setup(VCPU_SREG_DS);
4884         seg_setup(VCPU_SREG_ES);
4885         seg_setup(VCPU_SREG_FS);
4886         seg_setup(VCPU_SREG_GS);
4887         seg_setup(VCPU_SREG_SS);
4888
4889         vmcs_write16(GUEST_TR_SELECTOR, 0);
4890         vmcs_writel(GUEST_TR_BASE, 0);
4891         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4892         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4893
4894         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4895         vmcs_writel(GUEST_LDTR_BASE, 0);
4896         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4897         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4898
4899         vmcs_writel(GUEST_GDTR_BASE, 0);
4900         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4901
4902         vmcs_writel(GUEST_IDTR_BASE, 0);
4903         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
4904
4905         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
4906         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
4907         vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
4908         if (kvm_mpx_supported())
4909                 vmcs_write64(GUEST_BNDCFGS, 0);
4910
4911         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
4912
4913         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4914
4915         vpid_sync_context(vmx->vpid);
4916
4917         vmx_update_fb_clear_dis(vcpu, vmx);
4918 }
4919
4920 static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
4921 {
4922         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
4923 }
4924
4925 static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
4926 {
4927         if (!enable_vnmi ||
4928             vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4929                 vmx_enable_irq_window(vcpu);
4930                 return;
4931         }
4932
4933         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
4934 }
4935
4936 static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
4937 {
4938         struct vcpu_vmx *vmx = to_vmx(vcpu);
4939         uint32_t intr;
4940         int irq = vcpu->arch.interrupt.nr;
4941
4942         trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
4943
4944         ++vcpu->stat.irq_injections;
4945         if (vmx->rmode.vm86_active) {
4946                 int inc_eip = 0;
4947                 if (vcpu->arch.interrupt.soft)
4948                         inc_eip = vcpu->arch.event_exit_inst_len;
4949                 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
4950                 return;
4951         }
4952         intr = irq | INTR_INFO_VALID_MASK;
4953         if (vcpu->arch.interrupt.soft) {
4954                 intr |= INTR_TYPE_SOFT_INTR;
4955                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4956                              vmx->vcpu.arch.event_exit_inst_len);
4957         } else
4958                 intr |= INTR_TYPE_EXT_INTR;
4959         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4960
4961         vmx_clear_hlt(vcpu);
4962 }
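
/*
 * Layout note (illustrative addition): the VM-entry interruption-information
 * value built above packs the vector into bits 7:0, the event type into bits
 * 10:8 (INTR_TYPE_EXT_INTR vs. INTR_TYPE_SOFT_INTR here) and the valid flag
 * into bit 31 (INTR_INFO_VALID_MASK); for soft interrupts the instruction
 * length must also be supplied via VM_ENTRY_INSTRUCTION_LEN, as done above.
 */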
4963
4964 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4965 {
4966         struct vcpu_vmx *vmx = to_vmx(vcpu);
4967
4968         if (!enable_vnmi) {
4969                 /*
4970                  * Tracking the NMI-blocked state in software is built upon
4971                  * finding the next open IRQ window. This, in turn, depends on
4972                  * well-behaving guests: They have to keep IRQs disabled at
4973                  * least as long as the NMI handler runs. Otherwise we may
4974                  * cause NMI nesting, maybe breaking the guest. But as this is
4975                  * highly unlikely, we can live with the residual risk.
4976                  */
4977                 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
4978                 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4979         }
4980
4981         ++vcpu->stat.nmi_injections;
4982         vmx->loaded_vmcs->nmi_known_unmasked = false;
4983
4984         if (vmx->rmode.vm86_active) {
4985                 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
4986                 return;
4987         }
4988
4989         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4990                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4991
4992         vmx_clear_hlt(vcpu);
4993 }
4994
4995 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4996 {
4997         struct vcpu_vmx *vmx = to_vmx(vcpu);
4998         bool masked;
4999
5000         if (!enable_vnmi)
5001                 return vmx->loaded_vmcs->soft_vnmi_blocked;
5002         if (vmx->loaded_vmcs->nmi_known_unmasked)
5003                 return false;
5004         masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
5005         vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5006         return masked;
5007 }
5008
5009 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
5010 {
5011         struct vcpu_vmx *vmx = to_vmx(vcpu);
5012
5013         if (!enable_vnmi) {
5014                 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
5015                         vmx->loaded_vmcs->soft_vnmi_blocked = masked;
5016                         vmx->loaded_vmcs->vnmi_blocked_time = 0;
5017                 }
5018         } else {
5019                 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5020                 if (masked)
5021                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5022                                       GUEST_INTR_STATE_NMI);
5023                 else
5024                         vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
5025                                         GUEST_INTR_STATE_NMI);
5026         }
5027 }
5028
5029 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
5030 {
5031         if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5032                 return false;
5033
5034         if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5035                 return true;
5036
5037         return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5038                 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
5039                  GUEST_INTR_STATE_NMI));
5040 }
5041
5042 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5043 {
5044         if (to_vmx(vcpu)->nested.nested_run_pending)
5045                 return -EBUSY;
5046
5047         /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
5048         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5049                 return -EBUSY;
5050
5051         return !vmx_nmi_blocked(vcpu);
5052 }
5053
5054 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5055 {
5056         if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5057                 return false;
5058
5059         return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
5060                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5061                 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
5062 }
5063
5064 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5065 {
5066         if (to_vmx(vcpu)->nested.nested_run_pending)
5067                 return -EBUSY;
5068
5069         /*
5070          * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
5071          * e.g. if the IRQ arrived asynchronously after checking nested events.
5072          */
5073         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5074                 return -EBUSY;
5075
5076         return !vmx_interrupt_blocked(vcpu);
5077 }
5078
5079 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
5080 {
5081         void __user *ret;
5082
5083         if (enable_unrestricted_guest)
5084                 return 0;
5085
5086         mutex_lock(&kvm->slots_lock);
5087         ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
5088                                       PAGE_SIZE * 3);
5089         mutex_unlock(&kvm->slots_lock);
5090
5091         if (IS_ERR(ret))
5092                 return PTR_ERR(ret);
5093
5094         to_kvm_vmx(kvm)->tss_addr = addr;
5095
5096         return init_rmode_tss(kvm, ret);
5097 }
5098
5099 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5100 {
5101         to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
5102         return 0;
5103 }
5104
5105 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
5106 {
5107         switch (vec) {
5108         case BP_VECTOR:
5109                 /*
5110                  * Update instruction length as we may reinject the exception
5111                  * from user space while in guest debugging mode.
5112                  */
5113                 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5114                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5115                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5116                         return false;
5117                 fallthrough;
5118         case DB_VECTOR:
5119                 return !(vcpu->guest_debug &
5120                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
5121         case DE_VECTOR:
5122         case OF_VECTOR:
5123         case BR_VECTOR:
5124         case UD_VECTOR:
5125         case DF_VECTOR:
5126         case SS_VECTOR:
5127         case GP_VECTOR:
5128         case MF_VECTOR:
5129                 return true;
5130         }
5131         return false;
5132 }
5133
5134 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5135                                   int vec, u32 err_code)
5136 {
5137         /*
5138          * An instruction with an address-size override prefix (opcode 0x67)
5139          * causes a #SS fault with error code 0 in VM86 mode.
5140          */
5141         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
5142                 if (kvm_emulate_instruction(vcpu, 0)) {
5143                         if (vcpu->arch.halt_request) {
5144                                 vcpu->arch.halt_request = 0;
5145                                 return kvm_emulate_halt_noskip(vcpu);
5146                         }
5147                         return 1;
5148                 }
5149                 return 0;
5150         }
5151
5152         /*
5153          * Forward all other exceptions that are valid in real mode.
5154          * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5155          *        the required debugging infrastructure rework.
5156          */
5157         kvm_queue_exception(vcpu, vec);
5158         return 1;
5159 }
5160
5161 static int handle_machine_check(struct kvm_vcpu *vcpu)
5162 {
5163         /* handled by vmx_vcpu_run() */
5164         return 1;
5165 }
5166
5167 /*
5168  * If the host has split lock detection disabled, then #AC is
5169  * unconditionally injected into the guest, which matches the behaviour
5170  * from before split lock detection existed.
5171  *
5172  * If the host has split lock detection enabled then #AC is
5173  * only injected into the guest when:
5174  *  - Guest CPL == 3 (user mode)
5175  *  - Guest has #AC detection enabled in CR0
5176  *  - Guest EFLAGS has AC bit set
5177  */
5178 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
5179 {
5180         if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
5181                 return true;
5182
5183         return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
5184                (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
5185 }
5186
5187 static int handle_exception_nmi(struct kvm_vcpu *vcpu)
5188 {
5189         struct vcpu_vmx *vmx = to_vmx(vcpu);
5190         struct kvm_run *kvm_run = vcpu->run;
5191         u32 intr_info, ex_no, error_code;
5192         unsigned long cr2, dr6;
5193         u32 vect_info;
5194
5195         vect_info = vmx->idt_vectoring_info;
5196         intr_info = vmx_get_intr_info(vcpu);
5197
5198         /*
5199          * Machine checks are handled by handle_exception_irqoff(), or by
5200          * vmx_vcpu_run() if a #MC occurs on VM-Entry.  NMIs are handled by
5201          * vmx_vcpu_enter_exit().
5202          */
5203         if (is_machine_check(intr_info) || is_nmi(intr_info))
5204                 return 1;
5205
5206         /*
5207          * Queue the exception here instead of in handle_nm_fault_irqoff().
5208          * This ensures the nested_vmx check is not skipped so vmexit can
5209          * be reflected to L1 (when it intercepts #NM) before reaching this
5210          * point.
5211          */
5212         if (is_nm_fault(intr_info)) {
5213                 kvm_queue_exception(vcpu, NM_VECTOR);
5214                 return 1;
5215         }
5216
5217         if (is_invalid_opcode(intr_info))
5218                 return handle_ud(vcpu);
5219
5220         error_code = 0;
5221         if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
5222                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5223
5224         if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
5225                 WARN_ON_ONCE(!enable_vmware_backdoor);
5226
5227                 /*
5228                  * VMware backdoor emulation on #GP interception only handles
5229                  * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
5230                  * error code on #GP.
5231                  */
5232                 if (error_code) {
5233                         kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
5234                         return 1;
5235                 }
5236                 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
5237         }
5238
5239         /*
5240          * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
5241          * MMIO, in which case it is better to report an internal error.
5242          * See the comments in vmx_handle_exit.
5243          */
5244         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5245             !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5246                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5247                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5248                 vcpu->run->internal.ndata = 4;
5249                 vcpu->run->internal.data[0] = vect_info;
5250                 vcpu->run->internal.data[1] = intr_info;
5251                 vcpu->run->internal.data[2] = error_code;
5252                 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
5253                 return 0;
5254         }
5255
5256         if (is_page_fault(intr_info)) {
5257                 cr2 = vmx_get_exit_qual(vcpu);
5258                 if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
5259                         /*
5260                          * EPT will cause page fault only if we need to
5261                          * detect illegal GPAs.
5262                          */
5263                         WARN_ON_ONCE(!allow_smaller_maxphyaddr);
5264                         kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
5265                         return 1;
5266                 } else
5267                         return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
5268         }
5269
5270         ex_no = intr_info & INTR_INFO_VECTOR_MASK;
5271
5272         if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
5273                 return handle_rmode_exception(vcpu, ex_no, error_code);
5274
5275         switch (ex_no) {
5276         case DB_VECTOR:
5277                 dr6 = vmx_get_exit_qual(vcpu);
5278                 if (!(vcpu->guest_debug &
5279                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
5280                         /*
5281                          * If the #DB was due to ICEBP, a.k.a. INT1, skip the
5282                          * instruction.  ICEBP generates a trap-like #DB, but
5283                          * despite its interception control being tied to #DB,
5284                          * is an instruction intercept, i.e. the VM-Exit occurs
5285                          * on the ICEBP itself.  Use the inner "skip" helper to
5286                          * avoid single-step #DB and MTF updates, as ICEBP is
5287                          * higher priority.  Note, skipping ICEBP still clears
5288                          * STI and MOVSS blocking.
5289                          *
5290                          * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
5291                          * if single-step is enabled in RFLAGS and STI or MOVSS
5292                          * blocking is active, as the CPU doesn't set the bit
5293                          * on VM-Exit due to #DB interception.  VM-Entry has a
5294                          * consistency check that a single-step #DB is pending
5295                          * in this scenario as the previous instruction cannot
5296                          * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
5297                          * don't modify RFLAGS), therefore the one instruction
5298                          * delay when activating single-step breakpoints must
5299                          * have already expired.  Note, the CPU sets/clears BS
5300                          * as appropriate for all other VM-Exits types.
5301                          */
5302                         if (is_icebp(intr_info))
5303                                 WARN_ON(!skip_emulated_instruction(vcpu));
5304                         else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
5305                                  (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5306                                   (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
5307                                 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
5308                                             vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
5309
5310                         kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
5311                         return 1;
5312                 }
5313                 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
5314                 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
5315                 fallthrough;
5316         case BP_VECTOR:
5317                 /*
5318                  * Update instruction length as we may reinject #BP from
5319                  * user space while in guest debugging mode. Reading it for
5320          * #DB as well causes no harm; it is not used in that case.
5321                  */
5322                 vmx->vcpu.arch.event_exit_inst_len =
5323                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5324                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5325                 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5326                 kvm_run->debug.arch.exception = ex_no;
5327                 break;
5328         case AC_VECTOR:
5329                 if (vmx_guest_inject_ac(vcpu)) {
5330                         kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5331                         return 1;
5332                 }
5333
5334                 /*
5335                  * Handle split lock. Depending on detection mode this will
5336                  * either warn and disable split lock detection for this
5337                  * task or force SIGBUS on it.
5338                  */
5339                 if (handle_guest_split_lock(kvm_rip_read(vcpu)))
5340                         return 1;
5341                 fallthrough;
5342         default:
5343                 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5344                 kvm_run->ex.exception = ex_no;
5345                 kvm_run->ex.error_code = error_code;
5346                 break;
5347         }
5348         return 0;
5349 }
5350
5351 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
5352 {
5353         ++vcpu->stat.irq_exits;
5354         return 1;
5355 }
5356
5357 static int handle_triple_fault(struct kvm_vcpu *vcpu)
5358 {
5359         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5360         vcpu->mmio_needed = 0;
5361         return 0;
5362 }
5363
5364 static int handle_io(struct kvm_vcpu *vcpu)
5365 {
5366         unsigned long exit_qualification;
5367         int size, in, string;
5368         unsigned port;
5369
5370         exit_qualification = vmx_get_exit_qual(vcpu);
5371         string = (exit_qualification & 16) != 0;
5372
5373         ++vcpu->stat.io_exits;
5374
5375         if (string)
5376                 return kvm_emulate_instruction(vcpu, 0);
5377
5378         port = exit_qualification >> 16;
5379         size = (exit_qualification & 7) + 1;
5380         in = (exit_qualification & 8) != 0;
5381
5382         return kvm_fast_pio(vcpu, size, port, in);
5383 }
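
/*
 * Decoding example (illustrative, matching the bit tests above): for a
 * one-byte OUT to port 0x3f8 the exit qualification encodes size-1 in bits
 * 2:0, the direction in bit 3 (0 = out), the string flag in bit 4 and the
 * port in bits 31:16, so a value of 0x03f80000 decodes to size = 1, in = 0,
 * string = 0 and port = 0x3f8, which is handed to kvm_fast_pio().
 */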
5384
5385 static void
5386 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5387 {
5388         /*
5389          * Patch in the VMCALL instruction:
5390          */
5391         hypercall[0] = 0x0f;
5392         hypercall[1] = 0x01;
5393         hypercall[2] = 0xc1;
5394 }
5395
5396 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
5397 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5398 {
5399         if (is_guest_mode(vcpu)) {
5400                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5401                 unsigned long orig_val = val;
5402
5403                 /*
5404                  * We get here when L2 changed cr0 in a way that did not change
5405                  * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5406                  * but did change L0 shadowed bits. So we first calculate the
5407                  * effective cr0 value that L1 would like to write into the
5408                  * hardware. It consists of the L2-owned bits from the new
5409                  * value combined with the L1-owned bits from L1's guest_cr0.
5410                  */
5411                 val = (val & ~vmcs12->cr0_guest_host_mask) |
5412                         (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5413
5414                 if (!nested_guest_cr0_valid(vcpu, val))
5415                         return 1;
5416
5417                 if (kvm_set_cr0(vcpu, val))
5418                         return 1;
5419                 vmcs_writel(CR0_READ_SHADOW, orig_val);
5420                 return 0;
5421         } else {
5422                 if (to_vmx(vcpu)->nested.vmxon &&
5423                     !nested_host_cr0_valid(vcpu, val))
5424                         return 1;
5425
5426                 return kvm_set_cr0(vcpu, val);
5427         }
5428 }
5429
5430 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5431 {
5432         if (is_guest_mode(vcpu)) {
5433                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5434                 unsigned long orig_val = val;
5435
5436                 /* analogously to handle_set_cr0 */
5437                 val = (val & ~vmcs12->cr4_guest_host_mask) |
5438                         (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5439                 if (kvm_set_cr4(vcpu, val))
5440                         return 1;
5441                 vmcs_writel(CR4_READ_SHADOW, orig_val);
5442                 return 0;
5443         } else
5444                 return kvm_set_cr4(vcpu, val);
5445 }
5446
5447 static int handle_desc(struct kvm_vcpu *vcpu)
5448 {
5449         WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
5450         return kvm_emulate_instruction(vcpu, 0);
5451 }
5452
5453 static int handle_cr(struct kvm_vcpu *vcpu)
5454 {
5455         unsigned long exit_qualification, val;
5456         int cr;
5457         int reg;
5458         int err;
5459         int ret;
5460
5461         exit_qualification = vmx_get_exit_qual(vcpu);
5462         cr = exit_qualification & 15;
5463         reg = (exit_qualification >> 8) & 15;
5464         switch ((exit_qualification >> 4) & 3) {
5465         case 0: /* mov to cr */
5466                 val = kvm_register_read(vcpu, reg);
5467                 trace_kvm_cr_write(cr, val);
5468                 switch (cr) {
5469                 case 0:
5470                         err = handle_set_cr0(vcpu, val);
5471                         return kvm_complete_insn_gp(vcpu, err);
5472                 case 3:
5473                         WARN_ON_ONCE(enable_unrestricted_guest);
5474
5475                         err = kvm_set_cr3(vcpu, val);
5476                         return kvm_complete_insn_gp(vcpu, err);
5477                 case 4:
5478                         err = handle_set_cr4(vcpu, val);
5479                         return kvm_complete_insn_gp(vcpu, err);
5480                 case 8: {
5481                                 u8 cr8_prev = kvm_get_cr8(vcpu);
5482                                 u8 cr8 = (u8)val;
5483                                 err = kvm_set_cr8(vcpu, cr8);
5484                                 ret = kvm_complete_insn_gp(vcpu, err);
5485                                 if (lapic_in_kernel(vcpu))
5486                                         return ret;
5487                                 if (cr8_prev <= cr8)
5488                                         return ret;
5489                                 /*
5490                                  * TODO: we might be squashing a
5491                                  * KVM_GUESTDBG_SINGLESTEP-triggered
5492                                  * KVM_EXIT_DEBUG here.
5493                                  */
5494                                 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5495                                 return 0;
5496                         }
5497                 }
5498                 break;
5499         case 2: /* clts */
5500                 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
5501                 return -EIO;
5502         case 1: /* mov from cr */
5503                 switch (cr) {
5504                 case 3:
5505                         WARN_ON_ONCE(enable_unrestricted_guest);
5506
5507                         val = kvm_read_cr3(vcpu);
5508                         kvm_register_write(vcpu, reg, val);
5509                         trace_kvm_cr_read(cr, val);
5510                         return kvm_skip_emulated_instruction(vcpu);
5511                 case 8:
5512                         val = kvm_get_cr8(vcpu);
5513                         kvm_register_write(vcpu, reg, val);
5514                         trace_kvm_cr_read(cr, val);
5515                         return kvm_skip_emulated_instruction(vcpu);
5516                 }
5517                 break;
5518         case 3: /* lmsw */
5519                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5520                 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
5521                 kvm_lmsw(vcpu, val);
5522
5523                 return kvm_skip_emulated_instruction(vcpu);
5524         default:
5525                 break;
5526         }
5527         vcpu->run->exit_reason = 0;
5528         vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5529                (int)(exit_qualification >> 4) & 3, cr);
5530         return 0;
5531 }
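
/*
 * Decoding example (illustrative, matching the switch above): for
 * "mov %rax, %cr4" the exit qualification holds cr == 4 in bits 3:0, access
 * type 0 (mov to cr) in bits 5:4 and reg == 0 (RAX) in bits 11:8, so a value
 * of 0x4 routes to handle_set_cr4() with the new value read from RAX.
 */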
5532
5533 static int handle_dr(struct kvm_vcpu *vcpu)
5534 {
5535         unsigned long exit_qualification;
5536         int dr, dr7, reg;
5537         int err = 1;
5538
5539         exit_qualification = vmx_get_exit_qual(vcpu);
5540         dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5541
5542         /* First, if DR does not exist, trigger UD */
5543         if (!kvm_require_dr(vcpu, dr))
5544                 return 1;
5545
5546         if (vmx_get_cpl(vcpu) > 0)
5547                 goto out;
5548
5549         dr7 = vmcs_readl(GUEST_DR7);
5550         if (dr7 & DR7_GD) {
5551                 /*
5552                  * As the vm-exit takes precedence over the debug trap, we
5553                  * need to emulate the latter, either for the host or the
5554                  * guest debugging itself.
5555                  */
5556                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5557                         vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
5558                         vcpu->run->debug.arch.dr7 = dr7;
5559                         vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5560                         vcpu->run->debug.arch.exception = DB_VECTOR;
5561                         vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5562                         return 0;
5563                 } else {
5564                         kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
5565                         return 1;
5566                 }
5567         }
5568
5569         if (vcpu->guest_debug == 0) {
5570                 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5571
5572                 /*
5573                  * No more DR vmexits; force a reload of the debug registers
5574                  * and reenter on this instruction.  The next vmexit will
5575                  * retrieve the full state of the debug registers.
5576                  */
5577                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5578                 return 1;
5579         }
5580
5581         reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5582         if (exit_qualification & TYPE_MOV_FROM_DR) {
5583                 unsigned long val;
5584
5585                 kvm_get_dr(vcpu, dr, &val);
5586                 kvm_register_write(vcpu, reg, val);
5587                 err = 0;
5588         } else {
5589                 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
5590         }
5591
5592 out:
5593         return kvm_complete_insn_gp(vcpu, err);
5594 }
5595
5596 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5597 {
5598         get_debugreg(vcpu->arch.db[0], 0);
5599         get_debugreg(vcpu->arch.db[1], 1);
5600         get_debugreg(vcpu->arch.db[2], 2);
5601         get_debugreg(vcpu->arch.db[3], 3);
5602         get_debugreg(vcpu->arch.dr6, 6);
5603         vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5604
5605         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5606         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5607
5608         /*
5609          * exc_debug expects dr6 to be cleared after it runs; don't let it see
5610          * a stale dr6 from the guest.
5611          */
5612         set_debugreg(DR6_RESERVED, 6);
5613 }
5614
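/* Guest DR7 is kept in the VMCS; propagate the new value directly. */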
5615 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5616 {
5617         vmcs_writel(GUEST_DR7, val);
5618 }
5619
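/*
 * The guest lowered CR8/TPR below the threshold programmed in the VMCS;
 * recompute the PPR so that any interrupt this unmasks can be delivered.
 */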
5620 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5621 {
5622         kvm_apic_update_ppr(vcpu);
5623         return 1;
5624 }
5625
5626 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5627 {
5628         exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5629
5630         kvm_make_request(KVM_REQ_EVENT, vcpu);
5631
5632         ++vcpu->stat.irq_window_exits;
5633         return 1;
5634 }
5635
5636 static int handle_invlpg(struct kvm_vcpu *vcpu)
5637 {
5638         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5639
5640         kvm_mmu_invlpg(vcpu, exit_qualification);
5641         return kvm_skip_emulated_instruction(vcpu);
5642 }
5643
5644 static int handle_apic_access(struct kvm_vcpu *vcpu)
5645 {
5646         if (likely(fasteoi)) {
5647                 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5648                 int access_type, offset;
5649
5650                 access_type = exit_qualification & APIC_ACCESS_TYPE;
5651                 offset = exit_qualification & APIC_ACCESS_OFFSET;
5652                 /*
5653                  * A sane guest uses MOV to write the EOI register, and the
5654                  * written value is ignored.  Short-circuit that common case
5655                  * here to avoid heavy instruction emulation.
5656                  */
5657                 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5658                     (offset == APIC_EOI)) {
5659                         kvm_lapic_set_eoi(vcpu);
5660                         return kvm_skip_emulated_instruction(vcpu);
5661                 }
5662         }
5663         return kvm_emulate_instruction(vcpu, 0);
5664 }
5665
5666 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5667 {
5668         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5669         int vector = exit_qualification & 0xff;
5670
5671         /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5672         kvm_apic_set_eoi_accelerated(vcpu, vector);
5673         return 1;
5674 }
5675
5676 static int handle_apic_write(struct kvm_vcpu *vcpu)
5677 {
5678         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5679
5680         /*
5681          * The APIC-write VM-Exit is trap-like: KVM doesn't need to advance RIP,
5682          * and hardware has done any necessary aliasing, offset adjustments, etc...
5683          * for the access.  I.e. the correct value has already been written to
5684          * the vAPIC page for the correct 16-byte chunk.  KVM needs only to
5685          * retrieve the register value and emulate the access.
5686          */
5687         u32 offset = exit_qualification & 0xff0;
5688
5689         kvm_apic_write_nodecode(vcpu, offset);
5690         return 1;
5691 }
5692
5693 static int handle_task_switch(struct kvm_vcpu *vcpu)
5694 {
5695         struct vcpu_vmx *vmx = to_vmx(vcpu);
5696         unsigned long exit_qualification;
5697         bool has_error_code = false;
5698         u32 error_code = 0;
5699         u16 tss_selector;
5700         int reason, type, idt_v, idt_index;
5701
5702         idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5703         idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5704         type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5705
5706         exit_qualification = vmx_get_exit_qual(vcpu);
5707
5708         reason = (u32)exit_qualification >> 30;
5709         if (reason == TASK_SWITCH_GATE && idt_v) {
5710                 switch (type) {
5711                 case INTR_TYPE_NMI_INTR:
5712                         vcpu->arch.nmi_injected = false;
5713                         vmx_set_nmi_mask(vcpu, true);
5714                         break;
5715                 case INTR_TYPE_EXT_INTR:
5716                 case INTR_TYPE_SOFT_INTR:
5717                         kvm_clear_interrupt_queue(vcpu);
5718                         break;
5719                 case INTR_TYPE_HARD_EXCEPTION:
5720                         if (vmx->idt_vectoring_info &
5721                             VECTORING_INFO_DELIVER_CODE_MASK) {
5722                                 has_error_code = true;
5723                                 error_code =
5724                                         vmcs_read32(IDT_VECTORING_ERROR_CODE);
5725                         }
5726                         fallthrough;
5727                 case INTR_TYPE_SOFT_EXCEPTION:
5728                         kvm_clear_exception_queue(vcpu);
5729                         break;
5730                 default:
5731                         break;
5732                 }
5733         }
5734         tss_selector = exit_qualification;
5735
5736         if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5737                        type != INTR_TYPE_EXT_INTR &&
5738                        type != INTR_TYPE_NMI_INTR))
5739                 WARN_ON(!skip_emulated_instruction(vcpu));
5740
5741         /*
5742          * TODO: What about debug traps on tss switch?
5743          *       Are we supposed to inject them and update dr6?
5744          */
5745         return kvm_task_switch(vcpu, tss_selector,
5746                                type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
5747                                reason, has_error_code, error_code);
5748 }
5749
5750 static int handle_ept_violation(struct kvm_vcpu *vcpu)
5751 {
5752         unsigned long exit_qualification;
5753         gpa_t gpa;
5754         u64 error_code;
5755
5756         exit_qualification = vmx_get_exit_qual(vcpu);
5757
5758         /*
5759          * If the EPT violation happened while executing IRET from NMI, the
5760          * "blocked by NMI" bit has to be set before the next VM entry.
5761          * There are errata that may cause this bit to not be set:
5762          * AAK134, BY25.
5763          */
5764         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5765                         enable_vnmi &&
5766                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5767                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5768
5769         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5770         trace_kvm_page_fault(vcpu, gpa, exit_qualification);
5771
5772         /* Is it a read fault? */
5773         error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
5774                      ? PFERR_USER_MASK : 0;
5775         /* Is it a write fault? */
5776         error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
5777                       ? PFERR_WRITE_MASK : 0;
5778         /* Is it a fetch fault? */
5779         error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
5780                       ? PFERR_FETCH_MASK : 0;
5781         /* Is the EPT page table entry present? */
5782         error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
5783                       ? PFERR_PRESENT_MASK : 0;
5784
5785         error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
5786                PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
5787
5788         vcpu->arch.exit_qualification = exit_qualification;
5789
5790         /*
5791          * Check that the GPA doesn't exceed physical memory limits, as that is
5792          * a guest page fault.  We have to emulate the instruction here, because
5793          * if the illegal address is that of a paging structure, then the
5794          * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
5795          * would also use advanced VM-exit information for EPT violations to
5796          * reconstruct the page fault error code.
5797          */
5798         if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
5799                 return kvm_emulate_instruction(vcpu, 0);
5800
5801         return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5802 }
5803
5804 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5805 {
5806         gpa_t gpa;
5807
5808         if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
5809                 return 1;
5810
5811         /*
5812          * A nested guest cannot optimize MMIO vmexits, because we have an
5813          * nGPA here instead of the required GPA.
5814          */
5815         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5816         if (!is_guest_mode(vcpu) &&
5817             !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5818                 trace_kvm_fast_mmio(gpa);
5819                 return kvm_skip_emulated_instruction(vcpu);
5820         }
5821
5822         return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
5823 }
5824
5825 static int handle_nmi_window(struct kvm_vcpu *vcpu)
5826 {
5827         if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
5828                 return -EIO;
5829
5830         exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5831         ++vcpu->stat.nmi_window_exits;
5832         kvm_make_request(KVM_REQ_EVENT, vcpu);
5833
5834         return 1;
5835 }
5836
5837 static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
5838 {
5839         struct vcpu_vmx *vmx = to_vmx(vcpu);
5840
5841         return vmx->emulation_required && !vmx->rmode.vm86_active &&
5842                (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
5843 }
5844
5845 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5846 {
5847         struct vcpu_vmx *vmx = to_vmx(vcpu);
5848         bool intr_window_requested;
5849         unsigned count = 130;
5850
5851         intr_window_requested = exec_controls_get(vmx) &
5852                                 CPU_BASED_INTR_WINDOW_EXITING;
5853
5854         while (vmx->emulation_required && count-- != 0) {
5855                 if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
5856                         return handle_interrupt_window(&vmx->vcpu);
5857
5858                 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
5859                         return 1;
5860
5861                 if (!kvm_emulate_instruction(vcpu, 0))
5862                         return 0;
5863
5864                 if (vmx_emulation_required_with_pending_exception(vcpu)) {
5865                         kvm_prepare_emulation_failure_exit(vcpu);
5866                         return 0;
5867                 }
5868
5869                 if (vcpu->arch.halt_request) {
5870                         vcpu->arch.halt_request = 0;
5871                         return kvm_emulate_halt_noskip(vcpu);
5872                 }
5873
5874                 /*
5875                  * Note, return 1 and not 0, vcpu_run() will invoke
5876                  * xfer_to_guest_mode() which will create a proper return
5877                  * code.
5878                  */
5879                 if (__xfer_to_guest_mode_work_pending())
5880                         return 1;
5881         }
5882
5883         return 1;
5884 }
5885
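/*
 * Invalid guest state can't be emulated while an exception is pending or
 * injected; report an emulation failure to userspace instead of entering
 * the guest.
 */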
5886 static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
5887 {
5888         if (vmx_emulation_required_with_pending_exception(vcpu)) {
5889                 kvm_prepare_emulation_failure_exit(vcpu);
5890                 return 0;
5891         }
5892
5893         return 1;
5894 }
5895
5896 static void grow_ple_window(struct kvm_vcpu *vcpu)
5897 {
5898         struct vcpu_vmx *vmx = to_vmx(vcpu);
5899         unsigned int old = vmx->ple_window;
5900
5901         vmx->ple_window = __grow_ple_window(old, ple_window,
5902                                             ple_window_grow,
5903                                             ple_window_max);
5904
5905         if (vmx->ple_window != old) {
5906                 vmx->ple_window_dirty = true;
5907                 trace_kvm_ple_window_update(vcpu->vcpu_id,
5908                                             vmx->ple_window, old);
5909         }
5910 }
5911
5912 static void shrink_ple_window(struct kvm_vcpu *vcpu)
5913 {
5914         struct vcpu_vmx *vmx = to_vmx(vcpu);
5915         unsigned int old = vmx->ple_window;
5916
5917         vmx->ple_window = __shrink_ple_window(old, ple_window,
5918                                               ple_window_shrink,
5919                                               ple_window);
5920
5921         if (vmx->ple_window != old) {
5922                 vmx->ple_window_dirty = true;
5923                 trace_kvm_ple_window_update(vcpu->vcpu_id,
5924                                             vmx->ple_window, old);
5925         }
5926 }
5927
5928 /*
5929  * Indicate a vcpu that is busy-waiting on a spinlock.  We do not enable PAUSE
5930  * exiting, so we only get here on a CPU with PAUSE-Loop-Exiting.
5931  */
5932 static int handle_pause(struct kvm_vcpu *vcpu)
5933 {
5934         if (!kvm_pause_in_guest(vcpu->kvm))
5935                 grow_ple_window(vcpu);
5936
5937         /*
5938          * The Intel SDM, Vol. 3, Section 25.1.3, says: the "PAUSE-loop exiting"
5939          * VM-execution control is ignored if CPL > 0.  OTOH, KVM never sets
5940          * PAUSE_EXITING and only sets PLE if supported, so the vcpu must be at
5941          * CPL=0 if it gets a PAUSE exit.
5942          */
5943         kvm_vcpu_on_spin(vcpu, true);
5944         return kvm_skip_emulated_instruction(vcpu);
5945 }
5946
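/* Nothing to emulate for a Monitor Trap Flag exit; simply resume the guest. */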
5947 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
5948 {
5949         return 1;
5950 }
5951
5952 static int handle_invpcid(struct kvm_vcpu *vcpu)
5953 {
5954         u32 vmx_instruction_info;
5955         unsigned long type;
5956         gva_t gva;
5957         struct {
5958                 u64 pcid;
5959                 u64 gla;
5960         } operand;
5961         int gpr_index;
5962
5963         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
5964                 kvm_queue_exception(vcpu, UD_VECTOR);
5965                 return 1;
5966         }
5967
5968         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5969         gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5970         type = kvm_register_read(vcpu, gpr_index);
5971
5972         /* According to the Intel instruction reference, the memory operand
5973          * is read even if it isn't needed (e.g., for type==all)
5974          */
5975         if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5976                                 vmx_instruction_info, false,
5977                                 sizeof(operand), &gva))
5978                 return 1;
5979
5980         return kvm_handle_invpcid(vcpu, type, gva);
5981 }
5982
5983 static int handle_pml_full(struct kvm_vcpu *vcpu)
5984 {
5985         unsigned long exit_qualification;
5986
5987         trace_kvm_pml_full(vcpu->vcpu_id);
5988
5989         exit_qualification = vmx_get_exit_qual(vcpu);
5990
5991         /*
5992          * If the PML buffer FULL exit happened while executing IRET from NMI,
5993          * the "blocked by NMI" bit has to be set before the next VM entry.
5994          */
5995         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5996                         enable_vnmi &&
5997                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5998                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5999                                 GUEST_INTR_STATE_NMI);
6000
6001         /*
6002          * The PML buffer was already flushed at the beginning of the VMEXIT.
6003          * Nothing to do here, and there's no userspace involvement needed for PML.
6004          */
6005         return 1;
6006 }
6007
6008 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
6009 {
6010         struct vcpu_vmx *vmx = to_vmx(vcpu);
6011
6012         if (!vmx->req_immediate_exit &&
6013             !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
6014                 kvm_lapic_expired_hv_timer(vcpu);
6015                 return EXIT_FASTPATH_REENTER_GUEST;
6016         }
6017
6018         return EXIT_FASTPATH_NONE;
6019 }
6020
6021 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
6022 {
6023         handle_fastpath_preemption_timer(vcpu);
6024         return 1;
6025 }
6026
6027 /*
6028  * When nested=0, all VMX instruction VM Exits land here.  The handlers
6029  * are overwritten by nested_vmx_setup() when nested=1.
6030  */
6031 static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
6032 {
6033         kvm_queue_exception(vcpu, UD_VECTOR);
6034         return 1;
6035 }
6036
6037 #ifndef CONFIG_X86_SGX_KVM
6038 static int handle_encls(struct kvm_vcpu *vcpu)
6039 {
6040         /*
6041          * SGX virtualization is disabled.  There is no software enable bit for
6042          * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
6043          * the guest from executing ENCLS (when SGX is supported by hardware).
6044          */
6045         kvm_queue_exception(vcpu, UD_VECTOR);
6046         return 1;
6047 }
6048 #endif /* CONFIG_X86_SGX_KVM */
6049
6050 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
6051 {
6052         /*
6053          * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
6054          * VM-Exits. Unconditionally set the flag here and leave the handling to
6055          * vmx_handle_exit().
6056          */
6057         to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
6058         return 1;
6059 }
6060
6061 static int handle_notify(struct kvm_vcpu *vcpu)
6062 {
6063         unsigned long exit_qual = vmx_get_exit_qual(vcpu);
6064         bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
6065
6066         ++vcpu->stat.notify_window_exits;
6067
6068         /*
6069          * If the Notify VM exit happened while executing IRET from NMI, the
6070          * "blocked by NMI" bit has to be set before the next VM entry.
6071          */
6072         if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
6073                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6074                               GUEST_INTR_STATE_NMI);
6075
6076         if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
6077             context_invalid) {
6078                 vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
6079                 vcpu->run->notify.flags = context_invalid ?
6080                                           KVM_NOTIFY_CONTEXT_INVALID : 0;
6081                 return 0;
6082         }
6083
6084         return 1;
6085 }
6086
6087 /*
6088  * The exit handlers return 1 if the exit was handled fully and guest execution
6089  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
6090  * to be done to userspace and return 0.
6091  */
6092 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6093         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
6094         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
6095         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
6096         [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
6097         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
6098         [EXIT_REASON_CR_ACCESS]               = handle_cr,
6099         [EXIT_REASON_DR_ACCESS]               = handle_dr,
6100         [EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
6101         [EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
6102         [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
6103         [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
6104         [EXIT_REASON_HLT]                     = kvm_emulate_halt,
6105         [EXIT_REASON_INVD]                    = kvm_emulate_invd,
6106         [EXIT_REASON_INVLPG]                  = handle_invlpg,
6107         [EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
6108         [EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
6109         [EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
6110         [EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
6111         [EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
6112         [EXIT_REASON_VMPTRST]                 = handle_vmx_instruction,
6113         [EXIT_REASON_VMREAD]                  = handle_vmx_instruction,
6114         [EXIT_REASON_VMRESUME]                = handle_vmx_instruction,
6115         [EXIT_REASON_VMWRITE]                 = handle_vmx_instruction,
6116         [EXIT_REASON_VMOFF]                   = handle_vmx_instruction,
6117         [EXIT_REASON_VMON]                    = handle_vmx_instruction,
6118         [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
6119         [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
6120         [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
6121         [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
6122         [EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
6123         [EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
6124         [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
6125         [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
6126         [EXIT_REASON_GDTR_IDTR]               = handle_desc,
6127         [EXIT_REASON_LDTR_TR]                 = handle_desc,
6128         [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
6129         [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
6130         [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
6131         [EXIT_REASON_MWAIT_INSTRUCTION]       = kvm_emulate_mwait,
6132         [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
6133         [EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
6134         [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
6135         [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
6136         [EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
6137         [EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
6138         [EXIT_REASON_PML_FULL]                = handle_pml_full,
6139         [EXIT_REASON_INVPCID]                 = handle_invpcid,
6140         [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
6141         [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
6142         [EXIT_REASON_ENCLS]                   = handle_encls,
6143         [EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
6144         [EXIT_REASON_NOTIFY]                  = handle_notify,
6145 };
6146
6147 static const int kvm_vmx_max_exit_handlers =
6148         ARRAY_SIZE(kvm_vmx_exit_handlers);
6149
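/*
 * Harvest exit information for tracing and userspace; the vectoring and
 * interrupt info are reported as 0 if the VM-Entry itself failed.
 */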
6150 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
6151                               u64 *info1, u64 *info2,
6152                               u32 *intr_info, u32 *error_code)
6153 {
6154         struct vcpu_vmx *vmx = to_vmx(vcpu);
6155
6156         *reason = vmx->exit_reason.full;
6157         *info1 = vmx_get_exit_qual(vcpu);
6158         if (!(vmx->exit_reason.failed_vmentry)) {
6159                 *info2 = vmx->idt_vectoring_info;
6160                 *intr_info = vmx_get_intr_info(vcpu);
6161                 if (is_exception_with_error_code(*intr_info))
6162                         *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6163                 else
6164                         *error_code = 0;
6165         } else {
6166                 *info2 = 0;
6167                 *intr_info = 0;
6168                 *error_code = 0;
6169         }
6170 }
6171
6172 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
6173 {
6174         if (vmx->pml_pg) {
6175                 __free_page(vmx->pml_pg);
6176                 vmx->pml_pg = NULL;
6177         }
6178 }
6179
6180 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
6181 {
6182         struct vcpu_vmx *vmx = to_vmx(vcpu);
6183         u64 *pml_buf;
6184         u16 pml_idx;
6185
6186         pml_idx = vmcs_read16(GUEST_PML_INDEX);
6187
6188         /* Do nothing if PML buffer is empty */
6189         if (pml_idx == (PML_ENTITY_NUM - 1))
6190                 return;
6191
6192         /* PML index always points to next available PML buffer entity */
6193         if (pml_idx >= PML_ENTITY_NUM)
6194                 pml_idx = 0;
6195         else
6196                 pml_idx++;
6197
6198         pml_buf = page_address(vmx->pml_pg);
6199         for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
6200                 u64 gpa;
6201
6202                 gpa = pml_buf[pml_idx];
6203                 WARN_ON(gpa & (PAGE_SIZE - 1));
6204                 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
6205         }
6206
6207         /* reset PML index */
6208         vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
6209 }
6210
6211 static void vmx_dump_sel(char *name, uint32_t sel)
6212 {
6213         pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
6214                name, vmcs_read16(sel),
6215                vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
6216                vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
6217                vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
6218 }
6219
6220 static void vmx_dump_dtsel(char *name, uint32_t limit)
6221 {
6222         pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
6223                name, vmcs_read32(limit),
6224                vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
6225 }
6226
6227 static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
6228 {
6229         unsigned int i;
6230         struct vmx_msr_entry *e;
6231
6232         pr_err("MSR %s:\n", name);
6233         for (i = 0, e = m->val; i < m->nr; ++i, ++e)
6234                 pr_err("  %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
6235 }
6236
6237 void dump_vmcs(struct kvm_vcpu *vcpu)
6238 {
6239         struct vcpu_vmx *vmx = to_vmx(vcpu);
6240         u32 vmentry_ctl, vmexit_ctl;
6241         u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
6242         u64 tertiary_exec_control;
6243         unsigned long cr4;
6244         int efer_slot;
6245
6246         if (!dump_invalid_vmcs) {
6247                 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
6248                 return;
6249         }
6250
6251         vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
6252         vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
6253         cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6254         pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
6255         cr4 = vmcs_readl(GUEST_CR4);
6256
6257         if (cpu_has_secondary_exec_ctrls())
6258                 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6259         else
6260                 secondary_exec_control = 0;
6261
6262         if (cpu_has_tertiary_exec_ctrls())
6263                 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
6264         else
6265                 tertiary_exec_control = 0;
6266
6267         pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
6268                vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
6269         pr_err("*** Guest State ***\n");
6270         pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6271                vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
6272                vmcs_readl(CR0_GUEST_HOST_MASK));
6273         pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6274                cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
6275         pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
6276         if (cpu_has_vmx_ept()) {
6277                 pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
6278                        vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
6279                 pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
6280                        vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
6281         }
6282         pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
6283                vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
6284         pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
6285                vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
6286         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6287                vmcs_readl(GUEST_SYSENTER_ESP),
6288                vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
6289         vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
6290         vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
6291         vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
6292         vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
6293         vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
6294         vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
6295         vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
6296         vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
6297         vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
6298         vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
6299         efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
6300         if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
6301                 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
6302         else if (efer_slot >= 0)
6303                 pr_err("EFER= 0x%016llx (autoload)\n",
6304                        vmx->msr_autoload.guest.val[efer_slot].value);
6305         else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
6306                 pr_err("EFER= 0x%016llx (effective)\n",
6307                        vcpu->arch.efer | (EFER_LMA | EFER_LME));
6308         else
6309                 pr_err("EFER= 0x%016llx (effective)\n",
6310                        vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
6311         if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
6312                 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
6313         pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
6314                vmcs_read64(GUEST_IA32_DEBUGCTL),
6315                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
6316         if (cpu_has_load_perf_global_ctrl() &&
6317             vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
6318                 pr_err("PerfGlobCtl = 0x%016llx\n",
6319                        vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
6320         if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
6321                 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
6322         pr_err("Interruptibility = %08x  ActivityState = %08x\n",
6323                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
6324                vmcs_read32(GUEST_ACTIVITY_STATE));
6325         if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
6326                 pr_err("InterruptStatus = %04x\n",
6327                        vmcs_read16(GUEST_INTR_STATUS));
6328         if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
6329                 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
6330         if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
6331                 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
6332
6333         pr_err("*** Host State ***\n");
6334         pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
6335                vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6336         pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6337                vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6338                vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6339                vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6340                vmcs_read16(HOST_TR_SELECTOR));
6341         pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6342                vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6343                vmcs_readl(HOST_TR_BASE));
6344         pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6345                vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6346         pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6347                vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6348                vmcs_readl(HOST_CR4));
6349         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6350                vmcs_readl(HOST_IA32_SYSENTER_ESP),
6351                vmcs_read32(HOST_IA32_SYSENTER_CS),
6352                vmcs_readl(HOST_IA32_SYSENTER_EIP));
6353         if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
6354                 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
6355         if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
6356                 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
6357         if (cpu_has_load_perf_global_ctrl() &&
6358             vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6359                 pr_err("PerfGlobCtl = 0x%016llx\n",
6360                        vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
6361         if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
6362                 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
6363
6364         pr_err("*** Control State ***\n");
6365         pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
6366                cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
6367         pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
6368                pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
6369         pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6370                vmcs_read32(EXCEPTION_BITMAP),
6371                vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6372                vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6373         pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6374                vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6375                vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6376                vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6377         pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6378                vmcs_read32(VM_EXIT_INTR_INFO),
6379                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6380                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6381         pr_err("        reason=%08x qualification=%016lx\n",
6382                vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6383         pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6384                vmcs_read32(IDT_VECTORING_INFO_FIELD),
6385                vmcs_read32(IDT_VECTORING_ERROR_CODE));
6386         pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6387         if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6388                 pr_err("TSC Multiplier = 0x%016llx\n",
6389                        vmcs_read64(TSC_MULTIPLIER));
6390         if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6391                 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6392                         u16 status = vmcs_read16(GUEST_INTR_STATUS);
6393                         pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
6394                 }
6395                 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
6396                 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6397                         pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
6398                 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
6399         }
6400         if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6401                 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6402         if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6403                 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
6404         if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6405                 pr_err("PLE Gap=%08x Window=%08x\n",
6406                        vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6407         if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6408                 pr_err("Virtual processor ID = 0x%04x\n",
6409                        vmcs_read16(VIRTUAL_PROCESSOR_ID));
6410 }
6411
6412 /*
6413  * The guest has exited.  See if we can fix it or if we need userspace
6414  * assistance.
6415  */
6416 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6417 {
6418         struct vcpu_vmx *vmx = to_vmx(vcpu);
6419         union vmx_exit_reason exit_reason = vmx->exit_reason;
6420         u32 vectoring_info = vmx->idt_vectoring_info;
6421         u16 exit_handler_index;
6422
6423         /*
6424          * Flush the PML buffer of logged GPAs; this keeps dirty_bitmap up to
6425          * date.  Another benefit is that, in kvm_vm_ioctl_get_dirty_log, before
6426          * querying dirty_bitmap, we only need to kick all vcpus out of guest
6427          * mode: if a vcpu is in root mode, its PML buffer must have been
6428          * flushed already.  Note, PML is never enabled in hardware while
6429          * running L2.
6430          */
6431         if (enable_pml && !is_guest_mode(vcpu))
6432                 vmx_flush_pml_buffer(vcpu);
6433
6434         /*
6435          * KVM should never reach this point with a pending nested VM-Enter.
6436          * More specifically, short-circuiting VM-Entry to emulate L2 due to
6437          * invalid guest state should never happen as that means KVM knowingly
6438          * allowed a nested VM-Enter with an invalid vmcs12.  More below.
6439          */
6440         if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
6441                 return -EIO;
6442
6443         if (is_guest_mode(vcpu)) {
6444                 /*
6445                  * PML is never enabled when running L2; bail immediately if a
6446                  * PML full exit occurs, as something is horribly wrong.
6447                  */
6448                 if (exit_reason.basic == EXIT_REASON_PML_FULL)
6449                         goto unexpected_vmexit;
6450
6451                 /*
6452                  * The host physical addresses of some pages of guest memory
6453                  * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6454                  * Page). The CPU may write to these pages via their host
6455                  * physical address while L2 is running, bypassing any
6456                  * address-translation-based dirty tracking (e.g. EPT write
6457                  * protection).
6458                  *
6459                  * Mark them dirty on every exit from L2 to prevent them from
6460                  * getting out of sync with dirty tracking.
6461                  */
6462                 nested_mark_vmcs12_pages_dirty(vcpu);
6463
6464                 /*
6465                  * Synthesize a triple fault if L2 state is invalid.  In normal
6466                  * operation, nested VM-Enter rejects any attempt to enter L2
6467                  * with invalid state.  However, those checks are skipped if
6468                  * state is being stuffed via RSM or KVM_SET_NESTED_STATE.  If
6469                  * L2 state is invalid, it means either L1 modified SMRAM state
6470                  * or userspace provided bad state.  Synthesize TRIPLE_FAULT as
6471                  * doing so is architecturally allowed in the RSM case, and is
6472                  * the least awful solution for the userspace case without
6473                  * risking false positives.
6474                  */
6475                 if (vmx->emulation_required) {
6476                         nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
6477                         return 1;
6478                 }
6479
6480                 if (nested_vmx_reflect_vmexit(vcpu))
6481                         return 1;
6482         }
6483
6484         /* If guest state is invalid, start emulating.  L2 is handled above. */
6485         if (vmx->emulation_required)
6486                 return handle_invalid_guest_state(vcpu);
6487
6488         if (exit_reason.failed_vmentry) {
6489                 dump_vmcs(vcpu);
6490                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6491                 vcpu->run->fail_entry.hardware_entry_failure_reason
6492                         = exit_reason.full;
6493                 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6494                 return 0;
6495         }
6496
6497         if (unlikely(vmx->fail)) {
6498                 dump_vmcs(vcpu);
6499                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6500                 vcpu->run->fail_entry.hardware_entry_failure_reason
6501                         = vmcs_read32(VM_INSTRUCTION_ERROR);
6502                 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6503                 return 0;
6504         }
6505
6506         /*
6507          * Note:
6508          * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by a
6509          * delivery event, since that indicates the guest is accessing MMIO.
6510          * The VM-exit can be triggered again after returning to the guest,
6511          * which would cause an infinite loop.
6512          */
6513         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6514             (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6515              exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6516              exit_reason.basic != EXIT_REASON_PML_FULL &&
6517              exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6518              exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6519              exit_reason.basic != EXIT_REASON_NOTIFY)) {
6520                 int ndata = 3;
6521
6522                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6523                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6524                 vcpu->run->internal.data[0] = vectoring_info;
6525                 vcpu->run->internal.data[1] = exit_reason.full;
6526                 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
6527                 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
6528                         vcpu->run->internal.data[ndata++] =
6529                                 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6530                 }
6531                 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6532                 vcpu->run->internal.ndata = ndata;
6533                 return 0;
6534         }
6535
6536         if (unlikely(!enable_vnmi &&
6537                      vmx->loaded_vmcs->soft_vnmi_blocked)) {
6538                 if (!vmx_interrupt_blocked(vcpu)) {
6539                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6540                 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6541                            vcpu->arch.nmi_pending) {
6542                         /*
6543                          * This CPU doesn't let us find the end of an
6544                          * NMI-blocked window if the guest runs with IRQs
6545                          * disabled.  So pull the trigger after 1 s of
6546                          * futile waiting, but inform the user about it.
6547                          */
6548                         printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6549                                "state on VCPU %d after 1 s timeout\n",
6550                                __func__, vcpu->vcpu_id);
6551                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6552                 }
6553         }
6554
6555         if (exit_fastpath != EXIT_FASTPATH_NONE)
6556                 return 1;
6557
6558         if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
6559                 goto unexpected_vmexit;
6560 #ifdef CONFIG_RETPOLINE
6561         if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6562                 return kvm_emulate_wrmsr(vcpu);
6563         else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
6564                 return handle_preemption_timer(vcpu);
6565         else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
6566                 return handle_interrupt_window(vcpu);
6567         else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6568                 return handle_external_interrupt(vcpu);
6569         else if (exit_reason.basic == EXIT_REASON_HLT)
6570                 return kvm_emulate_halt(vcpu);
6571         else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
6572                 return handle_ept_misconfig(vcpu);
6573 #endif
6574
6575         exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6576                                                 kvm_vmx_max_exit_handlers);
6577         if (!kvm_vmx_exit_handlers[exit_handler_index])
6578                 goto unexpected_vmexit;
6579
6580         return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
6581
6582 unexpected_vmexit:
6583         vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6584                     exit_reason.full);
6585         dump_vmcs(vcpu);
6586         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6587         vcpu->run->internal.suberror =
6588                         KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
6589         vcpu->run->internal.ndata = 2;
6590         vcpu->run->internal.data[0] = exit_reason.full;
6591         vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
6592         return 0;
6593 }
6594
6595 static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6596 {
6597         int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6598
6599         /*
6600          * Exit to userspace when a bus lock is detected, to report that there
6601          * is a bus lock in the guest.
6602          */
6603         if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
6604                 if (ret > 0)
6605                         vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
6606
6607                 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
6608                 return 0;
6609         }
6610         return ret;
6611 }
6612
6613 /*
6614  * Software-based L1D cache flush, used when the microcode providing the
6615  * cache control MSR is not loaded.
6616  *
6617  * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
6618  * flushing it requires reading in 64 KiB because the replacement algorithm
6619  * is not exactly LRU.  This could be sized at runtime via topology
6620  * information, but as all relevant affected CPUs have a 32 KiB L1D cache
6621  * there is no point in doing so.
6622  */
6623 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
6624 {
6625         int size = PAGE_SIZE << L1D_CACHE_ORDER;
6626
6627         /*
6628          * This code is only executed when the flush mode is 'cond' or
6629          * 'always'
6630          */
6631         if (static_branch_likely(&vmx_l1d_flush_cond)) {
6632                 bool flush_l1d;
6633
6634                 /*
6635                  * Clear the per-vcpu flush bit; it gets set again
6636                  * either from vcpu_run() or from one of the unsafe
6637                  * VMEXIT handlers.
6638                  */
6639                 flush_l1d = vcpu->arch.l1tf_flush_l1d;
6640                 vcpu->arch.l1tf_flush_l1d = false;
6641
6642                 /*
6643                  * Clear the per-cpu flush bit; it gets set again from
6644                  * the interrupt handlers.
6645                  */
6646                 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
6647                 kvm_clear_cpu_l1tf_flush_l1d();
6648
6649                 if (!flush_l1d)
6650                         return;
6651         }
6652
6653         vcpu->stat.l1d_flush++;
6654
6655         if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
6656                 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
6657                 return;
6658         }
6659
6660         asm volatile(
6661                 /* First ensure the pages are in the TLB */
6662                 "xorl   %%eax, %%eax\n"
6663                 ".Lpopulate_tlb:\n\t"
6664                 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6665                 "addl   $4096, %%eax\n\t"
6666                 "cmpl   %%eax, %[size]\n\t"
6667                 "jne    .Lpopulate_tlb\n\t"
6668                 "xorl   %%eax, %%eax\n\t"
6669                 "cpuid\n\t"
6670                 /* Now fill the cache */
6671                 "xorl   %%eax, %%eax\n"
6672                 ".Lfill_cache:\n"
6673                 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6674                 "addl   $64, %%eax\n\t"
6675                 "cmpl   %%eax, %[size]\n\t"
6676                 "jne    .Lfill_cache\n\t"
6677                 "lfence\n"
6678                 :: [flush_pages] "r" (vmx_l1d_flush_pages),
6679                     [size] "r" (size)
6680                 : "eax", "ebx", "ecx", "edx");
6681 }
6682
6683 static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6684 {
6685         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6686         int tpr_threshold;
6687
6688         if (is_guest_mode(vcpu) &&
6689                 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6690                 return;
6691
6692         tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6693         if (is_guest_mode(vcpu))
6694                 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6695         else
6696                 vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6697 }
6698
6699 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6700 {
6701         struct vcpu_vmx *vmx = to_vmx(vcpu);
6702         u32 sec_exec_control;
6703
6704         if (!lapic_in_kernel(vcpu))
6705                 return;
6706
6707         if (!flexpriority_enabled &&
6708             !cpu_has_vmx_virtualize_x2apic_mode())
6709                 return;
6710
6711         /* Postpone execution until vmcs01 is the current VMCS. */
6712         if (is_guest_mode(vcpu)) {
6713                 vmx->nested.change_vmcs01_virtual_apic_mode = true;
6714                 return;
6715         }
6716
6717         sec_exec_control = secondary_exec_controls_get(vmx);
6718         sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6719                               SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6720
6721         switch (kvm_get_apic_mode(vcpu)) {
6722         case LAPIC_MODE_INVALID:
6723                 WARN_ONCE(true, "Invalid local APIC state");
6724                 break;
6725         case LAPIC_MODE_DISABLED:
6726                 break;
6727         case LAPIC_MODE_XAPIC:
6728                 if (flexpriority_enabled) {
6729                         sec_exec_control |=
6730                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6731                         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6732
6733                         /*
6734                          * Flush the TLB; reloading the APIC access page will
6735                          * only do so if its physical address has changed, but
6736                          * the guest may have inserted a non-APIC mapping into
6737                          * the TLB while the APIC access page was disabled.
6738                          */
6739                         kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
6740                 }
6741                 break;
6742         case LAPIC_MODE_X2APIC:
6743                 if (cpu_has_vmx_virtualize_x2apic_mode())
6744                         sec_exec_control |=
6745                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6746                 break;
6747         }
6748         secondary_exec_controls_set(vmx, sec_exec_control);
6749
6750         vmx_update_msr_bitmap_x2apic(vcpu);
6751 }
6752
6753 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
6754 {
6755         struct page *page;
6756
6757         /* Defer reload until vmcs01 is the current VMCS. */
6758         if (is_guest_mode(vcpu)) {
6759                 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
6760                 return;
6761         }
6762
6763         if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
6764             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6765                 return;
6766
6767         page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
6768         if (is_error_page(page))
6769                 return;
6770
6771         vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
6772         vmx_flush_tlb_current(vcpu);
6773
6774         /*
6775          * Do not pin the APIC access page in memory; the MMU notifier
6776          * will call us again if it is migrated or swapped out.
6777          */
6778         put_page(page);
6779 }
6780
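/*
 * The guest interrupt status field is {SVI, RVI}: bits 15:8 hold the highest
 * in-service vector (SVI), bits 7:0 the highest pending requested vector
 * (RVI) used by virtual interrupt delivery.
 */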
6781 static void vmx_hwapic_isr_update(int max_isr)
6782 {
6783         u16 status;
6784         u8 old;
6785
6786         if (max_isr == -1)
6787                 max_isr = 0;
6788
6789         status = vmcs_read16(GUEST_INTR_STATUS);
6790         old = status >> 8;
6791         if (max_isr != old) {
6792                 status &= 0xff;
6793                 status |= max_isr << 8;
6794                 vmcs_write16(GUEST_INTR_STATUS, status);
6795         }
6796 }
6797
6798 static void vmx_set_rvi(int vector)
6799 {
6800         u16 status;
6801         u8 old;
6802
6803         if (vector == -1)
6804                 vector = 0;
6805
6806         status = vmcs_read16(GUEST_INTR_STATUS);
6807         old = (u8)status & 0xff;
6808         if ((u8)vector != old) {
6809                 status &= ~0xff;
6810                 status |= (u8)vector;
6811                 vmcs_write16(GUEST_INTR_STATUS, status);
6812         }
6813 }
6814
6815 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6816 {
6817         /*
6818          * When running L2, updating RVI is only relevant when
6819          * vmcs12 has virtual-interrupt-delivery enabled.
6820          * However, that can be enabled only when L1 also
6821          * intercepts external interrupts, and in that case
6822          * we should not update vmcs02's RVI but instead intercept
6823          * the interrupt.  Therefore, do nothing when running L2.
6824          */
6825         if (!is_guest_mode(vcpu))
6826                 vmx_set_rvi(max_irr);
6827 }
6828
6829 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6830 {
6831         struct vcpu_vmx *vmx = to_vmx(vcpu);
6832         int max_irr;
6833         bool got_posted_interrupt;
6834
6835         if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
6836                 return -EIO;
6837
6838         if (pi_test_on(&vmx->pi_desc)) {
6839                 pi_clear_on(&vmx->pi_desc);
6840                 /*
6841                  * IOMMU can write to PID.ON, so the barrier matters even on UP.
6842                  * But on x86 this is just a compiler barrier anyway.
6843                  */
6844                 smp_mb__after_atomic();
6845                 got_posted_interrupt =
6846                         kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
6847         } else {
6848                 max_irr = kvm_lapic_find_highest_irr(vcpu);
6849                 got_posted_interrupt = false;
6850         }
6851
6852         /*
6853          * Newly recognized interrupts are injected via either virtual interrupt
6854          * delivery (RVI) or KVM_REQ_EVENT.  Virtual interrupt delivery is
6855          * disabled in two cases:
6856          *
6857          * 1) If L2 is running and the vCPU has a new pending interrupt.  If L1
6858          * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
6859          * VM-Exit to L1.  If L1 doesn't want to exit, the interrupt is injected
6860          * into L2, but KVM doesn't use virtual interrupt delivery to inject
6861          * interrupts into L2, and so KVM_REQ_EVENT is again needed.
6862          *
6863          * 2) If APICv is disabled for this vCPU, assigned devices may still
6864          * attempt to post interrupts.  The posted interrupt vector will cause
6865          * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
6866          */
6867         if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
6868                 vmx_set_rvi(max_irr);
6869         else if (got_posted_interrupt)
6870                 kvm_make_request(KVM_REQ_EVENT, vcpu);
6871
6872         return max_irr;
6873 }
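
/*
 * For reference: the posted-interrupt descriptor (vmx->pi_desc) holds a
 * 256-bit PIR bitmap plus control bits, including ON ("outstanding
 * notification").  pi_test_on()/pi_clear_on() operate on the ON bit, and
 * kvm_apic_update_irr() merges the PIR bits into the vAPIC IRR, returning
 * the highest pending vector via max_irr.
 */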
6874
6875 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6876 {
6877         if (!kvm_vcpu_apicv_active(vcpu))
6878                 return;
6879
6880         vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6881         vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6882         vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6883         vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6884 }
6885
6886 static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
6887 {
6888         struct vcpu_vmx *vmx = to_vmx(vcpu);
6889
6890         pi_clear_on(&vmx->pi_desc);
6891         memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
6892 }
6893
6894 void vmx_do_interrupt_irqoff(unsigned long entry);
6895 void vmx_do_nmi_irqoff(void);
6896
6897 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
6898 {
6899         /*
6900          * Save xfd_err to guest_fpu before interrupts are enabled, so the
6901          * MSR value is not clobbered by host activity before the guest
6902          * has a chance to consume it.
6903          *
6904          * Do not blindly read xfd_err here, since this exception might
6905          * be caused by L1 interception on a platform which doesn't
6906          * support xfd at all.
6907          *
6908          * Do it conditionally upon guest_fpu::xfd. xfd_err matters
6909          * only when xfd contains a non-zero value.
6910          *
6911          * Queuing the exception is done in vmx_handle_exit. See the comment there.
6912          */
6913         if (vcpu->arch.guest_fpu.fpstate->xfd)
6914                 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
6915 }
6916
6917 static void handle_exception_irqoff(struct vcpu_vmx *vmx)
6918 {
6919         u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
6920
6921         /* if exit due to PF check for async PF */
6922         if (is_page_fault(intr_info))
6923                 vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
6924         /* if exit due to NM, handle before interrupts are enabled */
6925         else if (is_nm_fault(intr_info))
6926                 handle_nm_fault_irqoff(&vmx->vcpu);
6927         /* Handle machine checks before interrupts are enabled */
6928         else if (is_machine_check(intr_info))
6929                 kvm_machine_check();
6930 }
6931
6932 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
6933 {
6934         u32 intr_info = vmx_get_intr_info(vcpu);
6935         unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
6936         gate_desc *desc = (gate_desc *)host_idt_base + vector;
6937
6938         if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
6939             "unexpected VM-Exit interrupt info: 0x%x", intr_info))
6940                 return;
6941
6942         kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
6943         vmx_do_interrupt_irqoff(gate_offset(desc));
6944         kvm_after_interrupt(vcpu);
6945
6946         vcpu->arch.at_instruction_boundary = true;
6947 }
6948
6949 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
6950 {
6951         struct vcpu_vmx *vmx = to_vmx(vcpu);
6952
6953         if (vmx->emulation_required)
6954                 return;
6955
6956         if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6957                 handle_external_interrupt_irqoff(vcpu);
6958         else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
6959                 handle_exception_irqoff(vmx);
6960 }
6961
6962 /*
6963  * The kvm parameter can be NULL (module initialization, or invocation before
6964  * VM creation). Be sure to check the kvm parameter before using it.
6965  */
6966 static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
6967 {
6968         switch (index) {
6969         case MSR_IA32_SMBASE:
6970                 if (!IS_ENABLED(CONFIG_KVM_SMM))
6971                         return false;
6972                 /*
6973                  * We cannot do SMM unless we can run the guest in big
6974                  * real mode.
6975                  */
6976                 return enable_unrestricted_guest || emulate_invalid_guest_state;
6977         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
6978                 return nested;
6979         case MSR_AMD64_VIRT_SPEC_CTRL:
6980         case MSR_AMD64_TSC_RATIO:
6981                 /* This is AMD only.  */
6982                 return false;
6983         default:
6984                 return true;
6985         }
6986 }
6987
6988 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6989 {
6990         u32 exit_intr_info;
6991         bool unblock_nmi;
6992         u8 vector;
6993         bool idtv_info_valid;
6994
6995         idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6996
6997         if (enable_vnmi) {
6998                 if (vmx->loaded_vmcs->nmi_known_unmasked)
6999                         return;
7000
7001                 exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
7002                 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
7003                 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
7004                 /*
7005                  * SDM 3: 27.7.1.2 (September 2008)
7006                  * Re-set bit "block by NMI" before VM entry if vmexit caused by
7007                  * a guest IRET fault.
7008                  * SDM 3: 23.2.2 (September 2008)
7009                  * Bit 12 is undefined in any of the following cases:
7010                  *  If the VM exit sets the valid bit in the IDT-vectoring
7011                  *   information field.
7012                  *  If the VM exit is due to a double fault.
7013                  */
7014                 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
7015                     vector != DF_VECTOR && !idtv_info_valid)
7016                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7017                                       GUEST_INTR_STATE_NMI);
7018                 else
7019                         vmx->loaded_vmcs->nmi_known_unmasked =
7020                                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
7021                                   & GUEST_INTR_STATE_NMI);
7022         } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7023                 vmx->loaded_vmcs->vnmi_blocked_time +=
7024                         ktime_to_ns(ktime_sub(ktime_get(),
7025                                               vmx->loaded_vmcs->entry_time));
7026 }
7027
7028 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7029                                       u32 idt_vectoring_info,
7030                                       int instr_len_field,
7031                                       int error_code_field)
7032 {
7033         u8 vector;
7034         int type;
7035         bool idtv_info_valid;
7036
7037         idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7038
7039         vcpu->arch.nmi_injected = false;
7040         kvm_clear_exception_queue(vcpu);
7041         kvm_clear_interrupt_queue(vcpu);
7042
7043         if (!idtv_info_valid)
7044                 return;
7045
7046         kvm_make_request(KVM_REQ_EVENT, vcpu);
7047
7048         vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7049         type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
7050
7051         switch (type) {
7052         case INTR_TYPE_NMI_INTR:
7053                 vcpu->arch.nmi_injected = true;
7054                 /*
7055                  * SDM 3: 27.7.1.2 (September 2008)
7056                  * Clear bit "block by NMI" before VM entry if an NMI
7057                  * delivery faulted.
7058                  */
7059                 vmx_set_nmi_mask(vcpu, false);
7060                 break;
7061         case INTR_TYPE_SOFT_EXCEPTION:
7062                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7063                 fallthrough;
7064         case INTR_TYPE_HARD_EXCEPTION:
7065                 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
7066                         u32 err = vmcs_read32(error_code_field);
7067                         kvm_requeue_exception_e(vcpu, vector, err);
7068                 } else
7069                         kvm_requeue_exception(vcpu, vector);
7070                 break;
7071         case INTR_TYPE_SOFT_INTR:
7072                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7073                 fallthrough;
7074         case INTR_TYPE_EXT_INTR:
7075                 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
7076                 break;
7077         default:
7078                 break;
7079         }
7080 }
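
/*
 * For reference: the IDT-vectoring info / VM-entry interruption info format
 * decoded above packs the vector into bits 7:0, the event type into bits
 * 10:8, a "deliver error code" flag into bit 11 and a "valid" flag into
 * bit 31, matching the VECTORING_INFO_* and INTR_TYPE_* masks used here.
 */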
7081
7082 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
7083 {
7084         __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
7085                                   VM_EXIT_INSTRUCTION_LEN,
7086                                   IDT_VECTORING_ERROR_CODE);
7087 }
7088
7089 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
7090 {
7091         __vmx_complete_interrupts(vcpu,
7092                                   vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7093                                   VM_ENTRY_INSTRUCTION_LEN,
7094                                   VM_ENTRY_EXCEPTION_ERROR_CODE);
7095
7096         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
7097 }
7098
7099 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
7100 {
7101         int i, nr_msrs;
7102         struct perf_guest_switch_msr *msrs;
7103         struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
7104
7105         pmu->host_cross_mapped_mask = 0;
7106         if (pmu->pebs_enable & pmu->global_ctrl)
7107                 intel_pmu_cross_mapped_check(pmu);
7108
7109         /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
7110         msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
7111         if (!msrs)
7112                 return;
7113
7114         for (i = 0; i < nr_msrs; i++)
7115                 if (msrs[i].host == msrs[i].guest)
7116                         clear_atomic_switch_msr(vmx, msrs[i].msr);
7117                 else
7118                         add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
7119                                         msrs[i].host, false);
7120 }
7121
7122 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
7123 {
7124         struct vcpu_vmx *vmx = to_vmx(vcpu);
7125         u64 tscl;
7126         u32 delta_tsc;
7127
7128         if (vmx->req_immediate_exit) {
7129                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
7130                 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7131         } else if (vmx->hv_deadline_tsc != -1) {
7132                 tscl = rdtsc();
7133                 if (vmx->hv_deadline_tsc > tscl)
7134                         /* set_hv_timer ensures the delta fits in 32 bits */
7135                         delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
7136                                 cpu_preemption_timer_multi);
7137                 else
7138                         delta_tsc = 0;
7139
7140                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
7141                 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7142         } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
7143                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
7144                 vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7145         }
7146 }
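
/*
 * For reference: the VMX-preemption timer counts down in units of
 * 2^cpu_preemption_timer_multi TSC cycles (the rate field from
 * MSR_IA32_VMX_MISC), hence the right shift when converting a TSC delta
 * into a VMX_PREEMPTION_TIMER_VALUE.  E.g. with a rate field of 5 the
 * timer ticks once every 32 TSC cycles (illustrative value).
 */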
7147
7148 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
7149 {
7150         if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
7151                 vmx->loaded_vmcs->host_state.rsp = host_rsp;
7152                 vmcs_writel(HOST_RSP, host_rsp);
7153         }
7154 }
7155
7156 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
7157                                         unsigned int flags)
7158 {
7159         u64 hostval = this_cpu_read(x86_spec_ctrl_current);
7160
7161         if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
7162                 return;
7163
7164         if (flags & VMX_RUN_SAVE_SPEC_CTRL)
7165                 vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
7166
7167         /*
7168          * If the guest/host SPEC_CTRL values differ, restore the host value.
7169          *
7170          * For legacy IBRS, the IBRS bit always needs to be written after
7171          * transitioning from a less privileged predictor mode, regardless of
7172          * whether the guest/host values differ.
7173          */
7174         if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
7175             vmx->spec_ctrl != hostval)
7176                 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
7177
7178         barrier_nospec();
7179 }
7180
7181 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
7182 {
7183         switch (to_vmx(vcpu)->exit_reason.basic) {
7184         case EXIT_REASON_MSR_WRITE:
7185                 return handle_fastpath_set_msr_irqoff(vcpu);
7186         case EXIT_REASON_PREEMPTION_TIMER:
7187                 return handle_fastpath_preemption_timer(vcpu);
7188         default:
7189                 return EXIT_FASTPATH_NONE;
7190         }
7191 }
7192
7193 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
7194                                         unsigned int flags)
7195 {
7196         struct vcpu_vmx *vmx = to_vmx(vcpu);
7197
7198         guest_state_enter_irqoff();
7199
7200         /* L1D Flush includes CPU buffer clear to mitigate MDS */
7201         if (static_branch_unlikely(&vmx_l1d_should_flush))
7202                 vmx_l1d_flush(vcpu);
7203         else if (static_branch_unlikely(&mds_user_clear))
7204                 mds_clear_cpu_buffers();
7205         else if (static_branch_unlikely(&mmio_stale_data_clear) &&
7206                  kvm_arch_has_assigned_device(vcpu->kvm))
7207                 mds_clear_cpu_buffers();
7208
7209         vmx_disable_fb_clear(vmx);
7210
7211         if (vcpu->arch.cr2 != native_read_cr2())
7212                 native_write_cr2(vcpu->arch.cr2);
7213
7214         vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
7215                                    flags);
7216
7217         vcpu->arch.cr2 = native_read_cr2();
7218
7219         vmx_enable_fb_clear(vmx);
7220
7221         if (unlikely(vmx->fail))
7222                 vmx->exit_reason.full = 0xdead;
7223         else
7224                 vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
7225
7226         if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
7227             is_nmi(vmx_get_intr_info(vcpu))) {
7228                 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
7229                 vmx_do_nmi_irqoff();
7230                 kvm_after_interrupt(vcpu);
7231         }
7232
7233         guest_state_exit_irqoff();
7234 }
7235
7236 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
7237 {
7238         struct vcpu_vmx *vmx = to_vmx(vcpu);
7239         unsigned long cr3, cr4;
7240
7241         /* Record the guest's net vcpu time for enforced NMI injections. */
7242         if (unlikely(!enable_vnmi &&
7243                      vmx->loaded_vmcs->soft_vnmi_blocked))
7244                 vmx->loaded_vmcs->entry_time = ktime_get();
7245
7246         /*
7247          * Don't enter VMX if guest state is invalid; let the exit handler
7248          * start emulation until we arrive back to a valid state.  Synthesize a
7249          * consistency check VM-Exit due to invalid guest state and bail.
7250          */
7251         if (unlikely(vmx->emulation_required)) {
7252                 vmx->fail = 0;
7253
7254                 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
7255                 vmx->exit_reason.failed_vmentry = 1;
7256                 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
7257                 vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
7258                 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
7259                 vmx->exit_intr_info = 0;
7260                 return EXIT_FASTPATH_NONE;
7261         }
7262
7263         trace_kvm_entry(vcpu);
7264
7265         if (vmx->ple_window_dirty) {
7266                 vmx->ple_window_dirty = false;
7267                 vmcs_write32(PLE_WINDOW, vmx->ple_window);
7268         }
7269
7270         /*
7271          * We did this in prepare_switch_to_guest, because it needs to
7272          * be within srcu_read_lock.
7273          */
7274         WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
7275
7276         if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
7277                 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
7278         if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
7279                 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
7280         vcpu->arch.regs_dirty = 0;
7281
7282         /*
7283          * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
7284          * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
7285          * it switches back to the current->mm, which can occur in KVM context
7286          * when switching to a temporary mm to patch kernel code, e.g. if KVM
7287          * toggles a static key while handling a VM-Exit.
7288          */
7289         cr3 = __get_current_cr3_fast();
7290         if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7291                 vmcs_writel(HOST_CR3, cr3);
7292                 vmx->loaded_vmcs->host_state.cr3 = cr3;
7293         }
7294
7295         cr4 = cr4_read_shadow();
7296         if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7297                 vmcs_writel(HOST_CR4, cr4);
7298                 vmx->loaded_vmcs->host_state.cr4 = cr4;
7299         }
7300
7301         /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
7302         if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
7303                 set_debugreg(vcpu->arch.dr6, 6);
7304
7305         /* When single-stepping over STI and MOV SS, we must clear the
7306          * corresponding interruptibility bits in the guest state. Otherwise
7307          * vmentry fails as it then expects bit 14 (BS) in pending debug
7308          * exceptions to be set, but that's not correct for the guest
7309          * debugging case. */
7310         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7311                 vmx_set_interrupt_shadow(vcpu, 0);
7312
7313         kvm_load_guest_xsave_state(vcpu);
7314
7315         pt_guest_enter(vmx);
7316
7317         atomic_switch_perf_msrs(vmx);
7318         if (intel_pmu_lbr_is_enabled(vcpu))
7319                 vmx_passthrough_lbr_msrs(vcpu);
7320
7321         if (enable_preemption_timer)
7322                 vmx_update_hv_timer(vcpu);
7323
7324         kvm_wait_lapic_expire(vcpu);
7325
7326         /* The actual VMENTER/EXIT is in the .noinstr.text section. */
7327         vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
7328
7329         /* All fields are clean at this point */
7330         if (kvm_is_using_evmcs()) {
7331                 current_evmcs->hv_clean_fields |=
7332                         HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
7333
7334                 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
7335         }
7336
7337         /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7338         if (vmx->host_debugctlmsr)
7339                 update_debugctlmsr(vmx->host_debugctlmsr);
7340
7341 #ifndef CONFIG_X86_64
7342         /*
7343          * The sysexit path does not restore ds/es, so we must set them to
7344          * a reasonable value ourselves.
7345          *
7346          * We can't defer this to vmx_prepare_switch_to_host() since that
7347          * function may be executed in interrupt context, which saves and
7348          * restores segments around it, nullifying its effect.
7349          */
7350         loadsegment(ds, __USER_DS);
7351         loadsegment(es, __USER_DS);
7352 #endif
7353
7354         vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
7355
7356         pt_guest_exit(vmx);
7357
7358         kvm_load_host_xsave_state(vcpu);
7359
7360         if (is_guest_mode(vcpu)) {
7361                 /*
7362                  * Track VMLAUNCH/VMRESUME that have made it past guest
7363                  * state checking.
7364                  */
7365                 if (vmx->nested.nested_run_pending &&
7366                     !vmx->exit_reason.failed_vmentry)
7367                         ++vcpu->stat.nested_run;
7368
7369                 vmx->nested.nested_run_pending = 0;
7370         }
7371
7372         vmx->idt_vectoring_info = 0;
7373
7374         if (unlikely(vmx->fail))
7375                 return EXIT_FASTPATH_NONE;
7376
7377         if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
7378                 kvm_machine_check();
7379
7380         if (likely(!vmx->exit_reason.failed_vmentry))
7381                 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
7382
7383         trace_kvm_exit(vcpu, KVM_ISA_VMX);
7384
7385         if (unlikely(vmx->exit_reason.failed_vmentry))
7386                 return EXIT_FASTPATH_NONE;
7387
7388         vmx->loaded_vmcs->launched = 1;
7389
7390         vmx_recover_nmi_blocking(vmx);
7391         vmx_complete_interrupts(vmx);
7392
7393         if (is_guest_mode(vcpu))
7394                 return EXIT_FASTPATH_NONE;
7395
7396         return vmx_exit_handlers_fastpath(vcpu);
7397 }
7398
7399 static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
7400 {
7401         struct vcpu_vmx *vmx = to_vmx(vcpu);
7402
7403         if (enable_pml)
7404                 vmx_destroy_pml_buffer(vmx);
7405         free_vpid(vmx->vpid);
7406         nested_vmx_free_vcpu(vcpu);
7407         free_loaded_vmcs(vmx->loaded_vmcs);
7408 }
7409
7410 static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
7411 {
7412         struct vmx_uret_msr *tsx_ctrl;
7413         struct vcpu_vmx *vmx;
7414         int i, err;
7415
7416         BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
7417         vmx = to_vmx(vcpu);
7418
7419         INIT_LIST_HEAD(&vmx->pi_wakeup_list);
7420
7421         err = -ENOMEM;
7422
7423         vmx->vpid = allocate_vpid();
7424
7425         /*
7426          * If PML is turned on, a failure to enable PML simply fails vCPU
7427          * creation.  This keeps the PML logic simple, as there is no need
7428          * to handle cases such as PML being enabled on only a subset of
7429          * the guest's vCPUs.
7430          */
7431         if (enable_pml) {
7432                 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7433                 if (!vmx->pml_pg)
7434                         goto free_vpid;
7435         }
7436
7437         for (i = 0; i < kvm_nr_uret_msrs; ++i)
7438                 vmx->guest_uret_msrs[i].mask = -1ull;
7439         if (boot_cpu_has(X86_FEATURE_RTM)) {
7440                 /*
7441                  * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
7442                  * Keep the host value unchanged to avoid changing CPUID bits
7443                  * under the host kernel's feet.
7444                  */
7445                 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7446                 if (tsx_ctrl)
7447                         tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7448         }
7449
7450         err = alloc_loaded_vmcs(&vmx->vmcs01);
7451         if (err < 0)
7452                 goto free_pml;
7453
7454         /*
7455          * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7456          * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
7457          * feature only for vmcs01; KVM currently isn't equipped to realize any
7458          * performance benefits from enabling it for vmcs02.
7459          */
7460         if (kvm_is_using_evmcs() &&
7461             (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
7462                 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
7463
7464                 evmcs->hv_enlightenments_control.msr_bitmap = 1;
7465         }
7466
7467         /* The MSR bitmap starts with all ones */
7468         bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7469         bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7470
7471         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
7472 #ifdef CONFIG_X86_64
7473         vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
7474         vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
7475         vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
7476 #endif
7477         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
7478         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
7479         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
7480         if (kvm_cstate_in_guest(vcpu->kvm)) {
7481                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
7482                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
7483                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
7484                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
7485         }
7486
7487         vmx->loaded_vmcs = &vmx->vmcs01;
7488
7489         if (cpu_need_virtualize_apic_accesses(vcpu)) {
7490                 err = kvm_alloc_apic_access_page(vcpu->kvm);
7491                 if (err)
7492                         goto free_vmcs;
7493         }
7494
7495         if (enable_ept && !enable_unrestricted_guest) {
7496                 err = init_rmode_identity_map(vcpu->kvm);
7497                 if (err)
7498                         goto free_vmcs;
7499         }
7500
7501         if (vmx_can_use_ipiv(vcpu))
7502                 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
7503                            __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
7504
7505         return 0;
7506
7507 free_vmcs:
7508         free_loaded_vmcs(vmx->loaded_vmcs);
7509 free_pml:
7510         vmx_destroy_pml_buffer(vmx);
7511 free_vpid:
7512         free_vpid(vmx->vpid);
7513         return err;
7514 }
7515
7516 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7517 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7518
7519 static int vmx_vm_init(struct kvm *kvm)
7520 {
7521         if (!ple_gap)
7522                 kvm->arch.pause_in_guest = true;
7523
7524         if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7525                 switch (l1tf_mitigation) {
7526                 case L1TF_MITIGATION_OFF:
7527                 case L1TF_MITIGATION_FLUSH_NOWARN:
7528                         /* 'I explicitly don't care' is set */
7529                         break;
7530                 case L1TF_MITIGATION_FLUSH:
7531                 case L1TF_MITIGATION_FLUSH_NOSMT:
7532                 case L1TF_MITIGATION_FULL:
7533                         /*
7534                          * Warn upon starting the first VM in a potentially
7535                          * insecure environment.
7536                          */
7537                         if (sched_smt_active())
7538                                 pr_warn_once(L1TF_MSG_SMT);
7539                         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7540                                 pr_warn_once(L1TF_MSG_L1D);
7541                         break;
7542                 case L1TF_MITIGATION_FULL_FORCE:
7543                         /* Flush is enforced */
7544                         break;
7545                 }
7546         }
7547         return 0;
7548 }
7549
7550 static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7551 {
7552         u8 cache;
7553
7554         /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
7555          * memory aliases with conflicting memory types and sometimes MCEs.
7556          * We have to be careful as to what is honored and when.
7557          *
7558          * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
7559          * UC.  The effective memory type is UC or WC depending on guest PAT.
7560          * This was historically the source of MCEs and we want to be
7561          * conservative.
7562          *
7563          * When there is no need to deal with noncoherent DMA (e.g., no VT-d
7564          * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
7565          * EPT memory type is set to WB.  The effective memory type is forced
7566          * WB.
7567          *
7568          * Otherwise, we trust the guest.  Guest CD/MTRR/PAT are all honored.  The
7569          * EPT memory type is used to emulate guest CD/MTRR.
7570          */
7571
7572         if (is_mmio)
7573                 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
7574
7575         if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
7576                 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7577
7578         if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
7579                 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
7580                         cache = MTRR_TYPE_WRBACK;
7581                 else
7582                         cache = MTRR_TYPE_UNCACHABLE;
7583
7584                 return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7585         }
7586
7587         return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
7588 }
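
/*
 * For reference: the value returned by vmx_get_mt_mask() is folded into the
 * leaf EPT entry by the MMU, where the memory type occupies bits 5:3
 * (VMX_EPT_MT_EPTE_SHIFT == 3) and "ignore PAT" is bit 6 (VMX_EPT_IPAT_BIT).
 * E.g. forced write-back is (MTRR_TYPE_WRBACK << 3) | VMX_EPT_IPAT_BIT
 * == 0x70 (illustrative arithmetic).
 */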
7589
7590 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
7591 {
7592         /*
7593          * These bits in the secondary execution controls field
7594          * are dynamic; the others are mostly based on the hypervisor
7595          * architecture and the guest's CPUID.  Do not touch the
7596          * dynamic bits.
7597          */
7598         u32 mask =
7599                 SECONDARY_EXEC_SHADOW_VMCS |
7600                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7601                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7602                 SECONDARY_EXEC_DESC;
7603
7604         u32 cur_ctl = secondary_exec_controls_get(vmx);
7605
7606         secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
7607 }
7608
7609 /*
7610  * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7611  * (indicating "allowed-1") if they are supported in the guest's CPUID.
7612  */
7613 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7614 {
7615         struct vcpu_vmx *vmx = to_vmx(vcpu);
7616         struct kvm_cpuid_entry2 *entry;
7617
7618         vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7619         vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
7620
7621 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {            \
7622         if (entry && (entry->_reg & (_cpuid_mask)))                     \
7623                 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);     \
7624 } while (0)
7625
7626         entry = kvm_find_cpuid_entry(vcpu, 0x1);
7627         cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
7628         cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
7629         cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
7630         cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
7631         cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
7632         cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
7633         cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
7634         cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
7635         cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
7636         cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7637         cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
7638         cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
7639         cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
7640         cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));
7641
7642         entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
7643         cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
7644         cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
7645         cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
7646         cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
7647         cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
7648         cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
7649
7650 #undef cr4_fixed1_update
7651 }
7652
7653 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7654 {
7655         struct vcpu_vmx *vmx = to_vmx(vcpu);
7656         struct kvm_cpuid_entry2 *best = NULL;
7657         int i;
7658
7659         for (i = 0; i < PT_CPUID_LEAVES; i++) {
7660                 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
7661                 if (!best)
7662                         return;
7663                 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7664                 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7665                 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7666                 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7667         }
7668
7669         /* Get the number of configurable Address Ranges for filtering */
7670         vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
7671                                                 PT_CAP_num_address_ranges);
7672
7673         /* Initialize and clear the no-dependency bits */
7674         vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7675                         RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
7676                         RTIT_CTL_BRANCH_EN);
7677
7678         /*
7679          * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set;
7680          * otherwise setting it will inject a #GP.
7681          */
7682         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7683                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7684
7685         /*
7686          * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7687          * PSBFreq can be set
7688          */
7689         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7690                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7691                                 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7692
7693         /*
7694          * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
7695          */
7696         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7697                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7698                                               RTIT_CTL_MTC_RANGE);
7699
7700         /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7701         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7702                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7703                                                         RTIT_CTL_PTW_EN);
7704
7705         /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7706         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7707                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7708
7709         /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7710         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7711                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7712
7713         /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
7714         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7715                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7716
7717         /* Unmask the address-range configuration fields */
7718         for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
7719                 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
7720 }
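
/*
 * For reference: the ADDRn_CFG fields in RTIT_CTL are 4 bits wide and start
 * at bit 32 (ADDR0_CFG in bits 35:32, ADDR1_CFG in bits 39:36, and so on),
 * which is what the final loop above unmasks for each supported address
 * range.
 */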
7721
7722 static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
7723 {
7724         struct vcpu_vmx *vmx = to_vmx(vcpu);
7725
7726         /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
7727         vcpu->arch.xsaves_enabled = false;
7728
7729         vmx_setup_uret_msrs(vmx);
7730
7731         if (cpu_has_secondary_exec_ctrls())
7732                 vmcs_set_secondary_exec_control(vmx,
7733                                                 vmx_secondary_exec_control(vmx));
7734
7735         if (nested_vmx_allowed(vcpu))
7736                 vmx->msr_ia32_feature_control_valid_bits |=
7737                         FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7738                         FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
7739         else
7740                 vmx->msr_ia32_feature_control_valid_bits &=
7741                         ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7742                           FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
7743
7744         if (nested_vmx_allowed(vcpu))
7745                 nested_vmx_cr_fixed1_bits_update(vcpu);
7746
7747         if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7748                         guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7749                 update_intel_pt_cfg(vcpu);
7750
7751         if (boot_cpu_has(X86_FEATURE_RTM)) {
7752                 struct vmx_uret_msr *msr;
7753                 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7754                 if (msr) {
7755                         bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7756                         vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
7757                 }
7758         }
7759
7760         if (kvm_cpu_cap_has(X86_FEATURE_XFD))
7761                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
7762                                           !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
7763
7764
7765         set_cr4_guest_host_mask(vmx);
7766
7767         vmx_write_encls_bitmap(vcpu, NULL);
7768         if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
7769                 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
7770         else
7771                 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
7772
7773         if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
7774                 vmx->msr_ia32_feature_control_valid_bits |=
7775                         FEAT_CTL_SGX_LC_ENABLED;
7776         else
7777                 vmx->msr_ia32_feature_control_valid_bits &=
7778                         ~FEAT_CTL_SGX_LC_ENABLED;
7779
7780         /* Refresh #PF interception to account for MAXPHYADDR changes. */
7781         vmx_update_exception_bitmap(vcpu);
7782 }
7783
7784 static u64 vmx_get_perf_capabilities(void)
7785 {
7786         u64 perf_cap = PMU_CAP_FW_WRITES;
7787         struct x86_pmu_lbr lbr;
7788         u64 host_perf_cap = 0;
7789
7790         if (!enable_pmu)
7791                 return 0;
7792
7793         if (boot_cpu_has(X86_FEATURE_PDCM))
7794                 rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
7795
7796         x86_perf_get_lbr(&lbr);
7797         if (lbr.nr)
7798                 perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
7799
7800         if (vmx_pebs_supported()) {
7801                 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
7802                 if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
7803                         perf_cap &= ~PERF_CAP_PEBS_BASELINE;
7804         }
7805
7806         return perf_cap;
7807 }
7808
7809 static __init void vmx_set_cpu_caps(void)
7810 {
7811         kvm_set_cpu_caps();
7812
7813         /* CPUID 0x1 */
7814         if (nested)
7815                 kvm_cpu_cap_set(X86_FEATURE_VMX);
7816
7817         /* CPUID 0x7 */
7818         if (kvm_mpx_supported())
7819                 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
7820         if (!cpu_has_vmx_invpcid())
7821                 kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
7822         if (vmx_pt_mode_is_host_guest())
7823                 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
7824         if (vmx_pebs_supported()) {
7825                 kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
7826                 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
7827         }
7828
7829         if (!enable_pmu)
7830                 kvm_cpu_cap_clear(X86_FEATURE_PDCM);
7831         kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
7832
7833         if (!enable_sgx) {
7834                 kvm_cpu_cap_clear(X86_FEATURE_SGX);
7835                 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
7836                 kvm_cpu_cap_clear(X86_FEATURE_SGX1);
7837                 kvm_cpu_cap_clear(X86_FEATURE_SGX2);
7838         }
7839
7840         if (vmx_umip_emulated())
7841                 kvm_cpu_cap_set(X86_FEATURE_UMIP);
7842
7843         /* CPUID 0xD.1 */
7844         kvm_caps.supported_xss = 0;
7845         if (!cpu_has_vmx_xsaves())
7846                 kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
7847
7848         /* CPUID 0x80000001 and 0x7 (RDPID) */
7849         if (!cpu_has_vmx_rdtscp()) {
7850                 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
7851                 kvm_cpu_cap_clear(X86_FEATURE_RDPID);
7852         }
7853
7854         if (cpu_has_vmx_waitpkg())
7855                 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
7856 }
7857
7858 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
7859 {
7860         to_vmx(vcpu)->req_immediate_exit = true;
7861 }
7862
7863 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
7864                                   struct x86_instruction_info *info)
7865 {
7866         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7867         unsigned short port;
7868         bool intercept;
7869         int size;
7870
7871         if (info->intercept == x86_intercept_in ||
7872             info->intercept == x86_intercept_ins) {
7873                 port = info->src_val;
7874                 size = info->dst_bytes;
7875         } else {
7876                 port = info->dst_val;
7877                 size = info->src_bytes;
7878         }
7879
7880         /*
7881          * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
7882          * VM-exits depend on the 'unconditional IO exiting' VM-execution
7883          * control.
7884          *
7885          * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
7886          */
7887         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7888                 intercept = nested_cpu_has(vmcs12,
7889                                            CPU_BASED_UNCOND_IO_EXITING);
7890         else
7891                 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
7892
7893         /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
7894         return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
7895 }
7896
7897 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7898                                struct x86_instruction_info *info,
7899                                enum x86_intercept_stage stage,
7900                                struct x86_exception *exception)
7901 {
7902         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7903
7904         switch (info->intercept) {
7905         /*
7906          * RDPID causes #UD if disabled through secondary execution controls.
7907          * Because it is marked as EmulateOnUD, we need to intercept it here.
7908          * Note, RDPID is hidden behind ENABLE_RDTSCP.
7909          */
7910         case x86_intercept_rdpid:
7911                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
7912                         exception->vector = UD_VECTOR;
7913                         exception->error_code_valid = false;
7914                         return X86EMUL_PROPAGATE_FAULT;
7915                 }
7916                 break;
7917
7918         case x86_intercept_in:
7919         case x86_intercept_ins:
7920         case x86_intercept_out:
7921         case x86_intercept_outs:
7922                 return vmx_check_intercept_io(vcpu, info);
7923
7924         case x86_intercept_lgdt:
7925         case x86_intercept_lidt:
7926         case x86_intercept_lldt:
7927         case x86_intercept_ltr:
7928         case x86_intercept_sgdt:
7929         case x86_intercept_sidt:
7930         case x86_intercept_sldt:
7931         case x86_intercept_str:
7932                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
7933                         return X86EMUL_CONTINUE;
7934
7935                 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
7936                 break;
7937
7938         /* TODO: check more intercepts... */
7939         default:
7940                 break;
7941         }
7942
7943         return X86EMUL_UNHANDLEABLE;
7944 }
7945
7946 #ifdef CONFIG_X86_64
7947 /* Compute (a << shift) / divisor; returns 1 on overflow, otherwise 0 */
7948 static inline int u64_shl_div_u64(u64 a, unsigned int shift,
7949                                   u64 divisor, u64 *result)
7950 {
7951         u64 low = a << shift, high = a >> (64 - shift);
7952
7953         /* Avoid a divide overflow (#DE) from divq */
7954         if (high >= divisor)
7955                 return 1;
7956
7957         /* low holds the quotient, high holds the remainder, which is discarded */
7958         asm("divq %2\n\t" : "=a" (low), "=d" (high) :
7959             "rm" (divisor), "0" (low), "1" (high));
7960         *result = low;
7961
7962         return 0;
7963 }
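
/*
 * For reference: the pair high:low above is the 128-bit value (a << shift);
 * divq divides rdx:rax by the divisor, leaving the quotient in rax and the
 * remainder in rdx, and would fault (#DE) if high >= divisor.  Worked
 * example (illustrative values): a = 3, shift = 32, divisor = 2 gives
 * low = 0x300000000 and a result of 0x180000000.
 */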
7964
7965 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
7966                             bool *expired)
7967 {
7968         struct vcpu_vmx *vmx;
7969         u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
7970         struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
7971
7972         vmx = to_vmx(vcpu);
7973         tscl = rdtsc();
7974         guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
7975         delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
7976         lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
7977                                                     ktimer->timer_advance_ns);
7978
7979         if (delta_tsc > lapic_timer_advance_cycles)
7980                 delta_tsc -= lapic_timer_advance_cycles;
7981         else
7982                 delta_tsc = 0;
7983
7984         /* Convert to host delta tsc if tsc scaling is enabled */
7985         if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
7986             delta_tsc && u64_shl_div_u64(delta_tsc,
7987                                 kvm_caps.tsc_scaling_ratio_frac_bits,
7988                                 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
7989                 return -ERANGE;
7990
7991         /*
7992          * If the delta tsc can't fit in 32 bits after the shift by
7993          * cpu_preemption_timer_multi, we can't use the preemption timer.
7994          * It's possible that it would fit on later vmentries, but checking
7995          * on every vmentry is costly, so we just use an hrtimer.
7996          */
7997         if (delta_tsc >> (cpu_preemption_timer_multi + 32))
7998                 return -ERANGE;
7999
8000         vmx->hv_deadline_tsc = tscl + delta_tsc;
8001         *expired = !delta_tsc;
8002         return 0;
8003 }
8004
8005 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8006 {
8007         to_vmx(vcpu)->hv_deadline_tsc = -1;
8008 }
8009 #endif
8010
8011 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
8012 {
8013         if (!kvm_pause_in_guest(vcpu->kvm))
8014                 shrink_ple_window(vcpu);
8015 }
8016
8017 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
8018 {
8019         struct vcpu_vmx *vmx = to_vmx(vcpu);
8020
8021         if (WARN_ON_ONCE(!enable_pml))
8022                 return;
8023
8024         if (is_guest_mode(vcpu)) {
8025                 vmx->nested.update_vmcs01_cpu_dirty_logging = true;
8026                 return;
8027         }
8028
8029         /*
8030          * Note, nr_memslots_dirty_logging can be changed concurrently with this
8031          * code, but in that case another update request will be made and so
8032          * the guest will never run with a stale PML value.
8033          */
8034         if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
8035                 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8036         else
8037                 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8038 }
8039
8040 static void vmx_setup_mce(struct kvm_vcpu *vcpu)
8041 {
8042         if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8043                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
8044                         FEAT_CTL_LMCE_ENABLED;
8045         else
8046                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
8047                         ~FEAT_CTL_LMCE_ENABLED;
8048 }
8049
8050 #ifdef CONFIG_KVM_SMM
8051 static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
8052 {
8053         /* we need a nested vmexit to enter SMM, postpone if run is pending */
8054         if (to_vmx(vcpu)->nested.nested_run_pending)
8055                 return -EBUSY;
8056         return !is_smm(vcpu);
8057 }
8058
8059 static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
8060 {
8061         struct vcpu_vmx *vmx = to_vmx(vcpu);
8062
8063         /*
8064          * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
8065          * SMI and RSM.  Using the common VM-Exit + VM-Enter routines is wrong;
8066          * SMI and RSM only modify state that is saved and restored via SMRAM.
8067          * E.g. most MSRs are left untouched, but many are modified by VM-Exit
8068          * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
8069          */
8070         vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8071         if (vmx->nested.smm.guest_mode)
8072                 nested_vmx_vmexit(vcpu, -1, 0, 0);
8073
8074         vmx->nested.smm.vmxon = vmx->nested.vmxon;
8075         vmx->nested.vmxon = false;
8076         vmx_clear_hlt(vcpu);
8077         return 0;
8078 }
8079
8080 static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
8081 {
8082         struct vcpu_vmx *vmx = to_vmx(vcpu);
8083         int ret;
8084
8085         if (vmx->nested.smm.vmxon) {
8086                 vmx->nested.vmxon = true;
8087                 vmx->nested.smm.vmxon = false;
8088         }
8089
8090         if (vmx->nested.smm.guest_mode) {
8091                 ret = nested_vmx_enter_non_root_mode(vcpu, false);
8092                 if (ret)
8093                         return ret;
8094
8095                 vmx->nested.nested_run_pending = 1;
8096                 vmx->nested.smm.guest_mode = false;
8097         }
8098         return 0;
8099 }
8100
8101 static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
8102 {
8103         /* RSM will cause a vmexit anyway.  */
8104 }
8105 #endif
8106
8107 static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8108 {
8109         return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
8110 }
8111
8112 static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
8113 {
8114         if (is_guest_mode(vcpu)) {
8115                 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
8116
8117                 if (hrtimer_try_to_cancel(timer) == 1)
8118                         hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
8119         }
8120 }
8121
8122 static void vmx_hardware_unsetup(void)
8123 {
8124         kvm_set_posted_intr_wakeup_handler(NULL);
8125
8126         if (nested)
8127                 nested_vmx_hardware_unsetup();
8128
8129         free_kvm_area();
8130 }
8131
8132 #define VMX_REQUIRED_APICV_INHIBITS                     \
8133 (                                                       \
8134         BIT(APICV_INHIBIT_REASON_DISABLE)|              \
8135         BIT(APICV_INHIBIT_REASON_ABSENT) |              \
8136         BIT(APICV_INHIBIT_REASON_HYPERV) |              \
8137         BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |            \
8138         BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
8139         BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |    \
8140         BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED)    \
8141 )
8142
8143 static void vmx_vm_destroy(struct kvm *kvm)
8144 {
8145         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
8146
8147         free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
8148 }
8149
8150 static struct kvm_x86_ops vmx_x86_ops __initdata = {
8151         .name = KBUILD_MODNAME,
8152
8153         .check_processor_compatibility = vmx_check_processor_compat,
8154
8155         .hardware_unsetup = vmx_hardware_unsetup,
8156
8157         .hardware_enable = vmx_hardware_enable,
8158         .hardware_disable = vmx_hardware_disable,
8159         .has_emulated_msr = vmx_has_emulated_msr,
8160
8161         .vm_size = sizeof(struct kvm_vmx),
8162         .vm_init = vmx_vm_init,
8163         .vm_destroy = vmx_vm_destroy,
8164
8165         .vcpu_precreate = vmx_vcpu_precreate,
8166         .vcpu_create = vmx_vcpu_create,
8167         .vcpu_free = vmx_vcpu_free,
8168         .vcpu_reset = vmx_vcpu_reset,
8169
8170         .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
8171         .vcpu_load = vmx_vcpu_load,
8172         .vcpu_put = vmx_vcpu_put,
8173
8174         .update_exception_bitmap = vmx_update_exception_bitmap,
8175         .get_msr_feature = vmx_get_msr_feature,
8176         .get_msr = vmx_get_msr,
8177         .set_msr = vmx_set_msr,
8178         .get_segment_base = vmx_get_segment_base,
8179         .get_segment = vmx_get_segment,
8180         .set_segment = vmx_set_segment,
8181         .get_cpl = vmx_get_cpl,
8182         .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
8183         .set_cr0 = vmx_set_cr0,
8184         .is_valid_cr4 = vmx_is_valid_cr4,
8185         .set_cr4 = vmx_set_cr4,
8186         .set_efer = vmx_set_efer,
8187         .get_idt = vmx_get_idt,
8188         .set_idt = vmx_set_idt,
8189         .get_gdt = vmx_get_gdt,
8190         .set_gdt = vmx_set_gdt,
8191         .set_dr7 = vmx_set_dr7,
8192         .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
8193         .cache_reg = vmx_cache_reg,
8194         .get_rflags = vmx_get_rflags,
8195         .set_rflags = vmx_set_rflags,
8196         .get_if_flag = vmx_get_if_flag,
8197
8198         .flush_tlb_all = vmx_flush_tlb_all,
8199         .flush_tlb_current = vmx_flush_tlb_current,
8200         .flush_tlb_gva = vmx_flush_tlb_gva,
8201         .flush_tlb_guest = vmx_flush_tlb_guest,
8202
8203         .vcpu_pre_run = vmx_vcpu_pre_run,
8204         .vcpu_run = vmx_vcpu_run,
8205         .handle_exit = vmx_handle_exit,
8206         .skip_emulated_instruction = vmx_skip_emulated_instruction,
8207         .update_emulated_instruction = vmx_update_emulated_instruction,
8208         .set_interrupt_shadow = vmx_set_interrupt_shadow,
8209         .get_interrupt_shadow = vmx_get_interrupt_shadow,
8210         .patch_hypercall = vmx_patch_hypercall,
8211         .inject_irq = vmx_inject_irq,
8212         .inject_nmi = vmx_inject_nmi,
8213         .inject_exception = vmx_inject_exception,
8214         .cancel_injection = vmx_cancel_injection,
8215         .interrupt_allowed = vmx_interrupt_allowed,
8216         .nmi_allowed = vmx_nmi_allowed,
8217         .get_nmi_mask = vmx_get_nmi_mask,
8218         .set_nmi_mask = vmx_set_nmi_mask,
8219         .enable_nmi_window = vmx_enable_nmi_window,
8220         .enable_irq_window = vmx_enable_irq_window,
8221         .update_cr8_intercept = vmx_update_cr8_intercept,
8222         .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
8223         .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
8224         .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
8225         .load_eoi_exitmap = vmx_load_eoi_exitmap,
8226         .apicv_post_state_restore = vmx_apicv_post_state_restore,
8227         .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
8228         .hwapic_irr_update = vmx_hwapic_irr_update,
8229         .hwapic_isr_update = vmx_hwapic_isr_update,
8230         .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
8231         .sync_pir_to_irr = vmx_sync_pir_to_irr,
8232         .deliver_interrupt = vmx_deliver_interrupt,
8233         .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
8234
8235         .set_tss_addr = vmx_set_tss_addr,
8236         .set_identity_map_addr = vmx_set_identity_map_addr,
8237         .get_mt_mask = vmx_get_mt_mask,
8238
8239         .get_exit_info = vmx_get_exit_info,
8240
8241         .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
8242
8243         .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
8244
8245         .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
8246         .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
8247         .write_tsc_offset = vmx_write_tsc_offset,
8248         .write_tsc_multiplier = vmx_write_tsc_multiplier,
8249
8250         .load_mmu_pgd = vmx_load_mmu_pgd,
8251
8252         .check_intercept = vmx_check_intercept,
8253         .handle_exit_irqoff = vmx_handle_exit_irqoff,
8254
8255         .request_immediate_exit = vmx_request_immediate_exit,
8256
8257         .sched_in = vmx_sched_in,
8258
8259         .cpu_dirty_log_size = PML_ENTITY_NUM,
8260         .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
8261
8262         .nested_ops = &vmx_nested_ops,
8263
8264         .pi_update_irte = vmx_pi_update_irte,
8265         .pi_start_assignment = vmx_pi_start_assignment,
8266
8267 #ifdef CONFIG_X86_64
8268         .set_hv_timer = vmx_set_hv_timer,
8269         .cancel_hv_timer = vmx_cancel_hv_timer,
8270 #endif
8271
8272         .setup_mce = vmx_setup_mce,
8273
8274 #ifdef CONFIG_KVM_SMM
8275         .smi_allowed = vmx_smi_allowed,
8276         .enter_smm = vmx_enter_smm,
8277         .leave_smm = vmx_leave_smm,
8278         .enable_smi_window = vmx_enable_smi_window,
8279 #endif
8280
8281         .can_emulate_instruction = vmx_can_emulate_instruction,
8282         .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
8283         .migrate_timers = vmx_migrate_timers,
8284
8285         .msr_filter_changed = vmx_msr_filter_changed,
8286         .complete_emulated_msr = kvm_complete_insn_gp,
8287
8288         .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
8289 };
8290
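/*
 * Intel PT ToPA PMI handler: if the PMI arrived while KVM was handling an
 * NMI from the guest, latch the ToPA PMI bit in the guest's
 * PERF_GLOBAL_STATUS and request a PMI injection for that vCPU.  The hook
 * is only wired up, via vmx_init_ops.handle_intel_pt_intr in
 * hardware_setup(), when pt_mode == PT_MODE_HOST_GUEST.
 */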
8291 static unsigned int vmx_handle_intel_pt_intr(void)
8292 {
8293         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
8294
8295         /* '0' on failure so that the !PT case can use a RET0 static call. */
8296         if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
8297                 return 0;
8298
8299         kvm_make_request(KVM_REQ_PMI, vcpu);
8300         __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8301                   (unsigned long *)&vcpu->arch.pmu.global_status);
8302         return 1;
8303 }
8304
8305 static __init void vmx_setup_user_return_msrs(void)
8306 {
8307
8308         /*
8309          * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
8310          * will emulate SYSCALL in legacy mode if the vendor string in guest
8311          * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!".  To
8312          * support this emulation, MSR_STAR is included in the list for i386,
8313          * but is never loaded into hardware.  MSR_CSTAR is also never loaded
8314          * into hardware and is here purely for emulation purposes.
8315          */
8316         const u32 vmx_uret_msrs_list[] = {
8317         #ifdef CONFIG_X86_64
8318                 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
8319         #endif
8320                 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
8321                 MSR_IA32_TSX_CTRL,
8322         };
8323         int i;
8324
8325         BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
8326
8327         for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
8328                 kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
8329 }
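
/*
 * The BUILD_BUG_ON above keeps vmx_uret_msrs_list exactly
 * MAX_NR_USER_RETURN_MSRS entries long, so every list entry gets a
 * user-return slot.  A rough usage sketch (illustrative only) of how a
 * slot registered here can be driven later:
 *
 *	int slot = kvm_find_user_return_msr(MSR_TSC_AUX);
 *
 *	if (slot >= 0)
 *		kvm_set_user_return_msr(slot, data, -1ull);
 */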
8330
8331 static void __init vmx_setup_me_spte_mask(void)
8332 {
8333         u64 me_mask = 0;
8334
8335         /*
8336          * kvm_get_shadow_phys_bits() returns shadow_phys_bits.  Use
8337          * the former to avoid exposing shadow_phys_bits.
8338          *
8339          * On pre-MKTME systems, boot_cpu_data.x86_phys_bits is equal to
8340          * shadow_phys_bits.  On MKTME and/or TDX capable systems,
8341          * boot_cpu_data.x86_phys_bits holds the physical address width
8342          * without the KeyID bits, while shadow_phys_bits is the MAXPHYADDR
8343          * reported by CPUID.  The bits in between are the KeyID bits.
8344          */
8345         if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
8346                 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
8347                         kvm_get_shadow_phys_bits() - 1);
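	/*
	 * Worked example: with boot_cpu_data.x86_phys_bits == 46 and a
	 * CPUID-reported MAXPHYADDR of 52, bits 51:46 are KeyID bits and
	 * me_mask == rsvd_bits(46, 51), i.e. GENMASK_ULL(51, 46).
	 */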
8348         /*
8349          * Unlike SME, the host kernel doesn't support setting up any
8350          * MKTME KeyID on Intel platforms.  No memory encryption
8351          * bits should be included in the SPTE.
8352          */
8353         kvm_mmu_set_me_spte_mask(0, me_mask);
8354 }
8355
8356 static struct kvm_x86_init_ops vmx_init_ops __initdata;
8357
8358 static __init int hardware_setup(void)
8359 {
8360         unsigned long host_bndcfgs;
8361         struct desc_ptr dt;
8362         int r;
8363
8364         store_idt(&dt);
8365         host_idt_base = dt.address;
8366
8367         vmx_setup_user_return_msrs();
8368
8369         if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
8370                 return -EIO;
8371
8372         if (cpu_has_perf_global_ctrl_bug())
8373                 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
8374                              "does not work properly. Using workaround\n");
8375
8376         if (boot_cpu_has(X86_FEATURE_NX))
8377                 kvm_enable_efer_bits(EFER_NX);
8378
8379         if (boot_cpu_has(X86_FEATURE_MPX)) {
8380                 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
8381                 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
8382         }
8383
8384         if (!cpu_has_vmx_mpx())
8385                 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
8386                                              XFEATURE_MASK_BNDCSR);
8387
8388         if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
8389             !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
8390                 enable_vpid = 0;
8391
8392         if (!cpu_has_vmx_ept() ||
8393             !cpu_has_vmx_ept_4levels() ||
8394             !cpu_has_vmx_ept_mt_wb() ||
8395             !cpu_has_vmx_invept_global())
8396                 enable_ept = 0;
8397
8398         /* NX support is required for shadow paging. */
8399         if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
8400                 pr_err_ratelimited("NX (Execute Disable) not supported\n");
8401                 return -EOPNOTSUPP;
8402         }
8403
8404         if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
8405                 enable_ept_ad_bits = 0;
8406
8407         if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
8408                 enable_unrestricted_guest = 0;
8409
8410         if (!cpu_has_vmx_flexpriority())
8411                 flexpriority_enabled = 0;
8412
8413         if (!cpu_has_virtual_nmis())
8414                 enable_vnmi = 0;
8415
8416 #ifdef CONFIG_X86_SGX_KVM
8417         if (!cpu_has_vmx_encls_vmexit())
8418                 enable_sgx = false;
8419 #endif
8420
8421         /*
8422          * set_apic_access_page_addr() is used to reload the APIC access
8423          * page upon invalidation.  There is no need to do anything if the
8424          * APIC_ACCESS_ADDR VMCS field is not being used.
8425          */
8426         if (!flexpriority_enabled)
8427                 vmx_x86_ops.set_apic_access_page_addr = NULL;
8428
8429         if (!cpu_has_vmx_tpr_shadow())
8430                 vmx_x86_ops.update_cr8_intercept = NULL;
8431
8432 #if IS_ENABLED(CONFIG_HYPERV)
8433         if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH &&
8434             enable_ept) {
8435                 vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
8436                 vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
8437         }
8438 #endif
8439
8440         if (!cpu_has_vmx_ple()) {
8441                 ple_gap = 0;
8442                 ple_window = 0;
8443                 ple_window_grow = 0;
8444                 ple_window_max = 0;
8445                 ple_window_shrink = 0;
8446         }
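	/*
	 * With ple_gap cleared, the PAUSE_LOOP_EXITING secondary control is
	 * left disabled when the VMCS is configured, and the window
	 * auto-tuning knobs above are never consulted.
	 */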
8447
8448         if (!cpu_has_vmx_apicv())
8449                 enable_apicv = 0;
8450         if (!enable_apicv)
8451                 vmx_x86_ops.sync_pir_to_irr = NULL;
8452
8453         if (!enable_apicv || !cpu_has_vmx_ipiv())
8454                 enable_ipiv = false;
8455
8456         if (cpu_has_vmx_tsc_scaling())
8457                 kvm_caps.has_tsc_control = true;
8458
8459         kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8460         kvm_caps.tsc_scaling_ratio_frac_bits = 48;
8461         kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
8462         kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
8463
8464         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8465
8466         if (enable_ept)
8467                 kvm_mmu_set_ept_masks(enable_ept_ad_bits,
8468                                       cpu_has_vmx_ept_execute_only());
8469
8470         /*
8471          * Set up shadow_me_value/shadow_me_mask to add the MKTME KeyID
8472          * bits to shadow_zero_check.
8473          */
8474         vmx_setup_me_spte_mask();
8475
8476         kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
8477                           ept_caps_to_lpage_level(vmx_capability.ept));
8478
8479         /*
8480          * Only enable PML when the hardware supports it and both the EPT
8481          * and EPT A/D bit features are enabled -- PML depends on them to work.
8482          */
8483         if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8484                 enable_pml = 0;
8485
8486         if (!enable_pml)
8487                 vmx_x86_ops.cpu_dirty_log_size = 0;
8488
8489         if (!cpu_has_vmx_preemption_timer())
8490                 enable_preemption_timer = false;
8491
8492         if (enable_preemption_timer) {
8493                 u64 use_timer_freq = 5000ULL * 1000 * 1000;
8494
8495                 cpu_preemption_timer_multi =
8496                         vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
8497
8498                 if (tsc_khz)
8499                         use_timer_freq = (u64)tsc_khz * 1000;
8500                 use_timer_freq >>= cpu_preemption_timer_multi;
8501
8502                 /*
8503                  * KVM "disables" the preemption timer by setting it to its max
8504                  * value.  Don't use the timer if it might cause spurious exits
8505                  * at a rate faster than 0.1 Hz (of uninterrupted guest time).
8506                  */
8507                 if (use_timer_freq > 0xffffffffu / 10)
8508                         enable_preemption_timer = false;
8509         }
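	/*
	 * Worked example for the cutoff above: with a 3 GHz TSC and a rate
	 * shift of 5, use_timer_freq = 3e9 >> 5 ≈ 93.75 MHz, well under the
	 * 0xffffffff / 10 ≈ 429 MHz limit, so a saturated 32-bit timer still
	 * takes ~45 seconds to fire, i.e. spurious exits stay below 0.1 Hz.
	 */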
8510
8511         if (!enable_preemption_timer) {
8512                 vmx_x86_ops.set_hv_timer = NULL;
8513                 vmx_x86_ops.cancel_hv_timer = NULL;
8514                 vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
8515         }
8516
8517         kvm_caps.supported_mce_cap |= MCG_LMCE_P;
8518         kvm_caps.supported_mce_cap |= MCG_CMCI_P;
8519
8520         if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8521                 return -EINVAL;
8522         if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
8523                 pt_mode = PT_MODE_SYSTEM;
8524         if (pt_mode == PT_MODE_HOST_GUEST)
8525                 vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
8526         else
8527                 vmx_init_ops.handle_intel_pt_intr = NULL;
8528
8529         setup_default_sgx_lepubkeyhash();
8530
8531         if (nested) {
8532                 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
8533
8534                 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
8535                 if (r)
8536                         return r;
8537         }
8538
8539         vmx_set_cpu_caps();
8540
8541         r = alloc_kvm_area();
8542         if (r && nested)
8543                 nested_vmx_hardware_unsetup();
8544
8545         kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
8546
8547         return r;
8548 }
8549
8550 static struct kvm_x86_init_ops vmx_init_ops __initdata = {
8551         .hardware_setup = hardware_setup,
8552         .handle_intel_pt_intr = NULL,
8553
8554         .runtime_ops = &vmx_x86_ops,
8555         .pmu_ops = &intel_pmu_ops,
8556 };
8557
8558 static void vmx_cleanup_l1d_flush(void)
8559 {
8560         if (vmx_l1d_flush_pages) {
8561                 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
8562                 vmx_l1d_flush_pages = NULL;
8563         }
8564         /* Restore state so sysfs ignores VMX */
8565         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
8566 }
8567
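/*
 * Tear down in the reverse order of vmx_init(): drop the kexec crash
 * VMCLEAR callback (waiting out an RCU grace period for in-flight users)
 * and then release the L1D flush pages set up by vmx_setup_l1d_flush().
 */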
8568 static void __vmx_exit(void)
8569 {
8570         allow_smaller_maxphyaddr = false;
8571
8572 #ifdef CONFIG_KEXEC_CORE
8573         RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
8574         synchronize_rcu();
8575 #endif
8576         vmx_cleanup_l1d_flush();
8577 }
8578
8579 static void vmx_exit(void)
8580 {
8581         kvm_exit();
8582         kvm_x86_vendor_exit();
8583
8584         __vmx_exit();
8585 }
8586 module_exit(vmx_exit);
8587
8588 static int __init vmx_init(void)
8589 {
8590         int r, cpu;
8591
8592         if (!kvm_is_vmx_supported())
8593                 return -EOPNOTSUPP;
8594
8595         /*
8596          * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
8597          * to unwind if a later step fails.
8598          */
8599         hv_init_evmcs();
8600
8601         r = kvm_x86_vendor_init(&vmx_init_ops);
8602         if (r)
8603                 return r;
8604
8605         /*
8606          * Must be called after common x86 init so enable_ept is properly set
8607          * up. Hand in the mitigation parameter value that was stored by the
8608          * pre-module-init parser. If no parameter was given, it will contain
8609          * 'auto', which will be turned into the default 'cond' mitigation
8610          * mode.
8611          */
8612         r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
8613         if (r)
8614                 goto err_l1d_flush;
8615
8616         vmx_setup_fb_clear_ctrl();
8617
8618         for_each_possible_cpu(cpu) {
8619                 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
8620
8621                 pi_init_cpu(cpu);
8622         }
8623
8624 #ifdef CONFIG_KEXEC_CORE
8625         rcu_assign_pointer(crash_vmclear_loaded_vmcss,
8626                            crash_vmclear_local_loaded_vmcss);
8627 #endif
8628         vmx_check_vmcs12_offsets();
8629
8630         /*
8631          * Shadow paging doesn't have a (further) performance penalty
8632          * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR, so enable it
8633          * by default.
8634          */
8635         if (!enable_ept)
8636                 allow_smaller_maxphyaddr = true;
8637
8638         /*
8639          * Common KVM initialization _must_ come last; after this, /dev/kvm is
8640          * exposed to userspace!
8641          */
8642         r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
8643                      THIS_MODULE);
8644         if (r)
8645                 goto err_kvm_init;
8646
8647         return 0;
8648
8649 err_kvm_init:
8650         __vmx_exit();
8651 err_l1d_flush:
8652         kvm_x86_vendor_exit();
8653         return r;
8654 }
8655 module_init(vmx_init);