KVM: SVM: Require nrips support for SEV guests (and beyond)
arch/x86/kvm/svm/svm.c (platform/kernel/linux-starfive.git)
1 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
3 #include <linux/kvm_host.h>
4
5 #include "irq.h"
6 #include "mmu.h"
7 #include "kvm_cache_regs.h"
8 #include "x86.h"
9 #include "smm.h"
10 #include "cpuid.h"
11 #include "pmu.h"
12
13 #include <linux/module.h>
14 #include <linux/mod_devicetable.h>
15 #include <linux/kernel.h>
16 #include <linux/vmalloc.h>
17 #include <linux/highmem.h>
18 #include <linux/amd-iommu.h>
19 #include <linux/sched.h>
20 #include <linux/trace_events.h>
21 #include <linux/slab.h>
22 #include <linux/hashtable.h>
23 #include <linux/objtool.h>
24 #include <linux/psp-sev.h>
25 #include <linux/file.h>
26 #include <linux/pagemap.h>
27 #include <linux/swap.h>
28 #include <linux/rwsem.h>
29 #include <linux/cc_platform.h>
30 #include <linux/smp.h>
31
32 #include <asm/apic.h>
33 #include <asm/perf_event.h>
34 #include <asm/tlbflush.h>
35 #include <asm/desc.h>
36 #include <asm/debugreg.h>
37 #include <asm/kvm_para.h>
38 #include <asm/irq_remapping.h>
39 #include <asm/spec-ctrl.h>
40 #include <asm/cpu_device_id.h>
41 #include <asm/traps.h>
42 #include <asm/fpu/api.h>
43
44 #include <asm/virtext.h>
45
46 #include <trace/events/ipi.h>
47
48 #include "trace.h"
49
50 #include "svm.h"
51 #include "svm_ops.h"
52
53 #include "kvm_onhyperv.h"
54 #include "svm_onhyperv.h"
55
56 MODULE_AUTHOR("Qumranet");
57 MODULE_LICENSE("GPL");
58
59 #ifdef MODULE
60 static const struct x86_cpu_id svm_cpu_id[] = {
61         X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
62         {}
63 };
64 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
65 #endif
66
67 #define SEG_TYPE_LDT 2
68 #define SEG_TYPE_BUSY_TSS16 3
69
70 static bool erratum_383_found __read_mostly;
71
72 u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
73
74 /*
75  * Set osvw_len to higher value when updated Revision Guides
76  * are published and we know what the new status bits are
77  */
78 static uint64_t osvw_len = 4, osvw_status;
79
80 static DEFINE_PER_CPU(u64, current_tsc_ratio);
81
82 #define X2APIC_MSR(x)   (APIC_BASE_MSR + (x >> 4))
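/*
 * For illustration (per the x2APIC layout): each 16-byte xAPIC MMIO register
 * maps to one x2APIC MSR starting at APIC_BASE_MSR (0x800), so e.g.
 * X2APIC_MSR(APIC_TASKPRI) = 0x800 + (0x80 >> 4) = 0x808, the x2APIC TPR MSR.
 */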
83
84 static const struct svm_direct_access_msrs {
85         u32 index;   /* Index of the MSR */
86         bool always; /* True if intercept is initially cleared */
87 } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
88         { .index = MSR_STAR,                            .always = true  },
89         { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
90         { .index = MSR_IA32_SYSENTER_EIP,               .always = false },
91         { .index = MSR_IA32_SYSENTER_ESP,               .always = false },
92 #ifdef CONFIG_X86_64
93         { .index = MSR_GS_BASE,                         .always = true  },
94         { .index = MSR_FS_BASE,                         .always = true  },
95         { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
96         { .index = MSR_LSTAR,                           .always = true  },
97         { .index = MSR_CSTAR,                           .always = true  },
98         { .index = MSR_SYSCALL_MASK,                    .always = true  },
99 #endif
100         { .index = MSR_IA32_SPEC_CTRL,                  .always = false },
101         { .index = MSR_IA32_PRED_CMD,                   .always = false },
102         { .index = MSR_IA32_FLUSH_CMD,                  .always = false },
103         { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
104         { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
105         { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
106         { .index = MSR_IA32_LASTINTTOIP,                .always = false },
107         { .index = MSR_EFER,                            .always = false },
108         { .index = MSR_IA32_CR_PAT,                     .always = false },
109         { .index = MSR_AMD64_SEV_ES_GHCB,               .always = true  },
110         { .index = MSR_TSC_AUX,                         .always = false },
111         { .index = X2APIC_MSR(APIC_ID),                 .always = false },
112         { .index = X2APIC_MSR(APIC_LVR),                .always = false },
113         { .index = X2APIC_MSR(APIC_TASKPRI),            .always = false },
114         { .index = X2APIC_MSR(APIC_ARBPRI),             .always = false },
115         { .index = X2APIC_MSR(APIC_PROCPRI),            .always = false },
116         { .index = X2APIC_MSR(APIC_EOI),                .always = false },
117         { .index = X2APIC_MSR(APIC_RRR),                .always = false },
118         { .index = X2APIC_MSR(APIC_LDR),                .always = false },
119         { .index = X2APIC_MSR(APIC_DFR),                .always = false },
120         { .index = X2APIC_MSR(APIC_SPIV),               .always = false },
121         { .index = X2APIC_MSR(APIC_ISR),                .always = false },
122         { .index = X2APIC_MSR(APIC_TMR),                .always = false },
123         { .index = X2APIC_MSR(APIC_IRR),                .always = false },
124         { .index = X2APIC_MSR(APIC_ESR),                .always = false },
125         { .index = X2APIC_MSR(APIC_ICR),                .always = false },
126         { .index = X2APIC_MSR(APIC_ICR2),               .always = false },
127
128         /*
129          * Note:
130          * AMD does not virtualize APIC TSC-deadline timer mode, but it is
131          * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
132          * the AVIC hardware would generate a #GP fault. Therefore, always
133          * intercept MSR 0x832 and do not set up a direct_access_msrs entry for it.
134          */
135         { .index = X2APIC_MSR(APIC_LVTTHMR),            .always = false },
136         { .index = X2APIC_MSR(APIC_LVTPC),              .always = false },
137         { .index = X2APIC_MSR(APIC_LVT0),               .always = false },
138         { .index = X2APIC_MSR(APIC_LVT1),               .always = false },
139         { .index = X2APIC_MSR(APIC_LVTERR),             .always = false },
140         { .index = X2APIC_MSR(APIC_TMICT),              .always = false },
141         { .index = X2APIC_MSR(APIC_TMCCT),              .always = false },
142         { .index = X2APIC_MSR(APIC_TDCR),               .always = false },
143         { .index = MSR_INVALID,                         .always = false },
144 };
145
146 /*
147  * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
148  * pause_filter_count: On processors that support Pause filtering (indicated
149  *      by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
150  *      count value. On VMRUN this value is loaded into an internal counter.
151  *      Each time a pause instruction is executed, this counter is decremented
152  *      until it reaches zero, at which time a #VMEXIT is generated if pause
153  *      intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
154  *      Intercept Filtering for more details.
155  *      This also indicates whether PLE logic is enabled.
156  *
157  * pause_filter_thresh: In addition, some processor families support advanced
158  *      pause filtering (indicated by CPUID Fn8000_000A_EDX), which bounds the
159  *      amount of time a guest is allowed to execute in a pause loop.
160  *      In this mode, a 16-bit pause filter threshold field is added in the
161  *      VMCB. The threshold value is a cycle count that is used to reset the
162  *      pause counter. As with simple pause filtering, VMRUN loads the pause
163  *      count value from VMCB into an internal counter. Then, on each pause
164  *      instruction the hardware checks the elapsed number of cycles since
165  *      the most recent pause instruction against the pause filter threshold.
166  *      If the elapsed cycle count is greater than the pause filter threshold,
167  *      then the internal pause count is reloaded from the VMCB and execution
168  *      continues. If the elapsed cycle count is less than the pause filter
169  *      threshold, then the internal pause count is decremented. If the count
170  *      value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
171  *      triggered. If advanced pause filtering is supported and pause filter
172  *      threshold field is set to zero, the filter will operate in the simpler,
173  *      count-only mode.
174  */
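/*
 * Rough sketch of how the module parameters below interact (the exact
 * defaults come from the KVM_*_PLE_* constants): pause_filter_count seeds
 * the VMCB's 16-bit PAUSE counter, pause_filter_count_grow and
 * pause_filter_count_shrink scale it on PAUSE exits via
 * grow_ple_window()/shrink_ple_window() further down, and
 * pause_filter_count_max caps the growth so the 16-bit field cannot
 * overflow.
 */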
175
176 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
177 module_param(pause_filter_thresh, ushort, 0444);
178
179 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
180 module_param(pause_filter_count, ushort, 0444);
181
182 /* Default doubles per-vcpu window every exit. */
183 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
184 module_param(pause_filter_count_grow, ushort, 0444);
185
186 /* Default resets per-vcpu window every exit to pause_filter_count. */
187 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
188 module_param(pause_filter_count_shrink, ushort, 0444);
189
190 /* Default is to compute the maximum so we can never overflow. */
191 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
192 module_param(pause_filter_count_max, ushort, 0444);
193
194 /*
195  * Use nested page tables by default.  Note, NPT may get forced off by
196  * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
197  */
198 bool npt_enabled = true;
199 module_param_named(npt, npt_enabled, bool, 0444);
200
201 /* allow nested virtualization in KVM/SVM */
202 static int nested = true;
203 module_param(nested, int, S_IRUGO);
204
205 /* enable/disable Next RIP Save */
206 int nrips = true;
207 module_param(nrips, int, 0444);
208
209 /* enable/disable Virtual VMLOAD VMSAVE */
210 static int vls = true;
211 module_param(vls, int, 0444);
212
213 /* enable/disable Virtual GIF */
214 int vgif = true;
215 module_param(vgif, int, 0444);
216
217 /* enable/disable LBR virtualization */
218 static int lbrv = true;
219 module_param(lbrv, int, 0444);
220
221 static int tsc_scaling = true;
222 module_param(tsc_scaling, int, 0444);
223
224 /*
225  * enable / disable AVIC.  Because the defaults differ for APICv
226  * support between VMX and SVM we cannot use module_param_named.
227  */
228 static bool avic;
229 module_param(avic, bool, 0444);
230
231 bool __read_mostly dump_invalid_vmcb;
232 module_param(dump_invalid_vmcb, bool, 0644);
233
234
235 bool intercept_smi = true;
236 module_param(intercept_smi, bool, 0444);
237
238 bool vnmi = true;
239 module_param(vnmi, bool, 0444);
240
241 static bool svm_gp_erratum_intercept = true;
242
243 static u8 rsm_ins_bytes[] = "\x0f\xaa";
244
245 static unsigned long iopm_base;
246
247 DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
248
249 /*
250  * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
251  * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
252  *
253  * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
254  * defer the restoration of TSC_AUX until the CPU returns to userspace.
255  */
256 static int tsc_aux_uret_slot __read_mostly = -1;
257
258 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
259
260 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
261 #define MSRS_RANGE_SIZE 2048
262 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
263
264 u32 svm_msrpm_offset(u32 msr)
265 {
266         u32 offset;
267         int i;
268
269         for (i = 0; i < NUM_MSR_MAPS; i++) {
270                 if (msr < msrpm_ranges[i] ||
271                     msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
272                         continue;
273
274                 offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
275                 offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
276
277                 /* Now we have the u8 offset - but need the u32 offset */
278                 return offset / 4;
279         }
280
281         /* MSR not in any range */
282         return MSR_INVALID;
283 }
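/*
 * Worked example: each MSR owns 2 bits in the MSRPM (read + write), i.e.
 * 4 MSRs per byte and 16 MSRs per u32.  For MSR_LSTAR (0xc0000082), which
 * falls in the second range, the byte offset is 0x82 / 4 + 2048 = 2080,
 * so svm_msrpm_offset() returns 2080 / 4 = 520, the u32 index covering
 * MSRs 0xc0000080 - 0xc000008f.
 */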
284
285 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
286
287 static int get_npt_level(void)
288 {
289 #ifdef CONFIG_X86_64
290         return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
291 #else
292         return PT32E_ROOT_LEVEL;
293 #endif
294 }
295
296 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
297 {
298         struct vcpu_svm *svm = to_svm(vcpu);
299         u64 old_efer = vcpu->arch.efer;
300         vcpu->arch.efer = efer;
301
302         if (!npt_enabled) {
303                 /* Shadow paging assumes NX to be available.  */
304                 efer |= EFER_NX;
305
306                 if (!(efer & EFER_LMA))
307                         efer &= ~EFER_LME;
308         }
309
310         if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
311                 if (!(efer & EFER_SVME)) {
312                         svm_leave_nested(vcpu);
313                         svm_set_gif(svm, true);
314                         /* #GP intercept is still needed for vmware backdoor */
315                         if (!enable_vmware_backdoor)
316                                 clr_exception_intercept(svm, GP_VECTOR);
317
318                         /*
319                          * Free the nested guest state, unless we are in SMM.
320                          * In this case we will return to the nested guest
321                          * as soon as we leave SMM.
322                          */
323                         if (!is_smm(vcpu))
324                                 svm_free_nested(svm);
325
326                 } else {
327                         int ret = svm_allocate_nested(svm);
328
329                         if (ret) {
330                                 vcpu->arch.efer = old_efer;
331                                 return ret;
332                         }
333
334                         /*
335                          * Never intercept #GP for SEV guests, KVM can't
336                          * decrypt guest memory to workaround the erratum.
337                          */
338                         if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
339                                 set_exception_intercept(svm, GP_VECTOR);
340                 }
341         }
342
343         svm->vmcb->save.efer = efer | EFER_SVME;
344         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
345         return 0;
346 }
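/*
 * Note on the final assignment above: the VMCB's copy of EFER always has
 * SVME set, as hardware is documented to treat EFER.SVME=0 in the VMCB as
 * illegal guest state during VMRUN's consistency checks.  The guest's
 * architectural view of EFER (including its own SVME setting) is tracked
 * separately in vcpu->arch.efer.
 */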
347
348 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
349 {
350         struct vcpu_svm *svm = to_svm(vcpu);
351         u32 ret = 0;
352
353         if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
354                 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
355         return ret;
356 }
357
358 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
359 {
360         struct vcpu_svm *svm = to_svm(vcpu);
361
362         if (mask == 0)
363                 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
364         else
365                 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
366
367 }
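/*
 * SVM exposes only a single interrupt-shadow bit in int_state, so any
 * non-zero mask (STI or MOV-SS blocking) sets that one bit; conversely,
 * svm_get_interrupt_shadow() above reports both KVM shadow flags whenever
 * the bit is set.
 */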
368 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
369                                         void *insn, int insn_len);
370
371 static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
372                                            bool commit_side_effects)
373 {
374         struct vcpu_svm *svm = to_svm(vcpu);
375         unsigned long old_rflags;
376
377         /*
378          * SEV-ES does not expose the next RIP. The RIP update is controlled by
379          * the type of exit and the #VC handler in the guest.
380          */
381         if (sev_es_guest(vcpu->kvm))
382                 goto done;
383
384         if (nrips && svm->vmcb->control.next_rip != 0) {
385                 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
386                 svm->next_rip = svm->vmcb->control.next_rip;
387         }
388
389         if (!svm->next_rip) {
390                 /*
391                  * FIXME: Drop this when kvm_emulate_instruction() does the
392                  * right thing and treats "can't emulate" as outright failure
393                  * for EMULTYPE_SKIP.
394                  */
395                 if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0))
396                         return 0;
397
398                 if (unlikely(!commit_side_effects))
399                         old_rflags = svm->vmcb->save.rflags;
400
401                 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
402                         return 0;
403
404                 if (unlikely(!commit_side_effects))
405                         svm->vmcb->save.rflags = old_rflags;
406         } else {
407                 kvm_rip_write(vcpu, svm->next_rip);
408         }
409
410 done:
411         if (likely(commit_side_effects))
412                 svm_set_interrupt_shadow(vcpu, 0);
413
414         return 1;
415 }
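/*
 * Summary of the skip logic above: with NRIPS, the hardware-provided
 * next_rip is written to RIP directly; without it, the instruction is
 * skipped via the emulator (EMULTYPE_SKIP), with RFLAGS saved and restored
 * when side effects must not be committed.  SEV-ES guests skip the RIP
 * update entirely since KVM cannot read or adjust their encrypted
 * register state.
 */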
416
417 static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
418 {
419         return __svm_skip_emulated_instruction(vcpu, true);
420 }
421
422 static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
423 {
424         unsigned long rip, old_rip = kvm_rip_read(vcpu);
425         struct vcpu_svm *svm = to_svm(vcpu);
426
427         /*
428          * Due to architectural shortcomings, the CPU doesn't always provide
429          * NextRIP, e.g. if KVM intercepted an exception that occurred while
430          * the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
431          * the instruction even if NextRIP is supported to acquire the next
432          * RIP so that it can be shoved into the NextRIP field, otherwise
433          * hardware will fail to advance guest RIP during event injection.
434          * Drop the exception/interrupt if emulation fails and effectively
435          * retry the instruction, it's the least awful option.  If NRIPS is
436          * in use, the skip must not commit any side effects such as clearing
437          * the interrupt shadow or RFLAGS.RF.
438          */
439         if (!__svm_skip_emulated_instruction(vcpu, !nrips))
440                 return -EIO;
441
442         rip = kvm_rip_read(vcpu);
443
444         /*
445          * Save the injection information, even when using next_rip, as the
446          * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
447          * doesn't complete due to a VM-Exit occurring while the CPU is
448          * vectoring the event.   Decoding the instruction isn't guaranteed to
449          * work as there may be no backing instruction, e.g. if the event is
450          * being injected by L1 for L2, or if the guest is patching INT3 into
451          * a different instruction.
452          */
453         svm->soft_int_injected = true;
454         svm->soft_int_csbase = svm->vmcb->save.cs.base;
455         svm->soft_int_old_rip = old_rip;
456         svm->soft_int_next_rip = rip;
457
458         if (nrips)
459                 kvm_rip_write(vcpu, old_rip);
460
461         if (static_cpu_has(X86_FEATURE_NRIPS))
462                 svm->vmcb->control.next_rip = rip;
463
464         return 0;
465 }
466
467 static void svm_inject_exception(struct kvm_vcpu *vcpu)
468 {
469         struct kvm_queued_exception *ex = &vcpu->arch.exception;
470         struct vcpu_svm *svm = to_svm(vcpu);
471
472         kvm_deliver_exception_payload(vcpu, ex);
473
474         if (kvm_exception_is_soft(ex->vector) &&
475             svm_update_soft_interrupt_rip(vcpu))
476                 return;
477
478         svm->vmcb->control.event_inj = ex->vector
479                 | SVM_EVTINJ_VALID
480                 | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
481                 | SVM_EVTINJ_TYPE_EXEPT;
482         svm->vmcb->control.event_inj_err = ex->error_code;
483 }
484
485 static void svm_init_erratum_383(void)
486 {
487         u32 low, high;
488         int err;
489         u64 val;
490
491         if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
492                 return;
493
494         /* Use _safe variants to not break nested virtualization */
495         val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
496         if (err)
497                 return;
498
499         val |= (1ULL << 47);
500
501         low  = lower_32_bits(val);
502         high = upper_32_bits(val);
503
504         native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
505
506         erratum_383_found = true;
507 }
508
509 static void svm_init_osvw(struct kvm_vcpu *vcpu)
510 {
511         /*
512          * Guests should see errata 400 and 415 as fixed (assuming that
513          * HLT and IO instructions are intercepted).
514          */
515         vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
516         vcpu->arch.osvw.status = osvw_status & ~(6ULL);
517
518         /*
519          * By increasing VCPU's osvw.length to 3 we are telling the guest that
520          * all osvw.status bits inside that length, including bit 0 (which is
521          * reserved for erratum 298), are valid. However, if host processor's
522          * osvw_len is 0 then osvw_status[0] carries no information. We need to
523          * be conservative here and therefore we tell the guest that erratum 298
524          * is present (because we really don't know).
525          */
526         if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
527                 vcpu->arch.osvw.status |= 1;
528 }
529
530 static bool kvm_is_svm_supported(void)
531 {
532         int cpu = raw_smp_processor_id();
533         const char *msg;
534         u64 vm_cr;
535
536         if (!cpu_has_svm(&msg)) {
537                 pr_err("SVM not supported by CPU %d, %s\n", cpu, msg);
538                 return false;
539         }
540
541         if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
542                 pr_info("KVM is unsupported when running as an SEV guest\n");
543                 return false;
544         }
545
546         rdmsrl(MSR_VM_CR, vm_cr);
547         if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
548                 pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
549                 return false;
550         }
551
552         return true;
553 }
554
555 static int svm_check_processor_compat(void)
556 {
557         if (!kvm_is_svm_supported())
558                 return -EIO;
559
560         return 0;
561 }
562
563 void __svm_write_tsc_multiplier(u64 multiplier)
564 {
565         preempt_disable();
566
567         if (multiplier == __this_cpu_read(current_tsc_ratio))
568                 goto out;
569
570         wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
571         __this_cpu_write(current_tsc_ratio, multiplier);
572 out:
573         preempt_enable();
574 }
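/*
 * The per-CPU current_tsc_ratio cache above avoids redundant WRMSRs to
 * MSR_AMD64_TSC_RATIO; preemption is disabled so the compare and the
 * write are guaranteed to happen on the same CPU.
 */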
575
576 static void svm_hardware_disable(void)
577 {
578         /* Make sure we clean up behind us */
579         if (tsc_scaling)
580                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
581
582         cpu_svm_disable();
583
584         amd_pmu_disable_virt();
585 }
586
587 static int svm_hardware_enable(void)
588 {
589
590         struct svm_cpu_data *sd;
591         uint64_t efer;
592         int me = raw_smp_processor_id();
593
594         rdmsrl(MSR_EFER, efer);
595         if (efer & EFER_SVME)
596                 return -EBUSY;
597
598         sd = per_cpu_ptr(&svm_data, me);
599         sd->asid_generation = 1;
600         sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
601         sd->next_asid = sd->max_asid + 1;
602         sd->min_asid = max_sev_asid + 1;
603
604         wrmsrl(MSR_EFER, efer | EFER_SVME);
605
606         wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
607
608         if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
609                 /*
610                  * Set the default value, even if we don't use TSC scaling,
611                  * to avoid leaving a stale value in the MSR.
612                  */
613                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
614         }
615
616
617         /*
618          * Get OSVW bits.
619          *
620          * Note that it is possible to have a system with mixed processor
621          * revisions and therefore different OSVW bits. If bits are not the same
622          * on different processors then choose the worst case (i.e. if erratum
623          * is present on one processor and not on another then assume that the
624          * erratum is present everywhere).
625          */
626         if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
627                 uint64_t len, status = 0;
628                 int err;
629
630                 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
631                 if (!err)
632                         status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
633                                                       &err);
634
635                 if (err)
636                         osvw_status = osvw_len = 0;
637                 else {
638                         if (len < osvw_len)
639                                 osvw_len = len;
640                         osvw_status |= status;
641                         osvw_status &= (1ULL << osvw_len) - 1;
642                 }
643         } else
644                 osvw_status = osvw_len = 0;
645
646         svm_init_erratum_383();
647
648         amd_pmu_enable_virt();
649
650         return 0;
651 }
652
653 static void svm_cpu_uninit(int cpu)
654 {
655         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
656
657         if (!sd->save_area)
658                 return;
659
660         kfree(sd->sev_vmcbs);
661         __free_page(sd->save_area);
662         sd->save_area_pa = 0;
663         sd->save_area = NULL;
664 }
665
666 static int svm_cpu_init(int cpu)
667 {
668         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
669         int ret = -ENOMEM;
670
671         memset(sd, 0, sizeof(struct svm_cpu_data));
672         sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
673         if (!sd->save_area)
674                 return ret;
675
676         ret = sev_cpu_init(sd);
677         if (ret)
678                 goto free_save_area;
679
680         sd->save_area_pa = __sme_page_pa(sd->save_area);
681         return 0;
682
683 free_save_area:
684         __free_page(sd->save_area);
685         sd->save_area = NULL;
686         return ret;
687
688 }
689
690 static void set_dr_intercepts(struct vcpu_svm *svm)
691 {
692         struct vmcb *vmcb = svm->vmcb01.ptr;
693
694         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
695         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
696         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
697         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
698         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
699         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
700         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
701         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
702         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
703         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
704         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
705         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
706         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
707         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
708         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
709         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
710
711         recalc_intercepts(svm);
712 }
713
714 static void clr_dr_intercepts(struct vcpu_svm *svm)
715 {
716         struct vmcb *vmcb = svm->vmcb01.ptr;
717
718         vmcb->control.intercepts[INTERCEPT_DR] = 0;
719
720         recalc_intercepts(svm);
721 }
722
723 static int direct_access_msr_slot(u32 msr)
724 {
725         u32 i;
726
727         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
728                 if (direct_access_msrs[i].index == msr)
729                         return i;
730
731         return -ENOENT;
732 }
733
734 static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
735                                      int write)
736 {
737         struct vcpu_svm *svm = to_svm(vcpu);
738         int slot = direct_access_msr_slot(msr);
739
740         if (slot == -ENOENT)
741                 return;
742
743         /* Set the shadow bitmaps to the desired intercept states */
744         if (read)
745                 set_bit(slot, svm->shadow_msr_intercept.read);
746         else
747                 clear_bit(slot, svm->shadow_msr_intercept.read);
748
749         if (write)
750                 set_bit(slot, svm->shadow_msr_intercept.write);
751         else
752                 clear_bit(slot, svm->shadow_msr_intercept.write);
753 }
754
755 static bool valid_msr_intercept(u32 index)
756 {
757         return direct_access_msr_slot(index) != -ENOENT;
758 }
759
760 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
761 {
762         u8 bit_write;
763         unsigned long tmp;
764         u32 offset;
765         u32 *msrpm;
766
767         /*
768          * For non-nested case:
769          * If the L01 MSR bitmap does not intercept the MSR, then we need to
770          * save it.
771          *
772          * For nested case:
773          * If the L02 MSR bitmap does not intercept the MSR, then we need to
774          * save it.
775          */
776         msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
777                                       to_svm(vcpu)->msrpm;
778
779         offset    = svm_msrpm_offset(msr);
780         bit_write = 2 * (msr & 0x0f) + 1;
781         tmp       = msrpm[offset];
782
783         BUG_ON(offset == MSR_INVALID);
784
785         return test_bit(bit_write, &tmp);
786 }
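/*
 * Bit layout reminder: within the u32 returned by svm_msrpm_offset(),
 * each of the 16 covered MSRs gets an even read-intercept bit and an odd
 * write-intercept bit, hence bit_write = 2 * (msr & 0x0f) + 1.
 */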
787
788 static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
789                                         u32 msr, int read, int write)
790 {
791         struct vcpu_svm *svm = to_svm(vcpu);
792         u8 bit_read, bit_write;
793         unsigned long tmp;
794         u32 offset;
795
796         /*
797          * If this warning triggers, extend the direct_access_msrs list at the
798          * beginning of the file.
799          */
800         WARN_ON(!valid_msr_intercept(msr));
801
802         /* Force MSRs disallowed by the filter to be intercepted */
803         if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
804                 read = 0;
805
806         if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
807                 write = 0;
808
809         offset    = svm_msrpm_offset(msr);
810         bit_read  = 2 * (msr & 0x0f);
811         bit_write = 2 * (msr & 0x0f) + 1;
812         tmp       = msrpm[offset];
813
814         BUG_ON(offset == MSR_INVALID);
815
816         read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
817         write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
818
819         msrpm[offset] = tmp;
820
821         svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
822         svm->nested.force_msr_bitmap_recalc = true;
823 }
824
825 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
826                           int read, int write)
827 {
828         set_shadow_msr_intercept(vcpu, msr, read, write);
829         set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
830 }
831
832 u32 *svm_vcpu_alloc_msrpm(void)
833 {
834         unsigned int order = get_order(MSRPM_SIZE);
835         struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
836         u32 *msrpm;
837
838         if (!pages)
839                 return NULL;
840
841         msrpm = page_address(pages);
842         memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
843
844         return msrpm;
845 }
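/*
 * The bitmap starts out as all ones, i.e. every MSR access is intercepted;
 * svm_vcpu_init_msrpm() below then clears the intercept bits for the
 * direct_access_msrs entries marked .always.
 */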
846
847 void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
848 {
849         int i;
850
851         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
852                 if (!direct_access_msrs[i].always)
853                         continue;
854                 set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
855         }
856 }
857
858 void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
859 {
860         int i;
861
862         if (intercept == svm->x2avic_msrs_intercepted)
863                 return;
864
865         if (!x2avic_enabled ||
866             !apic_x2apic_mode(svm->vcpu.arch.apic))
867                 return;
868
869         for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
870                 int index = direct_access_msrs[i].index;
871
872                 if ((index < APIC_BASE_MSR) ||
873                     (index > APIC_BASE_MSR + 0xff))
874                         continue;
875                 set_msr_interception(&svm->vcpu, svm->msrpm, index,
876                                      !intercept, !intercept);
877         }
878
879         svm->x2avic_msrs_intercepted = intercept;
880 }
881
882 void svm_vcpu_free_msrpm(u32 *msrpm)
883 {
884         __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
885 }
886
887 static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
888 {
889         struct vcpu_svm *svm = to_svm(vcpu);
890         u32 i;
891
892         /*
893          * Set intercept permissions for all direct access MSRs again. They
894          * will automatically get filtered through the MSR filter, so we are
895          * back in sync after this.
896          */
897         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
898                 u32 msr = direct_access_msrs[i].index;
899                 u32 read = test_bit(i, svm->shadow_msr_intercept.read);
900                 u32 write = test_bit(i, svm->shadow_msr_intercept.write);
901
902                 set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
903         }
904 }
905
906 static void add_msr_offset(u32 offset)
907 {
908         int i;
909
910         for (i = 0; i < MSRPM_OFFSETS; ++i) {
911
912                 /* Offset already in list? */
913                 if (msrpm_offsets[i] == offset)
914                         return;
915
916                 /* Slot used by another offset? */
917                 if (msrpm_offsets[i] != MSR_INVALID)
918                         continue;
919
920                 /* Add offset to list */
921                 msrpm_offsets[i] = offset;
922
923                 return;
924         }
925
926         /*
927          * If this BUG triggers, the msrpm_offsets table has overflowed. Just
928          * increase MSRPM_OFFSETS in that case.
929          */
930         BUG();
931 }
932
933 static void init_msrpm_offsets(void)
934 {
935         int i;
936
937         memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
938
939         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
940                 u32 offset;
941
942                 offset = svm_msrpm_offset(direct_access_msrs[i].index);
943                 BUG_ON(offset == MSR_INVALID);
944
945                 add_msr_offset(offset);
946         }
947 }
948
949 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
950 {
951         to_vmcb->save.dbgctl            = from_vmcb->save.dbgctl;
952         to_vmcb->save.br_from           = from_vmcb->save.br_from;
953         to_vmcb->save.br_to             = from_vmcb->save.br_to;
954         to_vmcb->save.last_excp_from    = from_vmcb->save.last_excp_from;
955         to_vmcb->save.last_excp_to      = from_vmcb->save.last_excp_to;
956
957         vmcb_mark_dirty(to_vmcb, VMCB_LBR);
958 }
959
960 static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
961 {
962         struct vcpu_svm *svm = to_svm(vcpu);
963
964         svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
965         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
966         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
967         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
968         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
969
970         /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
971         if (is_guest_mode(vcpu))
972                 svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
973 }
974
975 static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
976 {
977         struct vcpu_svm *svm = to_svm(vcpu);
978
979         svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
980         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
981         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
982         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
983         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
984
985         /*
986          * Move the LBR msrs back to the vmcb01 to avoid copying them
987          * on nested guest entries.
988          */
989         if (is_guest_mode(vcpu))
990                 svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
991 }
992
993 static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
994 {
995         /*
996          * If LBR virtualization is disabled, the LBR MSRs are always kept in
997          * vmcb01.  If LBR virtualization is enabled and L1 is running VMs of
998          * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
999          */
1000         return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
1001                                                                    svm->vmcb01.ptr;
1002 }
1003
1004 void svm_update_lbrv(struct kvm_vcpu *vcpu)
1005 {
1006         struct vcpu_svm *svm = to_svm(vcpu);
1007         bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
1008         bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
1009                            (is_guest_mode(vcpu) && svm->lbrv_enabled &&
1010                             (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
1011
1012         if (enable_lbrv == current_enable_lbrv)
1013                 return;
1014
1015         if (enable_lbrv)
1016                 svm_enable_lbrv(vcpu);
1017         else
1018                 svm_disable_lbrv(vcpu);
1019 }
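/*
 * LBR virtualization is enabled when the guest sets DEBUGCTLMSR_LBR in
 * the relevant VMCB, or when L1 has enabled LBR_CTL for its nested guest;
 * the enable/disable helpers above also shuttle the LBR MSRs between
 * vmcb01 and vmcb02 so the guest always observes consistent values.
 */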
1020
1021 void disable_nmi_singlestep(struct vcpu_svm *svm)
1022 {
1023         svm->nmi_singlestep = false;
1024
1025         if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
1026                 /* Clear our flags if they were not set by the guest */
1027                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1028                         svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
1029                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1030                         svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
1031         }
1032 }
1033
1034 static void grow_ple_window(struct kvm_vcpu *vcpu)
1035 {
1036         struct vcpu_svm *svm = to_svm(vcpu);
1037         struct vmcb_control_area *control = &svm->vmcb->control;
1038         int old = control->pause_filter_count;
1039
1040         if (kvm_pause_in_guest(vcpu->kvm))
1041                 return;
1042
1043         control->pause_filter_count = __grow_ple_window(old,
1044                                                         pause_filter_count,
1045                                                         pause_filter_count_grow,
1046                                                         pause_filter_count_max);
1047
1048         if (control->pause_filter_count != old) {
1049                 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1050                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1051                                             control->pause_filter_count, old);
1052         }
1053 }
1054
1055 static void shrink_ple_window(struct kvm_vcpu *vcpu)
1056 {
1057         struct vcpu_svm *svm = to_svm(vcpu);
1058         struct vmcb_control_area *control = &svm->vmcb->control;
1059         int old = control->pause_filter_count;
1060
1061         if (kvm_pause_in_guest(vcpu->kvm))
1062                 return;
1063
1064         control->pause_filter_count =
1065                                 __shrink_ple_window(old,
1066                                                     pause_filter_count,
1067                                                     pause_filter_count_shrink,
1068                                                     pause_filter_count);
1069         if (control->pause_filter_count != old) {
1070                 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1071                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1072                                             control->pause_filter_count, old);
1073         }
1074 }
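/*
 * Both grow_ple_window() and shrink_ple_window() are no-ops when PAUSE
 * exiting is disabled for the VM (kvm_pause_in_guest()), and both mark
 * VMCB_INTERCEPTS dirty because pause_filter_count lives in the intercept
 * portion of the VMCB control area.
 */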
1075
1076 static void svm_hardware_unsetup(void)
1077 {
1078         int cpu;
1079
1080         sev_hardware_unsetup();
1081
1082         for_each_possible_cpu(cpu)
1083                 svm_cpu_uninit(cpu);
1084
1085         __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
1086         get_order(IOPM_SIZE));
1087         iopm_base = 0;
1088 }
1089
1090 static void init_seg(struct vmcb_seg *seg)
1091 {
1092         seg->selector = 0;
1093         seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1094                       SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1095         seg->limit = 0xffff;
1096         seg->base = 0;
1097 }
1098
1099 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1100 {
1101         seg->selector = 0;
1102         seg->attrib = SVM_SELECTOR_P_MASK | type;
1103         seg->limit = 0xffff;
1104         seg->base = 0;
1105 }
1106
1107 static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1108 {
1109         struct vcpu_svm *svm = to_svm(vcpu);
1110
1111         return svm->nested.ctl.tsc_offset;
1112 }
1113
1114 static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1115 {
1116         struct vcpu_svm *svm = to_svm(vcpu);
1117
1118         return svm->tsc_ratio_msr;
1119 }
1120
1121 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1122 {
1123         struct vcpu_svm *svm = to_svm(vcpu);
1124
1125         svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1126         svm->vmcb->control.tsc_offset = offset;
1127         vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1128 }
1129
1130 static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1131 {
1132         __svm_write_tsc_multiplier(multiplier);
1133 }
1134
1135
1136 /* Evaluate instruction intercepts that depend on guest CPUID features. */
1137 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1138                                               struct vcpu_svm *svm)
1139 {
1140         /*
1141          * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1142          * roots, or if INVPCID is disabled in the guest to inject #UD.
1143          */
1144         if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1145                 if (!npt_enabled ||
1146                     !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1147                         svm_set_intercept(svm, INTERCEPT_INVPCID);
1148                 else
1149                         svm_clr_intercept(svm, INTERCEPT_INVPCID);
1150         }
1151
1152         if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1153                 if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1154                         svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1155                 else
1156                         svm_set_intercept(svm, INTERCEPT_RDTSCP);
1157         }
1158 }
1159
1160 static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1161 {
1162         struct vcpu_svm *svm = to_svm(vcpu);
1163
1164         if (guest_cpuid_is_intel(vcpu)) {
1165                 /*
1166                  * We must intercept SYSENTER_EIP and SYSENTER_ESP
1167                  * accesses because the processor only stores 32 bits.
1168                  * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1169                  */
1170                 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1171                 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1172                 svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1173
1174                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1175                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
1176
1177                 svm->v_vmload_vmsave_enabled = false;
1178         } else {
1179                 /*
1180                  * If hardware supports Virtual VMLOAD VMSAVE then enable it
1181                  * in VMCB and clear intercepts to avoid #VMEXIT.
1182                  */
1183                 if (vls) {
1184                         svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1185                         svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1186                         svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1187                 }
1188                 /* No need to intercept these MSRs */
1189                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1190                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1191         }
1192 }
1193
1194 static void init_vmcb(struct kvm_vcpu *vcpu)
1195 {
1196         struct vcpu_svm *svm = to_svm(vcpu);
1197         struct vmcb *vmcb = svm->vmcb01.ptr;
1198         struct vmcb_control_area *control = &vmcb->control;
1199         struct vmcb_save_area *save = &vmcb->save;
1200
1201         svm_set_intercept(svm, INTERCEPT_CR0_READ);
1202         svm_set_intercept(svm, INTERCEPT_CR3_READ);
1203         svm_set_intercept(svm, INTERCEPT_CR4_READ);
1204         svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1205         svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1206         svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
1207         if (!kvm_vcpu_apicv_active(vcpu))
1208                 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
1209
1210         set_dr_intercepts(svm);
1211
1212         set_exception_intercept(svm, PF_VECTOR);
1213         set_exception_intercept(svm, UD_VECTOR);
1214         set_exception_intercept(svm, MC_VECTOR);
1215         set_exception_intercept(svm, AC_VECTOR);
1216         set_exception_intercept(svm, DB_VECTOR);
1217         /*
1218          * Guest access to VMware backdoor ports could legitimately
1219          * trigger #GP because of TSS I/O permission bitmap.
1220          * We intercept those #GP and allow access to them anyway
1221          * as VMware does.
1222          */
1223         if (enable_vmware_backdoor)
1224                 set_exception_intercept(svm, GP_VECTOR);
1225
1226         svm_set_intercept(svm, INTERCEPT_INTR);
1227         svm_set_intercept(svm, INTERCEPT_NMI);
1228
1229         if (intercept_smi)
1230                 svm_set_intercept(svm, INTERCEPT_SMI);
1231
1232         svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1233         svm_set_intercept(svm, INTERCEPT_RDPMC);
1234         svm_set_intercept(svm, INTERCEPT_CPUID);
1235         svm_set_intercept(svm, INTERCEPT_INVD);
1236         svm_set_intercept(svm, INTERCEPT_INVLPG);
1237         svm_set_intercept(svm, INTERCEPT_INVLPGA);
1238         svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1239         svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1240         svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1241         svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1242         svm_set_intercept(svm, INTERCEPT_VMRUN);
1243         svm_set_intercept(svm, INTERCEPT_VMMCALL);
1244         svm_set_intercept(svm, INTERCEPT_VMLOAD);
1245         svm_set_intercept(svm, INTERCEPT_VMSAVE);
1246         svm_set_intercept(svm, INTERCEPT_STGI);
1247         svm_set_intercept(svm, INTERCEPT_CLGI);
1248         svm_set_intercept(svm, INTERCEPT_SKINIT);
1249         svm_set_intercept(svm, INTERCEPT_WBINVD);
1250         svm_set_intercept(svm, INTERCEPT_XSETBV);
1251         svm_set_intercept(svm, INTERCEPT_RDPRU);
1252         svm_set_intercept(svm, INTERCEPT_RSM);
1253
1254         if (!kvm_mwait_in_guest(vcpu->kvm)) {
1255                 svm_set_intercept(svm, INTERCEPT_MONITOR);
1256                 svm_set_intercept(svm, INTERCEPT_MWAIT);
1257         }
1258
1259         if (!kvm_hlt_in_guest(vcpu->kvm))
1260                 svm_set_intercept(svm, INTERCEPT_HLT);
1261
1262         control->iopm_base_pa = __sme_set(iopm_base);
1263         control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1264         control->int_ctl = V_INTR_MASKING_MASK;
1265
1266         init_seg(&save->es);
1267         init_seg(&save->ss);
1268         init_seg(&save->ds);
1269         init_seg(&save->fs);
1270         init_seg(&save->gs);
1271
1272         save->cs.selector = 0xf000;
1273         save->cs.base = 0xffff0000;
1274         /* Executable/Readable Code Segment */
1275         save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1276                 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1277         save->cs.limit = 0xffff;
1278
1279         save->gdtr.base = 0;
1280         save->gdtr.limit = 0xffff;
1281         save->idtr.base = 0;
1282         save->idtr.limit = 0xffff;
1283
1284         init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1285         init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1286
1287         if (npt_enabled) {
1288                 /* Setup VMCB for Nested Paging */
1289                 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1290                 svm_clr_intercept(svm, INTERCEPT_INVLPG);
1291                 clr_exception_intercept(svm, PF_VECTOR);
1292                 svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1293                 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
1294                 save->g_pat = vcpu->arch.pat;
1295                 save->cr3 = 0;
1296         }
1297         svm->current_vmcb->asid_generation = 0;
1298         svm->asid = 0;
1299
1300         svm->nested.vmcb12_gpa = INVALID_GPA;
1301         svm->nested.last_vmcb12_gpa = INVALID_GPA;
1302
1303         if (!kvm_pause_in_guest(vcpu->kvm)) {
1304                 control->pause_filter_count = pause_filter_count;
1305                 if (pause_filter_thresh)
1306                         control->pause_filter_thresh = pause_filter_thresh;
1307                 svm_set_intercept(svm, INTERCEPT_PAUSE);
1308         } else {
1309                 svm_clr_intercept(svm, INTERCEPT_PAUSE);
1310         }
1311
1312         svm_recalc_instruction_intercepts(vcpu, svm);
1313
1314         /*
1315          * If the host supports V_SPEC_CTRL then disable the interception
1316          * of MSR_IA32_SPEC_CTRL.
1317          */
1318         if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1319                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1320
1321         if (kvm_vcpu_apicv_active(vcpu))
1322                 avic_init_vmcb(svm, vmcb);
1323
1324         if (vnmi)
1325                 svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
1326
1327         if (vgif) {
1328                 svm_clr_intercept(svm, INTERCEPT_STGI);
1329                 svm_clr_intercept(svm, INTERCEPT_CLGI);
1330                 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1331         }
1332
1333         if (sev_guest(vcpu->kvm))
1334                 sev_init_vmcb(svm);
1335
1336         svm_hv_init_vmcb(vmcb);
1337         init_vmcb_after_set_cpuid(vcpu);
1338
1339         vmcb_mark_all_dirty(vmcb);
1340
1341         enable_gif(svm);
1342 }
1343
1344 static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1345 {
1346         struct vcpu_svm *svm = to_svm(vcpu);
1347
1348         svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1349
1350         svm_init_osvw(vcpu);
1351         vcpu->arch.microcode_version = 0x01000065;
1352         svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
1353
1354         svm->nmi_masked = false;
1355         svm->awaiting_iret_completion = false;
1356
1357         if (sev_es_guest(vcpu->kvm))
1358                 sev_es_vcpu_reset(svm);
1359 }
1360
1361 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1362 {
1363         struct vcpu_svm *svm = to_svm(vcpu);
1364
1365         svm->spec_ctrl = 0;
1366         svm->virt_spec_ctrl = 0;
1367
1368         init_vmcb(vcpu);
1369
1370         if (!init_event)
1371                 __svm_vcpu_reset(vcpu);
1372 }
1373
1374 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1375 {
1376         svm->current_vmcb = target_vmcb;
1377         svm->vmcb = target_vmcb->ptr;
1378 }
1379
1380 static int svm_vcpu_create(struct kvm_vcpu *vcpu)
1381 {
1382         struct vcpu_svm *svm;
1383         struct page *vmcb01_page;
1384         struct page *vmsa_page = NULL;
1385         int err;
1386
1387         BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1388         svm = to_svm(vcpu);
1389
1390         err = -ENOMEM;
1391         vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1392         if (!vmcb01_page)
1393                 goto out;
1394
1395         if (sev_es_guest(vcpu->kvm)) {
1396                 /*
1397                  * SEV-ES guests require a separate VMSA page used to contain
1398                  * the encrypted register state of the guest.
1399                  */
1400                 vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1401                 if (!vmsa_page)
1402                         goto error_free_vmcb_page;
1403
1404                 /*
1405                  * SEV-ES guests maintain an encrypted version of their FPU
1406                  * state which is restored and saved on VMRUN and VMEXIT.
1407                  * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1408                  * do xsave/xrstor on it.
1409                  */
1410                 fpstate_set_confidential(&vcpu->arch.guest_fpu);
1411         }
1412
1413         err = avic_init_vcpu(svm);
1414         if (err)
1415                 goto error_free_vmsa_page;
1416
1417         svm->msrpm = svm_vcpu_alloc_msrpm();
1418         if (!svm->msrpm) {
1419                 err = -ENOMEM;
1420                 goto error_free_vmsa_page;
1421         }
1422
1423         svm->x2avic_msrs_intercepted = true;
1424
1425         svm->vmcb01.ptr = page_address(vmcb01_page);
1426         svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
1427         svm_switch_vmcb(svm, &svm->vmcb01);
1428
1429         if (vmsa_page)
1430                 svm->sev_es.vmsa = page_address(vmsa_page);
1431
1432         svm->guest_state_loaded = false;
1433
1434         return 0;
1435
1436 error_free_vmsa_page:
1437         if (vmsa_page)
1438                 __free_page(vmsa_page);
1439 error_free_vmcb_page:
1440         __free_page(vmcb01_page);
1441 out:
1442         return err;
1443 }
1444
1445 static void svm_clear_current_vmcb(struct vmcb *vmcb)
1446 {
1447         int i;
1448
1449         for_each_online_cpu(i)
1450                 cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
1451 }
1452
1453 static void svm_vcpu_free(struct kvm_vcpu *vcpu)
1454 {
1455         struct vcpu_svm *svm = to_svm(vcpu);
1456
1457         /*
1458          * The vmcb page can be recycled, causing a false negative in
1459          * svm_vcpu_load(). So, ensure that no logical CPU has this
1460          * vmcb page recorded as its current vmcb.
1461          */
1462         svm_clear_current_vmcb(svm->vmcb);
1463
1464         svm_leave_nested(vcpu);
1465         svm_free_nested(svm);
1466
1467         sev_free_vcpu(vcpu);
1468
1469         __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
1470         __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
1471 }
1472
1473 static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1474 {
1475         struct vcpu_svm *svm = to_svm(vcpu);
1476         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
1477
1478         if (sev_es_guest(vcpu->kvm))
1479                 sev_es_unmap_ghcb(svm);
1480
1481         if (svm->guest_state_loaded)
1482                 return;
1483
1484         /*
1485          * Save additional host state that will be restored on VMEXIT (sev-es)
1486          * or subsequent vmload of host save area.
1487          */
1488         vmsave(sd->save_area_pa);
1489         if (sev_es_guest(vcpu->kvm)) {
1490                 struct sev_es_save_area *hostsa;
1491                 hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
1492
1493                 sev_es_prepare_switch_to_guest(hostsa);
1494         }
1495
1496         if (tsc_scaling)
1497                 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1498
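             /*
              * Load the guest's TSC_AUX via the user-return MSR machinery;
              * the host value is restored when the CPU returns to userspace.
              */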
1499         if (likely(tsc_aux_uret_slot >= 0))
1500                 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
1501
1502         svm->guest_state_loaded = true;
1503 }
1504
1505 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1506 {
1507         to_svm(vcpu)->guest_state_loaded = false;
1508 }
1509
1510 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1511 {
1512         struct vcpu_svm *svm = to_svm(vcpu);
1513         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
1514
1515         if (sd->current_vmcb != svm->vmcb) {
1516                 sd->current_vmcb = svm->vmcb;
1517                 indirect_branch_prediction_barrier();
1518         }
1519         if (kvm_vcpu_apicv_active(vcpu))
1520                 avic_vcpu_load(vcpu, cpu);
1521 }
1522
1523 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1524 {
1525         if (kvm_vcpu_apicv_active(vcpu))
1526                 avic_vcpu_put(vcpu);
1527
1528         svm_prepare_host_switch(vcpu);
1529
1530         ++vcpu->stat.host_state_reload;
1531 }
1532
1533 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1534 {
1535         struct vcpu_svm *svm = to_svm(vcpu);
1536         unsigned long rflags = svm->vmcb->save.rflags;
1537
1538         if (svm->nmi_singlestep) {
1539                 /* Hide our flags if they were not set by the guest */
1540                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1541                         rflags &= ~X86_EFLAGS_TF;
1542                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1543                         rflags &= ~X86_EFLAGS_RF;
1544         }
1545         return rflags;
1546 }
1547
1548 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1549 {
1550         if (to_svm(vcpu)->nmi_singlestep)
1551                 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1552
1553         /*
1554          * Any change of EFLAGS.VM is accompanied by a reload of SS
1555          * (caused by either a task switch or an inter-privilege IRET),
1556          * so we do not need to update the CPL here.
1557          */
1558         to_svm(vcpu)->vmcb->save.rflags = rflags;
1559 }
1560
1561 static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1562 {
1563         struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1564
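             /*
              * For SEV-ES, RFLAGS lives in the encrypted VMSA and cannot be
              * read, so rely on the guest interrupt mask reported by hardware
              * in int_state instead.
              */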
1565         return sev_es_guest(vcpu->kvm)
1566                 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1567                 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1568 }
1569
1570 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1571 {
1572         kvm_register_mark_available(vcpu, reg);
1573
1574         switch (reg) {
1575         case VCPU_EXREG_PDPTR:
1576                 /*
1577                  * When !npt_enabled, mmu->pdptrs[] is already available since
1578                  * it is always updated per SDM when moving to CRs.
1579                  */
1580                 if (npt_enabled)
1581                         load_pdptrs(vcpu, kvm_read_cr3(vcpu));
1582                 break;
1583         default:
1584                 KVM_BUG_ON(1, vcpu->kvm);
1585         }
1586 }
1587
1588 static void svm_set_vintr(struct vcpu_svm *svm)
1589 {
1590         struct vmcb_control_area *control;
1591
1592         /*
1593          * The following fields are ignored when AVIC is enabled
1594          */
1595         WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
1596
1597         svm_set_intercept(svm, INTERCEPT_VINTR);
1598
1599         /*
1600          * Recalculating intercepts may have cleared the VINTR intercept.  If
1601          * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
1602          * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
1603          * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
1604          * interrupts will never be unblocked while L2 is running.
1605          */
1606         if (!svm_is_intercept(svm, INTERCEPT_VINTR))
1607                 return;
1608
1609         /*
1610          * This is just a dummy VINTR to actually cause a vmexit to happen.
1611          * Actual injection of virtual interrupts happens through EVENTINJ.
1612          */
1613         control = &svm->vmcb->control;
1614         control->int_vector = 0x0;
1615         control->int_ctl &= ~V_INTR_PRIO_MASK;
1616         control->int_ctl |= V_IRQ_MASK |
1617                 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1618         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1619 }
1620
1621 static void svm_clear_vintr(struct vcpu_svm *svm)
1622 {
1623         svm_clr_intercept(svm, INTERCEPT_VINTR);
1624
1625         /* Drop int_ctl fields related to VINTR injection.  */
1626         svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1627         if (is_guest_mode(&svm->vcpu)) {
1628                 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1629
1630                 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1631                         (svm->nested.ctl.int_ctl & V_TPR_MASK));
1632
1633                 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1634                         V_IRQ_INJECTION_BITS_MASK;
1635
1636                 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1637         }
1638
1639         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1640 }
1641
1642 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1643 {
1644         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1645         struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1646
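             /*
              * FS, GS, TR and LDTR belong to the VMLOAD/VMSAVE-swapped state
              * that KVM keeps in vmcb01 even while vmcb02 is active, so they
              * are always read from vmcb01.
              */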
1647         switch (seg) {
1648         case VCPU_SREG_CS: return &save->cs;
1649         case VCPU_SREG_DS: return &save->ds;
1650         case VCPU_SREG_ES: return &save->es;
1651         case VCPU_SREG_FS: return &save01->fs;
1652         case VCPU_SREG_GS: return &save01->gs;
1653         case VCPU_SREG_SS: return &save->ss;
1654         case VCPU_SREG_TR: return &save01->tr;
1655         case VCPU_SREG_LDTR: return &save01->ldtr;
1656         }
1657         BUG();
1658         return NULL;
1659 }
1660
1661 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1662 {
1663         struct vmcb_seg *s = svm_seg(vcpu, seg);
1664
1665         return s->base;
1666 }
1667
1668 static void svm_get_segment(struct kvm_vcpu *vcpu,
1669                             struct kvm_segment *var, int seg)
1670 {
1671         struct vmcb_seg *s = svm_seg(vcpu, seg);
1672
1673         var->base = s->base;
1674         var->limit = s->limit;
1675         var->selector = s->selector;
1676         var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1677         var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1678         var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1679         var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1680         var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1681         var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1682         var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1683
1684         /*
1685          * AMD CPUs circa 2014 track the G bit for all segments except CS.
1686          * However, the SVM spec states that the G bit is not observed by the
1687          * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1688          * So let's synthesize a legal G bit for all segments, this helps
1689          * running KVM nested. It also helps cross-vendor migration, because
1690          * Intel's vmentry has a check on the 'G' bit.
1691          */
1692         var->g = s->limit > 0xfffff;
1693
1694         /*
1695          * AMD's VMCB does not have an explicit unusable field, so for cross-vendor
1696          * migration purposes emulate it by treating not-present segments as unusable.
1697          */
1698         var->unusable = !var->present;
1699
1700         switch (seg) {
1701         case VCPU_SREG_TR:
1702                 /*
1703                  * Work around a bug where the busy flag in the tr selector
1704                  * isn't exposed
1705                  */
1706                 var->type |= 0x2;
1707                 break;
1708         case VCPU_SREG_DS:
1709         case VCPU_SREG_ES:
1710         case VCPU_SREG_FS:
1711         case VCPU_SREG_GS:
1712                 /*
1713                  * The accessed bit must always be set in the segment
1714                  * descriptor cache: although it can be cleared in the
1715                  * descriptor itself, the cached bit always remains 1. Since
1716                  * Intel checks this on VM-entry, set it here to support
1717                  * cross-vendor migration.
1718                  */
1719                 if (!var->unusable)
1720                         var->type |= 0x1;
1721                 break;
1722         case VCPU_SREG_SS:
1723                 /*
1724                  * On AMD CPUs sometimes the DB bit in the segment
1725                  * descriptor is left as 1, although the whole segment has
1726                  * been made unusable. Clear it here to pass an Intel VMX
1727                  * entry check when cross vendor migrating.
1728                  */
1729                 if (var->unusable)
1730                         var->db = 0;
1731                 /* This is symmetric with svm_set_segment() */
1732                 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1733                 break;
1734         }
1735 }
1736
1737 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1738 {
1739         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1740
1741         return save->cpl;
1742 }
1743
1744 static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1745 {
1746         struct kvm_segment cs;
1747
1748         svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1749         *db = cs.db;
1750         *l = cs.l;
1751 }
1752
1753 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1754 {
1755         struct vcpu_svm *svm = to_svm(vcpu);
1756
1757         dt->size = svm->vmcb->save.idtr.limit;
1758         dt->address = svm->vmcb->save.idtr.base;
1759 }
1760
1761 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1762 {
1763         struct vcpu_svm *svm = to_svm(vcpu);
1764
1765         svm->vmcb->save.idtr.limit = dt->size;
1766         svm->vmcb->save.idtr.base = dt->address;
1767         vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1768 }
1769
1770 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1771 {
1772         struct vcpu_svm *svm = to_svm(vcpu);
1773
1774         dt->size = svm->vmcb->save.gdtr.limit;
1775         dt->address = svm->vmcb->save.gdtr.base;
1776 }
1777
1778 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1779 {
1780         struct vcpu_svm *svm = to_svm(vcpu);
1781
1782         svm->vmcb->save.gdtr.limit = dt->size;
1783         svm->vmcb->save.gdtr.base = dt->address;
1784         vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1785 }
1786
1787 static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1788 {
1789         struct vcpu_svm *svm = to_svm(vcpu);
1790
1791         /*
1792          * For guests that don't set guest_state_protected, the cr3 update is
1793          * handled via kvm_mmu_load() while entering the guest. For guests
1794          * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1795          * VMCB save area now, since the save area will become the initial
1796          * contents of the VMSA, and future VMCB save area updates won't be
1797          * seen.
1798          */
1799         if (sev_es_guest(vcpu->kvm)) {
1800                 svm->vmcb->save.cr3 = cr3;
1801                 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1802         }
1803 }
1804
1805 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1806 {
1807         struct vcpu_svm *svm = to_svm(vcpu);
1808         u64 hcr0 = cr0;
1809         bool old_paging = is_paging(vcpu);
1810
1811 #ifdef CONFIG_X86_64
1812         if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
1813                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1814                         vcpu->arch.efer |= EFER_LMA;
1815                         svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1816                 }
1817
1818                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1819                         vcpu->arch.efer &= ~EFER_LMA;
1820                         svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1821                 }
1822         }
1823 #endif
1824         vcpu->arch.cr0 = cr0;
1825
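             /*
              * With shadow paging, the CR0 seen by hardware must keep paging
              * and write protection enabled so that KVM's shadow page tables
              * are used and enforced regardless of the guest's CR0.
              */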
1826         if (!npt_enabled) {
1827                 hcr0 |= X86_CR0_PG | X86_CR0_WP;
1828                 if (old_paging != is_paging(vcpu))
1829                         svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1830         }
1831
1832         /*
1833          * Re-enable caching here because the QEMU BIOS
1834          * does not do it - otherwise there is some delay
1835          * at reboot.
1836          */
1837         if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1838                 hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1839
1840         svm->vmcb->save.cr0 = hcr0;
1841         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1842
1843         /*
1844          * SEV-ES guests must always keep the CR intercepts cleared. CR
1845          * tracking is done using the CR write traps.
1846          */
1847         if (sev_es_guest(vcpu->kvm))
1848                 return;
1849
1850         if (hcr0 == cr0) {
1851                 /* Selective CR0 write remains on.  */
1852                 svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1853                 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1854         } else {
1855                 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1856                 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1857         }
1858 }
1859
1860 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1861 {
1862         return true;
1863 }
1864
1865 void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1866 {
1867         unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1868         unsigned long old_cr4 = vcpu->arch.cr4;
1869
1870         if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1871                 svm_flush_tlb_current(vcpu);
1872
1873         vcpu->arch.cr4 = cr4;
1874         if (!npt_enabled) {
1875                 cr4 |= X86_CR4_PAE;
1876
1877                 if (!is_paging(vcpu))
1878                         cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1879         }
1880         cr4 |= host_cr4_mce;
1881         to_svm(vcpu)->vmcb->save.cr4 = cr4;
1882         vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1883
1884         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1885                 kvm_update_cpuid_runtime(vcpu);
1886 }
1887
1888 static void svm_set_segment(struct kvm_vcpu *vcpu,
1889                             struct kvm_segment *var, int seg)
1890 {
1891         struct vcpu_svm *svm = to_svm(vcpu);
1892         struct vmcb_seg *s = svm_seg(vcpu, seg);
1893
1894         s->base = var->base;
1895         s->limit = var->limit;
1896         s->selector = var->selector;
1897         s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1898         s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1899         s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1900         s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1901         s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1902         s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1903         s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1904         s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1905
1906         /*
1907          * This is always accurate, except if SYSRET returned to a segment
1908          * with SS.DPL != 3.  Intel does not have this quirk, and always
1909          * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1910          * would entail passing the CPL to userspace and back.
1911          */
1912         if (seg == VCPU_SREG_SS)
1913                 /* This is symmetric with svm_get_segment() */
1914                 svm->vmcb->save.cpl = (var->dpl & 3);
1915
1916         vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1917 }
1918
1919 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1920 {
1921         struct vcpu_svm *svm = to_svm(vcpu);
1922
1923         clr_exception_intercept(svm, BP_VECTOR);
1924
1925         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1926                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1927                         set_exception_intercept(svm, BP_VECTOR);
1928         }
1929 }
1930
1931 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1932 {
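             /*
              * If this CPU has exhausted its ASIDs, start a new generation,
              * restart from min_asid and flush all ASIDs on the next VMRUN so
              * that recycled ASIDs cannot see stale translations.
              */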
1933         if (sd->next_asid > sd->max_asid) {
1934                 ++sd->asid_generation;
1935                 sd->next_asid = sd->min_asid;
1936                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1937                 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1938         }
1939
1940         svm->current_vmcb->asid_generation = sd->asid_generation;
1941         svm->asid = sd->next_asid++;
1942 }
1943
1944 static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
1945 {
1946         struct vmcb *vmcb = svm->vmcb;
1947
1948         if (svm->vcpu.arch.guest_state_protected)
1949                 return;
1950
1951         if (unlikely(value != vmcb->save.dr6)) {
1952                 vmcb->save.dr6 = value;
1953                 vmcb_mark_dirty(vmcb, VMCB_DR);
1954         }
1955 }
1956
1957 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1958 {
1959         struct vcpu_svm *svm = to_svm(vcpu);
1960
1961         if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
1962                 return;
1963
1964         get_debugreg(vcpu->arch.db[0], 0);
1965         get_debugreg(vcpu->arch.db[1], 1);
1966         get_debugreg(vcpu->arch.db[2], 2);
1967         get_debugreg(vcpu->arch.db[3], 3);
1968         /*
1969          * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
1970          * because db_interception might need it.  We can do it before vmentry.
1971          */
1972         vcpu->arch.dr6 = svm->vmcb->save.dr6;
1973         vcpu->arch.dr7 = svm->vmcb->save.dr7;
1974         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1975         set_dr_intercepts(svm);
1976 }
1977
1978 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1979 {
1980         struct vcpu_svm *svm = to_svm(vcpu);
1981
1982         if (vcpu->arch.guest_state_protected)
1983                 return;
1984
1985         svm->vmcb->save.dr7 = value;
1986         vmcb_mark_dirty(svm->vmcb, VMCB_DR);
1987 }
1988
1989 static int pf_interception(struct kvm_vcpu *vcpu)
1990 {
1991         struct vcpu_svm *svm = to_svm(vcpu);
1992
1993         u64 fault_address = svm->vmcb->control.exit_info_2;
1994         u64 error_code = svm->vmcb->control.exit_info_1;
1995
1996         return kvm_handle_page_fault(vcpu, error_code, fault_address,
1997                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1998                         svm->vmcb->control.insn_bytes : NULL,
1999                         svm->vmcb->control.insn_len);
2000 }
2001
2002 static int npf_interception(struct kvm_vcpu *vcpu)
2003 {
2004         struct vcpu_svm *svm = to_svm(vcpu);
2005
2006         u64 fault_address = svm->vmcb->control.exit_info_2;
2007         u64 error_code = svm->vmcb->control.exit_info_1;
2008
2009         trace_kvm_page_fault(vcpu, fault_address, error_code);
2010         return kvm_mmu_page_fault(vcpu, fault_address, error_code,
2011                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2012                         svm->vmcb->control.insn_bytes : NULL,
2013                         svm->vmcb->control.insn_len);
2014 }
2015
2016 static int db_interception(struct kvm_vcpu *vcpu)
2017 {
2018         struct kvm_run *kvm_run = vcpu->run;
2019         struct vcpu_svm *svm = to_svm(vcpu);
2020
2021         if (!(vcpu->guest_debug &
2022               (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
2023                 !svm->nmi_singlestep) {
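                     /*
                      * The #DB exception payload uses positive polarity for
                      * the architecturally active-low DR6 bits, hence the
                      * XOR with DR6_ACTIVE_LOW.
                      */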
2024                 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
2025                 kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
2026                 return 1;
2027         }
2028
2029         if (svm->nmi_singlestep) {
2030                 disable_nmi_singlestep(svm);
2031                 /* Make sure we check for pending NMIs upon entry */
2032                 kvm_make_request(KVM_REQ_EVENT, vcpu);
2033         }
2034
2035         if (vcpu->guest_debug &
2036             (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
2037                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2038                 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
2039                 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
2040                 kvm_run->debug.arch.pc =
2041                         svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2042                 kvm_run->debug.arch.exception = DB_VECTOR;
2043                 return 0;
2044         }
2045
2046         return 1;
2047 }
2048
2049 static int bp_interception(struct kvm_vcpu *vcpu)
2050 {
2051         struct vcpu_svm *svm = to_svm(vcpu);
2052         struct kvm_run *kvm_run = vcpu->run;
2053
2054         kvm_run->exit_reason = KVM_EXIT_DEBUG;
2055         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2056         kvm_run->debug.arch.exception = BP_VECTOR;
2057         return 0;
2058 }
2059
2060 static int ud_interception(struct kvm_vcpu *vcpu)
2061 {
2062         return handle_ud(vcpu);
2063 }
2064
2065 static int ac_interception(struct kvm_vcpu *vcpu)
2066 {
2067         kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
2068         return 1;
2069 }
2070
2071 static bool is_erratum_383(void)
2072 {
2073         int err, i;
2074         u64 value;
2075
2076         if (!erratum_383_found)
2077                 return false;
2078
2079         value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2080         if (err)
2081                 return false;
2082
2083         /* Bit 62 may or may not be set for this mce */
2084         value &= ~(1ULL << 62);
2085
2086         if (value != 0xb600000000010015ULL)
2087                 return false;
2088
2089         /* Clear MCi_STATUS registers */
2090         for (i = 0; i < 6; ++i)
2091                 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2092
2093         value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2094         if (!err) {
2095                 u32 low, high;
2096
2097                 value &= ~(1ULL << 2);
2098                 low    = lower_32_bits(value);
2099                 high   = upper_32_bits(value);
2100
2101                 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2102         }
2103
2104         /* Flush tlb to evict multi-match entries */
2105         __flush_tlb_all();
2106
2107         return true;
2108 }
2109
2110 static void svm_handle_mce(struct kvm_vcpu *vcpu)
2111 {
2112         if (is_erratum_383()) {
2113                 /*
2114                  * Erratum 383 triggered. Guest state is corrupt so kill the
2115                  * guest.
2116                  */
2117                 pr_err("Guest triggered AMD Erratum 383\n");
2118
2119                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2120
2121                 return;
2122         }
2123
2124         /*
2125          * On an #MC intercept the MCE handler is not called automatically in
2126          * the host. So do it by hand here.
2127          */
2128         kvm_machine_check();
2129 }
2130
2131 static int mc_interception(struct kvm_vcpu *vcpu)
2132 {
2133         return 1;
2134 }
2135
2136 static int shutdown_interception(struct kvm_vcpu *vcpu)
2137 {
2138         struct kvm_run *kvm_run = vcpu->run;
2139         struct vcpu_svm *svm = to_svm(vcpu);
2140
2141         /*
2142          * The VM save area has already been encrypted so it
2143          * cannot be reinitialized - just terminate.
2144          */
2145         if (sev_es_guest(vcpu->kvm))
2146                 return -EINVAL;
2147
2148         /*
2149          * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
2150          * the VMCB in a known good state.  Unfortunately, KVM doesn't have
2151          * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2152          * userspace.  From a platform perspective, INIT is acceptable behavior as
2153          * there exist bare metal platforms that automatically INIT the CPU
2154          * in response to shutdown.
2155          */
2156         clear_page(svm->vmcb);
2157         kvm_vcpu_reset(vcpu, true);
2158
2159         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2160         return 0;
2161 }
2162
2163 static int io_interception(struct kvm_vcpu *vcpu)
2164 {
2165         struct vcpu_svm *svm = to_svm(vcpu);
2166         u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2167         int size, in, string;
2168         unsigned port;
2169
2170         ++vcpu->stat.io_exits;
2171         string = (io_info & SVM_IOIO_STR_MASK) != 0;
2172         in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2173         port = io_info >> 16;
2174         size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2175
2176         if (string) {
2177                 if (sev_es_guest(vcpu->kvm))
2178                         return sev_es_string_io(svm, size, port, in);
2179                 else
2180                         return kvm_emulate_instruction(vcpu, 0);
2181         }
2182
2183         svm->next_rip = svm->vmcb->control.exit_info_2;
2184
2185         return kvm_fast_pio(vcpu, size, port, in);
2186 }
2187
2188 static int nmi_interception(struct kvm_vcpu *vcpu)
2189 {
2190         return 1;
2191 }
2192
2193 static int smi_interception(struct kvm_vcpu *vcpu)
2194 {
2195         return 1;
2196 }
2197
2198 static int intr_interception(struct kvm_vcpu *vcpu)
2199 {
2200         ++vcpu->stat.irq_exits;
2201         return 1;
2202 }
2203
2204 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
2205 {
2206         struct vcpu_svm *svm = to_svm(vcpu);
2207         struct vmcb *vmcb12;
2208         struct kvm_host_map map;
2209         int ret;
2210
2211         if (nested_svm_check_permissions(vcpu))
2212                 return 1;
2213
2214         ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2215         if (ret) {
2216                 if (ret == -EINVAL)
2217                         kvm_inject_gp(vcpu, 0);
2218                 return 1;
2219         }
2220
2221         vmcb12 = map.hva;
2222
2223         ret = kvm_skip_emulated_instruction(vcpu);
2224
2225         if (vmload) {
2226                 svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
2227                 svm->sysenter_eip_hi = 0;
2228                 svm->sysenter_esp_hi = 0;
2229         } else {
2230                 svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
2231         }
2232
2233         kvm_vcpu_unmap(vcpu, &map, true);
2234
2235         return ret;
2236 }
2237
2238 static int vmload_interception(struct kvm_vcpu *vcpu)
2239 {
2240         return vmload_vmsave_interception(vcpu, true);
2241 }
2242
2243 static int vmsave_interception(struct kvm_vcpu *vcpu)
2244 {
2245         return vmload_vmsave_interception(vcpu, false);
2246 }
2247
2248 static int vmrun_interception(struct kvm_vcpu *vcpu)
2249 {
2250         if (nested_svm_check_permissions(vcpu))
2251                 return 1;
2252
2253         return nested_svm_vmrun(vcpu);
2254 }
2255
2256 enum {
2257         NONE_SVM_INSTR,
2258         SVM_INSTR_VMRUN,
2259         SVM_INSTR_VMLOAD,
2260         SVM_INSTR_VMSAVE,
2261 };
2262
2263 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2264 static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2265 {
2266         struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2267
2268         if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2269                 return NONE_SVM_INSTR;
2270
2271         switch (ctxt->modrm) {
2272         case 0xd8: /* VMRUN */
2273                 return SVM_INSTR_VMRUN;
2274         case 0xda: /* VMLOAD */
2275                 return SVM_INSTR_VMLOAD;
2276         case 0xdb: /* VMSAVE */
2277                 return SVM_INSTR_VMSAVE;
2278         default:
2279                 break;
2280         }
2281
2282         return NONE_SVM_INSTR;
2283 }
2284
2285 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2286 {
2287         const int guest_mode_exit_codes[] = {
2288                 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2289                 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2290                 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2291         };
2292         int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
2293                 [SVM_INSTR_VMRUN] = vmrun_interception,
2294                 [SVM_INSTR_VMLOAD] = vmload_interception,
2295                 [SVM_INSTR_VMSAVE] = vmsave_interception,
2296         };
2297         struct vcpu_svm *svm = to_svm(vcpu);
2298         int ret;
2299
2300         if (is_guest_mode(vcpu)) {
2301                 /* Returns '1' or -errno on failure, '0' on success. */
2302                 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2303                 if (ret)
2304                         return ret;
2305                 return 1;
2306         }
2307         return svm_instr_handlers[opcode](vcpu);
2308 }
2309
2310 /*
2311  * #GP handling code. Note that #GP can be triggered under the following two
2312  * cases:
2313  *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2314  *      some AMD CPUs when the EAX of these instructions is in a reserved
2315  *      memory region (e.g. SMM memory on the host).
2316  *   2) VMware backdoor
2317  */
2318 static int gp_interception(struct kvm_vcpu *vcpu)
2319 {
2320         struct vcpu_svm *svm = to_svm(vcpu);
2321         u32 error_code = svm->vmcb->control.exit_info_1;
2322         int opcode;
2323
2324         /* Both #GP cases have zero error_code */
2325         if (error_code)
2326                 goto reinject;
2327
2328         /* Decode the instruction for usage later */
2329         if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2330                 goto reinject;
2331
2332         opcode = svm_instr_opcode(vcpu);
2333
2334         if (opcode == NONE_SVM_INSTR) {
2335                 if (!enable_vmware_backdoor)
2336                         goto reinject;
2337
2338                 /*
2339                  * VMware backdoor emulation on #GP interception only handles
2340                  * IN{S}, OUT{S}, and RDPMC.
2341                  */
2342                 if (!is_guest_mode(vcpu))
2343                         return kvm_emulate_instruction(vcpu,
2344                                 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2345         } else {
2346                 /* All SVM instructions expect page aligned RAX */
2347                 if (svm->vmcb->save.rax & ~PAGE_MASK)
2348                         goto reinject;
2349
2350                 return emulate_svm_instr(vcpu, opcode);
2351         }
2352
2353 reinject:
2354         kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2355         return 1;
2356 }
2357
2358 void svm_set_gif(struct vcpu_svm *svm, bool value)
2359 {
2360         if (value) {
2361                 /*
2362                  * If VGIF is enabled, the STGI intercept is only added to
2363                  * detect the opening of the SMI/NMI window; remove it now.
2364                  * Likewise, clear the VINTR intercept, we will set it
2365                  * again while processing KVM_REQ_EVENT if needed.
2366                  */
2367                 if (vgif)
2368                         svm_clr_intercept(svm, INTERCEPT_STGI);
2369                 if (svm_is_intercept(svm, INTERCEPT_VINTR))
2370                         svm_clear_vintr(svm);
2371
2372                 enable_gif(svm);
2373                 if (svm->vcpu.arch.smi_pending ||
2374                     svm->vcpu.arch.nmi_pending ||
2375                     kvm_cpu_has_injectable_intr(&svm->vcpu) ||
2376                     kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
2377                         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2378         } else {
2379                 disable_gif(svm);
2380
2381                 /*
2382                  * After a CLGI no interrupts should come.  But if vGIF is
2383                  * in use, we still rely on the VINTR intercept (rather than
2384                  * STGI) to detect an open interrupt window.
2385                  */
2386                 if (!vgif)
2387                         svm_clear_vintr(svm);
2388         }
2389 }
2390
2391 static int stgi_interception(struct kvm_vcpu *vcpu)
2392 {
2393         int ret;
2394
2395         if (nested_svm_check_permissions(vcpu))
2396                 return 1;
2397
2398         ret = kvm_skip_emulated_instruction(vcpu);
2399         svm_set_gif(to_svm(vcpu), true);
2400         return ret;
2401 }
2402
2403 static int clgi_interception(struct kvm_vcpu *vcpu)
2404 {
2405         int ret;
2406
2407         if (nested_svm_check_permissions(vcpu))
2408                 return 1;
2409
2410         ret = kvm_skip_emulated_instruction(vcpu);
2411         svm_set_gif(to_svm(vcpu), false);
2412         return ret;
2413 }
2414
2415 static int invlpga_interception(struct kvm_vcpu *vcpu)
2416 {
2417         gva_t gva = kvm_rax_read(vcpu);
2418         u32 asid = kvm_rcx_read(vcpu);
2419
2420         /* FIXME: Handle an address size prefix. */
2421         if (!is_long_mode(vcpu))
2422                 gva = (u32)gva;
2423
2424         trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
2425
2426         /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2427         kvm_mmu_invlpg(vcpu, gva);
2428
2429         return kvm_skip_emulated_instruction(vcpu);
2430 }
2431
2432 static int skinit_interception(struct kvm_vcpu *vcpu)
2433 {
2434         trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
2435
2436         kvm_queue_exception(vcpu, UD_VECTOR);
2437         return 1;
2438 }
2439
2440 static int task_switch_interception(struct kvm_vcpu *vcpu)
2441 {
2442         struct vcpu_svm *svm = to_svm(vcpu);
2443         u16 tss_selector;
2444         int reason;
2445         int int_type = svm->vmcb->control.exit_int_info &
2446                 SVM_EXITINTINFO_TYPE_MASK;
2447         int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2448         uint32_t type =
2449                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2450         uint32_t idt_v =
2451                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2452         bool has_error_code = false;
2453         u32 error_code = 0;
2454
2455         tss_selector = (u16)svm->vmcb->control.exit_info_1;
2456
2457         if (svm->vmcb->control.exit_info_2 &
2458             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2459                 reason = TASK_SWITCH_IRET;
2460         else if (svm->vmcb->control.exit_info_2 &
2461                  (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2462                 reason = TASK_SWITCH_JMP;
2463         else if (idt_v)
2464                 reason = TASK_SWITCH_GATE;
2465         else
2466                 reason = TASK_SWITCH_CALL;
2467
2468         if (reason == TASK_SWITCH_GATE) {
2469                 switch (type) {
2470                 case SVM_EXITINTINFO_TYPE_NMI:
2471                         vcpu->arch.nmi_injected = false;
2472                         break;
2473                 case SVM_EXITINTINFO_TYPE_EXEPT:
2474                         if (svm->vmcb->control.exit_info_2 &
2475                             (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2476                                 has_error_code = true;
2477                                 error_code =
2478                                         (u32)svm->vmcb->control.exit_info_2;
2479                         }
2480                         kvm_clear_exception_queue(vcpu);
2481                         break;
2482                 case SVM_EXITINTINFO_TYPE_INTR:
2483                 case SVM_EXITINTINFO_TYPE_SOFT:
2484                         kvm_clear_interrupt_queue(vcpu);
2485                         break;
2486                 default:
2487                         break;
2488                 }
2489         }
2490
2491         if (reason != TASK_SWITCH_GATE ||
2492             int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2493             (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2494              (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2495                 if (!svm_skip_emulated_instruction(vcpu))
2496                         return 0;
2497         }
2498
2499         if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2500                 int_vec = -1;
2501
2502         return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
2503                                has_error_code, error_code);
2504 }
2505
2506 static void svm_clr_iret_intercept(struct vcpu_svm *svm)
2507 {
2508         if (!sev_es_guest(svm->vcpu.kvm))
2509                 svm_clr_intercept(svm, INTERCEPT_IRET);
2510 }
2511
2512 static void svm_set_iret_intercept(struct vcpu_svm *svm)
2513 {
2514         if (!sev_es_guest(svm->vcpu.kvm))
2515                 svm_set_intercept(svm, INTERCEPT_IRET);
2516 }
2517
2518 static int iret_interception(struct kvm_vcpu *vcpu)
2519 {
2520         struct vcpu_svm *svm = to_svm(vcpu);
2521
2522         WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
2523
2524         ++vcpu->stat.nmi_window_exits;
2525         svm->awaiting_iret_completion = true;
2526
2527         svm_clr_iret_intercept(svm);
2528         svm->nmi_iret_rip = kvm_rip_read(vcpu);
2529
2530         kvm_make_request(KVM_REQ_EVENT, vcpu);
2531         return 1;
2532 }
2533
2534 static int invlpg_interception(struct kvm_vcpu *vcpu)
2535 {
2536         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2537                 return kvm_emulate_instruction(vcpu, 0);
2538
2539         kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2540         return kvm_skip_emulated_instruction(vcpu);
2541 }
2542
2543 static int emulate_on_interception(struct kvm_vcpu *vcpu)
2544 {
2545         return kvm_emulate_instruction(vcpu, 0);
2546 }
2547
2548 static int rsm_interception(struct kvm_vcpu *vcpu)
2549 {
2550         return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2551 }
2552
2553 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2554                                             unsigned long val)
2555 {
2556         struct vcpu_svm *svm = to_svm(vcpu);
2557         unsigned long cr0 = vcpu->arch.cr0;
2558         bool ret = false;
2559
2560         if (!is_guest_mode(vcpu) ||
2561             (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2562                 return false;
2563
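             /*
              * The selective CR0 write intercept only fires when bits other
              * than TS and MP change, so mask those bits out before comparing
              * the old and new values.
              */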
2564         cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2565         val &= ~SVM_CR0_SELECTIVE_MASK;
2566
2567         if (cr0 ^ val) {
2568                 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2569                 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2570         }
2571
2572         return ret;
2573 }
2574
2575 #define CR_VALID (1ULL << 63)
2576
2577 static int cr_interception(struct kvm_vcpu *vcpu)
2578 {
2579         struct vcpu_svm *svm = to_svm(vcpu);
2580         int reg, cr;
2581         unsigned long val;
2582         int err;
2583
2584         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2585                 return emulate_on_interception(vcpu);
2586
2587         if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2588                 return emulate_on_interception(vcpu);
2589
2590         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2591         if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2592                 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2593         else
2594                 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2595
2596         err = 0;
2597         if (cr >= 16) { /* mov to cr */
2598                 cr -= 16;
2599                 val = kvm_register_read(vcpu, reg);
2600                 trace_kvm_cr_write(cr, val);
2601                 switch (cr) {
2602                 case 0:
2603                         if (!check_selective_cr0_intercepted(vcpu, val))
2604                                 err = kvm_set_cr0(vcpu, val);
2605                         else
2606                                 return 1;
2607
2608                         break;
2609                 case 3:
2610                         err = kvm_set_cr3(vcpu, val);
2611                         break;
2612                 case 4:
2613                         err = kvm_set_cr4(vcpu, val);
2614                         break;
2615                 case 8:
2616                         err = kvm_set_cr8(vcpu, val);
2617                         break;
2618                 default:
2619                         WARN(1, "unhandled write to CR%d", cr);
2620                         kvm_queue_exception(vcpu, UD_VECTOR);
2621                         return 1;
2622                 }
2623         } else { /* mov from cr */
2624                 switch (cr) {
2625                 case 0:
2626                         val = kvm_read_cr0(vcpu);
2627                         break;
2628                 case 2:
2629                         val = vcpu->arch.cr2;
2630                         break;
2631                 case 3:
2632                         val = kvm_read_cr3(vcpu);
2633                         break;
2634                 case 4:
2635                         val = kvm_read_cr4(vcpu);
2636                         break;
2637                 case 8:
2638                         val = kvm_get_cr8(vcpu);
2639                         break;
2640                 default:
2641                         WARN(1, "unhandled read from CR%d", cr);
2642                         kvm_queue_exception(vcpu, UD_VECTOR);
2643                         return 1;
2644                 }
2645                 kvm_register_write(vcpu, reg, val);
2646                 trace_kvm_cr_read(cr, val);
2647         }
2648         return kvm_complete_insn_gp(vcpu, err);
2649 }
2650
2651 static int cr_trap(struct kvm_vcpu *vcpu)
2652 {
2653         struct vcpu_svm *svm = to_svm(vcpu);
2654         unsigned long old_value, new_value;
2655         unsigned int cr;
2656         int ret = 0;
2657
2658         new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2659
2660         cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2661         switch (cr) {
2662         case 0:
2663                 old_value = kvm_read_cr0(vcpu);
2664                 svm_set_cr0(vcpu, new_value);
2665
2666                 kvm_post_set_cr0(vcpu, old_value, new_value);
2667                 break;
2668         case 4:
2669                 old_value = kvm_read_cr4(vcpu);
2670                 svm_set_cr4(vcpu, new_value);
2671
2672                 kvm_post_set_cr4(vcpu, old_value, new_value);
2673                 break;
2674         case 8:
2675                 ret = kvm_set_cr8(vcpu, new_value);
2676                 break;
2677         default:
2678                 WARN(1, "unhandled CR%d write trap", cr);
2679                 kvm_queue_exception(vcpu, UD_VECTOR);
2680                 return 1;
2681         }
2682
2683         return kvm_complete_insn_gp(vcpu, ret);
2684 }
2685
2686 static int dr_interception(struct kvm_vcpu *vcpu)
2687 {
2688         struct vcpu_svm *svm = to_svm(vcpu);
2689         int reg, dr;
2690         unsigned long val;
2691         int err = 0;
2692
2693         /*
2694          * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT
2695          * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early.
2696          */
2697         if (sev_es_guest(vcpu->kvm))
2698                 return 1;
2699
2700         if (vcpu->guest_debug == 0) {
2701                 /*
2702                  * No more DR vmexits; force a reload of the debug registers
2703                  * and reenter on this instruction.  The next vmexit will
2704                  * retrieve the full state of the debug registers.
2705                  */
2706                 clr_dr_intercepts(svm);
2707                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2708                 return 1;
2709         }
2710
2711         if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2712                 return emulate_on_interception(vcpu);
2713
2714         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2715         dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2716         if (dr >= 16) { /* mov to DRn  */
2717                 dr -= 16;
2718                 val = kvm_register_read(vcpu, reg);
2719                 err = kvm_set_dr(vcpu, dr, val);
2720         } else {
2721                 kvm_get_dr(vcpu, dr, &val);
2722                 kvm_register_write(vcpu, reg, val);
2723         }
2724
2725         return kvm_complete_insn_gp(vcpu, err);
2726 }
2727
2728 static int cr8_write_interception(struct kvm_vcpu *vcpu)
2729 {
2730         int r;
2731
2732         u8 cr8_prev = kvm_get_cr8(vcpu);
2733         /* instruction emulation calls kvm_set_cr8() */
2734         r = cr_interception(vcpu);
2735         if (lapic_in_kernel(vcpu))
2736                 return r;
2737         if (cr8_prev <= kvm_get_cr8(vcpu))
2738                 return r;
2739         vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2740         return 0;
2741 }
2742
2743 static int efer_trap(struct kvm_vcpu *vcpu)
2744 {
2745         struct msr_data msr_info;
2746         int ret;
2747
2748         /*
2749          * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2750          * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2751          * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2752          * the guest doesn't have X86_FEATURE_SVM.
2753          */
2754         msr_info.host_initiated = false;
2755         msr_info.index = MSR_EFER;
2756         msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2757         ret = kvm_set_msr_common(vcpu, &msr_info);
2758
2759         return kvm_complete_insn_gp(vcpu, ret);
2760 }
2761
2762 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2763 {
2764         msr->data = 0;
2765
2766         switch (msr->index) {
2767         case MSR_AMD64_DE_CFG:
2768                 if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
2769                         msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
2770                 break;
2771         default:
2772                 return KVM_MSR_RET_INVALID;
2773         }
2774
2775         return 0;
2776 }
2777
2778 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2779 {
2780         struct vcpu_svm *svm = to_svm(vcpu);
2781
2782         switch (msr_info->index) {
2783         case MSR_AMD64_TSC_RATIO:
2784                 if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
2785                         return 1;
2786                 msr_info->data = svm->tsc_ratio_msr;
2787                 break;
2788         case MSR_STAR:
2789                 msr_info->data = svm->vmcb01.ptr->save.star;
2790                 break;
2791 #ifdef CONFIG_X86_64
2792         case MSR_LSTAR:
2793                 msr_info->data = svm->vmcb01.ptr->save.lstar;
2794                 break;
2795         case MSR_CSTAR:
2796                 msr_info->data = svm->vmcb01.ptr->save.cstar;
2797                 break;
2798         case MSR_KERNEL_GS_BASE:
2799                 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
2800                 break;
2801         case MSR_SYSCALL_MASK:
2802                 msr_info->data = svm->vmcb01.ptr->save.sfmask;
2803                 break;
2804 #endif
2805         case MSR_IA32_SYSENTER_CS:
2806                 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
2807                 break;
2808         case MSR_IA32_SYSENTER_EIP:
2809                 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2810                 if (guest_cpuid_is_intel(vcpu))
2811                         msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
2812                 break;
2813         case MSR_IA32_SYSENTER_ESP:
2814                 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2815                 if (guest_cpuid_is_intel(vcpu))
2816                         msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
2817                 break;
2818         case MSR_TSC_AUX:
2819                 msr_info->data = svm->tsc_aux;
2820                 break;
2821         case MSR_IA32_DEBUGCTLMSR:
2822                 msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
2823                 break;
2824         case MSR_IA32_LASTBRANCHFROMIP:
2825                 msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
2826                 break;
2827         case MSR_IA32_LASTBRANCHTOIP:
2828                 msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
2829                 break;
2830         case MSR_IA32_LASTINTFROMIP:
2831                 msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
2832                 break;
2833         case MSR_IA32_LASTINTTOIP:
2834                 msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
2835                 break;
2836         case MSR_VM_HSAVE_PA:
2837                 msr_info->data = svm->nested.hsave_msr;
2838                 break;
2839         case MSR_VM_CR:
2840                 msr_info->data = svm->nested.vm_cr_msr;
2841                 break;
2842         case MSR_IA32_SPEC_CTRL:
2843                 if (!msr_info->host_initiated &&
2844                     !guest_has_spec_ctrl_msr(vcpu))
2845                         return 1;
2846
2847                 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2848                         msr_info->data = svm->vmcb->save.spec_ctrl;
2849                 else
2850                         msr_info->data = svm->spec_ctrl;
2851                 break;
2852         case MSR_AMD64_VIRT_SPEC_CTRL:
2853                 if (!msr_info->host_initiated &&
2854                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2855                         return 1;
2856
2857                 msr_info->data = svm->virt_spec_ctrl;
2858                 break;
2859         case MSR_F15H_IC_CFG: {
2860
2861                 int family, model;
2862
2863                 family = guest_cpuid_family(vcpu);
2864                 model  = guest_cpuid_model(vcpu);
2865
2866                 if (family < 0 || model < 0)
2867                         return kvm_get_msr_common(vcpu, msr_info);
2868
2869                 msr_info->data = 0;
2870
2871                 if (family == 0x15 &&
2872                     (model >= 0x2 && model < 0x20))
2873                         msr_info->data = 0x1E;
2874                 }
2875                 break;
2876         case MSR_AMD64_DE_CFG:
2877                 msr_info->data = svm->msr_decfg;
2878                 break;
2879         default:
2880                 return kvm_get_msr_common(vcpu, msr_info);
2881         }
2882         return 0;
2883 }
2884
2885 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2886 {
2887         struct vcpu_svm *svm = to_svm(vcpu);
2888         if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2889                 return kvm_complete_insn_gp(vcpu, err);
2890
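             /*
              * Report the failure to the SEV-ES guest through the GHCB:
              * sw_exit_info_1 = 1 flags an error and sw_exit_info_2 carries
              * a #GP event for the guest's #VC handler to inject.
              */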
2891         ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2892         ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
2893                                 X86_TRAP_GP |
2894                                 SVM_EVTINJ_TYPE_EXEPT |
2895                                 SVM_EVTINJ_VALID);
2896         return 1;
2897 }
2898
2899 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2900 {
2901         struct vcpu_svm *svm = to_svm(vcpu);
2902         int svm_dis, chg_mask;
2903
2904         if (data & ~SVM_VM_CR_VALID_MASK)
2905                 return 1;
2906
2907         chg_mask = SVM_VM_CR_VALID_MASK;
2908
2909         if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2910                 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2911
2912         svm->nested.vm_cr_msr &= ~chg_mask;
2913         svm->nested.vm_cr_msr |= (data & chg_mask);
2914
2915         svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2916
2917         /* check for svm_disable while efer.svme is set */
2918         if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2919                 return 1;
2920
2921         return 0;
2922 }
2923
2924 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2925 {
2926         struct vcpu_svm *svm = to_svm(vcpu);
2927         int ret = 0;
2928
2929         u32 ecx = msr->index;
2930         u64 data = msr->data;
2931         switch (ecx) {
2932         case MSR_AMD64_TSC_RATIO:
2933
2934                 if (!svm->tsc_scaling_enabled) {
2935
2936                         if (!msr->host_initiated)
2937                                 return 1;
2938                         /*
2939                          * In case TSC scaling is not enabled, always
2940                          * leave this MSR at the default value.
2941                          *
2942                          * Due to a bug in QEMU 6.2.0, it tries to set
2943                          * this MSR to 0 even when TSC scaling is not
2944                          * enabled.  Ignore that value as well.
2945                          */
2946                         if (data != 0 && data != svm->tsc_ratio_msr)
2947                                 return 1;
2948                         break;
2949                 }
2950
2951                 if (data & SVM_TSC_RATIO_RSVD)
2952                         return 1;
2953
2954                 svm->tsc_ratio_msr = data;
2955
2956                 if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
2957                         nested_svm_update_tsc_ratio_msr(vcpu);
2958
2959                 break;
2960         case MSR_IA32_CR_PAT:
2961                 ret = kvm_set_msr_common(vcpu, msr);
2962                 if (ret)
2963                         break;
2964
2965                 svm->vmcb01.ptr->save.g_pat = data;
2966                 if (is_guest_mode(vcpu))
2967                         nested_vmcb02_compute_g_pat(svm);
2968                 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
2969                 break;
2970         case MSR_IA32_SPEC_CTRL:
2971                 if (!msr->host_initiated &&
2972                     !guest_has_spec_ctrl_msr(vcpu))
2973                         return 1;
2974
2975                 if (kvm_spec_ctrl_test_value(data))
2976                         return 1;
2977
2978                 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2979                         svm->vmcb->save.spec_ctrl = data;
2980                 else
2981                         svm->spec_ctrl = data;
2982                 if (!data)
2983                         break;
2984
2985                 /*
2986                  * For non-nested:
2987                  * When it's written (to non-zero) for the first time, pass
2988                  * it through.
2989                  *
2990                  * For nested:
2991                  * The handling of the MSR bitmap for L2 guests is done in
2992                  * nested_svm_vmrun_msrpm.
2993                  * We update the L1 MSR bit as well since it will end up
2994                  * touching the MSR anyway now.
2995                  */
2996                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
2997                 break;
2998         case MSR_AMD64_VIRT_SPEC_CTRL:
2999                 if (!msr->host_initiated &&
3000                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
3001                         return 1;
3002
3003                 if (data & ~SPEC_CTRL_SSBD)
3004                         return 1;
3005
3006                 svm->virt_spec_ctrl = data;
3007                 break;
3008         case MSR_STAR:
3009                 svm->vmcb01.ptr->save.star = data;
3010                 break;
3011 #ifdef CONFIG_X86_64
3012         case MSR_LSTAR:
3013                 svm->vmcb01.ptr->save.lstar = data;
3014                 break;
3015         case MSR_CSTAR:
3016                 svm->vmcb01.ptr->save.cstar = data;
3017                 break;
3018         case MSR_KERNEL_GS_BASE:
3019                 svm->vmcb01.ptr->save.kernel_gs_base = data;
3020                 break;
3021         case MSR_SYSCALL_MASK:
3022                 svm->vmcb01.ptr->save.sfmask = data;
3023                 break;
3024 #endif
3025         case MSR_IA32_SYSENTER_CS:
3026                 svm->vmcb01.ptr->save.sysenter_cs = data;
3027                 break;
3028         case MSR_IA32_SYSENTER_EIP:
3029                 svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
3030                 /*
3031                  * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} MSRs
3032                  * when we spoof an Intel vendor ID (for cross-vendor migration).
3033                  * In that case we use this intercept to track the high 32-bit
3034                  * halves of these MSRs in order to support Intel's
3035                  * implementation of SYSENTER/SYSEXIT.
3036                  */
3037                 svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
3038                 break;
3039         case MSR_IA32_SYSENTER_ESP:
3040                 svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
3041                 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
3042                 break;
3043         case MSR_TSC_AUX:
3044                 /*
3045                  * TSC_AUX is usually changed only during boot and never read
3046                  * directly.  Intercept TSC_AUX instead of exposing it to the
3047                  * guest via direct_access_msrs, and switch it via user return.
3048                  */
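                /*
                 * kvm_set_user_return_msr() updates per-CPU state, so
                 * preemption must be disabled while the slot is written.
                 */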
3049                 preempt_disable();
3050                 ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
3051                 preempt_enable();
3052                 if (ret)
3053                         break;
3054
3055                 svm->tsc_aux = data;
3056                 break;
3057         case MSR_IA32_DEBUGCTLMSR:
3058                 if (!lbrv) {
3059                         kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3060                         break;
3061                 }
3062                 if (data & DEBUGCTL_RESERVED_BITS)
3063                         return 1;
3064
3065                 svm_get_lbr_vmcb(svm)->save.dbgctl = data;
3066                 svm_update_lbrv(vcpu);
3067                 break;
3068         case MSR_VM_HSAVE_PA:
3069                 /*
3070                  * Old kernels did not validate the value written to
3071                  * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
3072                  * value to allow live migrating buggy or malicious guests
3073                  * originating from those kernels.
3074                  */
3075                 if (!msr->host_initiated && !page_address_valid(vcpu, data))
3076                         return 1;
3077
3078                 svm->nested.hsave_msr = data & PAGE_MASK;
3079                 break;
3080         case MSR_VM_CR:
3081                 return svm_set_vm_cr(vcpu, data);
3082         case MSR_VM_IGNNE:
3083                 kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3084                 break;
3085         case MSR_AMD64_DE_CFG: {
3086                 struct kvm_msr_entry msr_entry;
3087
3088                 msr_entry.index = msr->index;
3089                 if (svm_get_msr_feature(&msr_entry))
3090                         return 1;
3091
3092                 /* Check the supported bits */
3093                 if (data & ~msr_entry.data)
3094                         return 1;
3095
3096                 /* Don't allow the guest to change any bits; a mismatched write gets a #GP */
3097                 if (!msr->host_initiated && (data ^ msr_entry.data))
3098                         return 1;
3099
3100                 svm->msr_decfg = data;
3101                 break;
3102         }
3103         default:
3104                 return kvm_set_msr_common(vcpu, msr);
3105         }
3106         return ret;
3107 }
3108
3109 static int msr_interception(struct kvm_vcpu *vcpu)
3110 {
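        /*
         * For an MSR intercept, EXITINFO1 is 1 for WRMSR and 0 for RDMSR
         * (per the APM), so use it to select the emulation path.
         */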
3111         if (to_svm(vcpu)->vmcb->control.exit_info_1)
3112                 return kvm_emulate_wrmsr(vcpu);
3113         else
3114                 return kvm_emulate_rdmsr(vcpu);
3115 }
3116
3117 static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3118 {
3119         kvm_make_request(KVM_REQ_EVENT, vcpu);
3120         svm_clear_vintr(to_svm(vcpu));
3121
3122         /*
3123          * If not running nested, the only reason to end up here with AVIC
3124          * enabled is an ExtINT: AVIC was temporarily inhibited in order to
3125          * request the IRQ window and has to be re-enabled.
3126          *
3127          * If running nested, still remove the VM wide AVIC inhibit to support
3128          * the case in which the interrupt window was requested while the vCPU
3129          * was not running nested.
3130          *
3131          * vCPUs that are still running nested keep their AVIC inhibited via
3132          * the per-vCPU AVIC inhibition.
3133          */
3134         kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3135
3136         ++vcpu->stat.irq_window_exits;
3137         return 1;
3138 }
3139
3140 static int pause_interception(struct kvm_vcpu *vcpu)
3141 {
3142         bool in_kernel;
3143         /*
3144          * CPL is not made available for an SEV-ES guest, therefore
3145          * vcpu->arch.preempted_in_kernel can never be true.  Just
3146          * set in_kernel to false as well.
3147          */
3148         in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3149
3150         grow_ple_window(vcpu);
3151
3152         kvm_vcpu_on_spin(vcpu, in_kernel);
3153         return kvm_skip_emulated_instruction(vcpu);
3154 }
3155
3156 static int invpcid_interception(struct kvm_vcpu *vcpu)
3157 {
3158         struct vcpu_svm *svm = to_svm(vcpu);
3159         unsigned long type;
3160         gva_t gva;
3161
3162         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3163                 kvm_queue_exception(vcpu, UD_VECTOR);
3164                 return 1;
3165         }
3166
3167         /*
3168          * For an INVPCID intercept:
3169          * EXITINFO1 provides the linear address of the memory operand.
3170          * EXITINFO2 provides the contents of the register operand.
3171          */
3172         type = svm->vmcb->control.exit_info_2;
3173         gva = svm->vmcb->control.exit_info_1;
3174
3175         return kvm_handle_invpcid(vcpu, type, gva);
3176 }
3177
3178 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3179         [SVM_EXIT_READ_CR0]                     = cr_interception,
3180         [SVM_EXIT_READ_CR3]                     = cr_interception,
3181         [SVM_EXIT_READ_CR4]                     = cr_interception,
3182         [SVM_EXIT_READ_CR8]                     = cr_interception,
3183         [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
3184         [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3185         [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3186         [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3187         [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3188         [SVM_EXIT_READ_DR0]                     = dr_interception,
3189         [SVM_EXIT_READ_DR1]                     = dr_interception,
3190         [SVM_EXIT_READ_DR2]                     = dr_interception,
3191         [SVM_EXIT_READ_DR3]                     = dr_interception,
3192         [SVM_EXIT_READ_DR4]                     = dr_interception,
3193         [SVM_EXIT_READ_DR5]                     = dr_interception,
3194         [SVM_EXIT_READ_DR6]                     = dr_interception,
3195         [SVM_EXIT_READ_DR7]                     = dr_interception,
3196         [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3197         [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3198         [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3199         [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3200         [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3201         [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3202         [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3203         [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3204         [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3205         [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3206         [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3207         [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3208         [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3209         [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
3210         [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
3211         [SVM_EXIT_INTR]                         = intr_interception,
3212         [SVM_EXIT_NMI]                          = nmi_interception,
3213         [SVM_EXIT_SMI]                          = smi_interception,
3214         [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3215         [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
3216         [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
3217         [SVM_EXIT_IRET]                         = iret_interception,
3218         [SVM_EXIT_INVD]                         = kvm_emulate_invd,
3219         [SVM_EXIT_PAUSE]                        = pause_interception,
3220         [SVM_EXIT_HLT]                          = kvm_emulate_halt,
3221         [SVM_EXIT_INVLPG]                       = invlpg_interception,
3222         [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3223         [SVM_EXIT_IOIO]                         = io_interception,
3224         [SVM_EXIT_MSR]                          = msr_interception,
3225         [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3226         [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3227         [SVM_EXIT_VMRUN]                        = vmrun_interception,
3228         [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
3229         [SVM_EXIT_VMLOAD]                       = vmload_interception,
3230         [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3231         [SVM_EXIT_STGI]                         = stgi_interception,
3232         [SVM_EXIT_CLGI]                         = clgi_interception,
3233         [SVM_EXIT_SKINIT]                       = skinit_interception,
3234         [SVM_EXIT_RDTSCP]                       = kvm_handle_invalid_op,
3235         [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
3236         [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
3237         [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
3238         [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
3239         [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
3240         [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
3241         [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
3242         [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
3243         [SVM_EXIT_CR8_WRITE_TRAP]               = cr_trap,
3244         [SVM_EXIT_INVPCID]                      = invpcid_interception,
3245         [SVM_EXIT_NPF]                          = npf_interception,
3246         [SVM_EXIT_RSM]                          = rsm_interception,
3247         [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
3248         [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
3249         [SVM_EXIT_VMGEXIT]                      = sev_handle_vmgexit,
3250 };
3251
3252 static void dump_vmcb(struct kvm_vcpu *vcpu)
3253 {
3254         struct vcpu_svm *svm = to_svm(vcpu);
3255         struct vmcb_control_area *control = &svm->vmcb->control;
3256         struct vmcb_save_area *save = &svm->vmcb->save;
3257         struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3258
3259         if (!dump_invalid_vmcb) {
3260                 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3261                 return;
3262         }
3263
3264         pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3265                svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3266         pr_err("VMCB Control Area:\n");
3267         pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3268         pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
3269         pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3270         pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
3271         pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
3272         pr_err("%-20s%08x %08x\n", "intercepts:",
3273                control->intercepts[INTERCEPT_WORD3],
3274                control->intercepts[INTERCEPT_WORD4]);
3275         pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3276         pr_err("%-20s%d\n", "pause filter threshold:",
3277                control->pause_filter_thresh);
3278         pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3279         pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3280         pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3281         pr_err("%-20s%d\n", "asid:", control->asid);
3282         pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3283         pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3284         pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3285         pr_err("%-20s%08x\n", "int_state:", control->int_state);
3286         pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3287         pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3288         pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3289         pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3290         pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3291         pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3292         pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3293         pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3294         pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
3295         pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3296         pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3297         pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3298         pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3299         pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3300         pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3301         pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3302         pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3303         pr_err("VMCB State Save Area:\n");
3304         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3305                "es:",
3306                save->es.selector, save->es.attrib,
3307                save->es.limit, save->es.base);
3308         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3309                "cs:",
3310                save->cs.selector, save->cs.attrib,
3311                save->cs.limit, save->cs.base);
3312         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3313                "ss:",
3314                save->ss.selector, save->ss.attrib,
3315                save->ss.limit, save->ss.base);
3316         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3317                "ds:",
3318                save->ds.selector, save->ds.attrib,
3319                save->ds.limit, save->ds.base);
3320         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3321                "fs:",
3322                save01->fs.selector, save01->fs.attrib,
3323                save01->fs.limit, save01->fs.base);
3324         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3325                "gs:",
3326                save01->gs.selector, save01->gs.attrib,
3327                save01->gs.limit, save01->gs.base);
3328         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3329                "gdtr:",
3330                save->gdtr.selector, save->gdtr.attrib,
3331                save->gdtr.limit, save->gdtr.base);
3332         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3333                "ldtr:",
3334                save01->ldtr.selector, save01->ldtr.attrib,
3335                save01->ldtr.limit, save01->ldtr.base);
3336         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3337                "idtr:",
3338                save->idtr.selector, save->idtr.attrib,
3339                save->idtr.limit, save->idtr.base);
3340         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3341                "tr:",
3342                save01->tr.selector, save01->tr.attrib,
3343                save01->tr.limit, save01->tr.base);
3344         pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
3345                save->vmpl, save->cpl, save->efer);
3346         pr_err("%-15s %016llx %-13s %016llx\n",
3347                "cr0:", save->cr0, "cr2:", save->cr2);
3348         pr_err("%-15s %016llx %-13s %016llx\n",
3349                "cr3:", save->cr3, "cr4:", save->cr4);
3350         pr_err("%-15s %016llx %-13s %016llx\n",
3351                "dr6:", save->dr6, "dr7:", save->dr7);
3352         pr_err("%-15s %016llx %-13s %016llx\n",
3353                "rip:", save->rip, "rflags:", save->rflags);
3354         pr_err("%-15s %016llx %-13s %016llx\n",
3355                "rsp:", save->rsp, "rax:", save->rax);
3356         pr_err("%-15s %016llx %-13s %016llx\n",
3357                "star:", save01->star, "lstar:", save01->lstar);
3358         pr_err("%-15s %016llx %-13s %016llx\n",
3359                "cstar:", save01->cstar, "sfmask:", save01->sfmask);
3360         pr_err("%-15s %016llx %-13s %016llx\n",
3361                "kernel_gs_base:", save01->kernel_gs_base,
3362                "sysenter_cs:", save01->sysenter_cs);
3363         pr_err("%-15s %016llx %-13s %016llx\n",
3364                "sysenter_esp:", save01->sysenter_esp,
3365                "sysenter_eip:", save01->sysenter_eip);
3366         pr_err("%-15s %016llx %-13s %016llx\n",
3367                "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3368         pr_err("%-15s %016llx %-13s %016llx\n",
3369                "br_from:", save->br_from, "br_to:", save->br_to);
3370         pr_err("%-15s %016llx %-13s %016llx\n",
3371                "excp_from:", save->last_excp_from,
3372                "excp_to:", save->last_excp_to);
3373 }
3374
3375 static bool svm_check_exit_valid(u64 exit_code)
3376 {
3377         return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3378                 svm_exit_handlers[exit_code]);
3379 }
3380
3381 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3382 {
3383         vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3384         dump_vmcb(vcpu);
3385         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3386         vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3387         vcpu->run->internal.ndata = 2;
3388         vcpu->run->internal.data[0] = exit_code;
3389         vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3390         return 0;
3391 }
3392
3393 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
3394 {
3395         if (!svm_check_exit_valid(exit_code))
3396                 return svm_handle_invalid_exit(vcpu, exit_code);
3397
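        /*
         * Open code the most common exit reasons when retpolines are enabled
         * to avoid the cost of a retpolined indirect call through
         * svm_exit_handlers[] on hot paths.
         */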
3398 #ifdef CONFIG_RETPOLINE
3399         if (exit_code == SVM_EXIT_MSR)
3400                 return msr_interception(vcpu);
3401         else if (exit_code == SVM_EXIT_VINTR)
3402                 return interrupt_window_interception(vcpu);
3403         else if (exit_code == SVM_EXIT_INTR)
3404                 return intr_interception(vcpu);
3405         else if (exit_code == SVM_EXIT_HLT)
3406                 return kvm_emulate_halt(vcpu);
3407         else if (exit_code == SVM_EXIT_NPF)
3408                 return npf_interception(vcpu);
3409 #endif
3410         return svm_exit_handlers[exit_code](vcpu);
3411 }
3412
3413 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3414                               u64 *info1, u64 *info2,
3415                               u32 *intr_info, u32 *error_code)
3416 {
3417         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3418
3419         *reason = control->exit_code;
3420         *info1 = control->exit_info_1;
3421         *info2 = control->exit_info_2;
3422         *intr_info = control->exit_int_info;
3423         if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3424             (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3425                 *error_code = control->exit_int_info_err;
3426         else
3427                 *error_code = 0;
3428 }
3429
3430 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
3431 {
3432         struct vcpu_svm *svm = to_svm(vcpu);
3433         struct kvm_run *kvm_run = vcpu->run;
3434         u32 exit_code = svm->vmcb->control.exit_code;
3435
3436         /* SEV-ES guests must use the CR write traps to track CR registers. */
3437         if (!sev_es_guest(vcpu->kvm)) {
3438                 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3439                         vcpu->arch.cr0 = svm->vmcb->save.cr0;
3440                 if (npt_enabled)
3441                         vcpu->arch.cr3 = svm->vmcb->save.cr3;
3442         }
3443
3444         if (is_guest_mode(vcpu)) {
3445                 int vmexit;
3446
3447                 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
3448
3449                 vmexit = nested_svm_exit_special(svm);
3450
3451                 if (vmexit == NESTED_EXIT_CONTINUE)
3452                         vmexit = nested_svm_exit_handled(svm);
3453
3454                 if (vmexit == NESTED_EXIT_DONE)
3455                         return 1;
3456         }
3457
3458         if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3459                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3460                 kvm_run->fail_entry.hardware_entry_failure_reason
3461                         = svm->vmcb->control.exit_code;
3462                 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3463                 dump_vmcb(vcpu);
3464                 return 0;
3465         }
3466
3467         if (exit_fastpath != EXIT_FASTPATH_NONE)
3468                 return 1;
3469
3470         return svm_invoke_exit_handler(vcpu, exit_code);
3471 }
3472
3473 static void pre_svm_run(struct kvm_vcpu *vcpu)
3474 {
3475         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
3476         struct vcpu_svm *svm = to_svm(vcpu);
3477
3478         /*
3479          * If the previous vmrun of the vmcb occurred on a different physical
3480          * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
3481          * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3482          */
3483         if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
3484                 svm->current_vmcb->asid_generation = 0;
3485                 vmcb_mark_all_dirty(svm->vmcb);
3486                 svm->current_vmcb->cpu = vcpu->cpu;
3487         }
3488
3489         if (sev_guest(vcpu->kvm))
3490                 return pre_sev_run(svm, vcpu->cpu);
3491
3492         /* FIXME: handle wraparound of asid_generation */
3493         if (svm->current_vmcb->asid_generation != sd->asid_generation)
3494                 new_asid(svm, sd);
3495 }
3496
3497 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3498 {
3499         struct vcpu_svm *svm = to_svm(vcpu);
3500
3501         svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3502
3503         if (svm->nmi_l1_to_l2)
3504                 return;
3505
3506         svm->nmi_masked = true;
3507         svm_set_iret_intercept(svm);
3508         ++vcpu->stat.nmi_injections;
3509 }
3510
3511 static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
3512 {
3513         struct vcpu_svm *svm = to_svm(vcpu);
3514
3515         if (!is_vnmi_enabled(svm))
3516                 return false;
3517
3518         return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
3519 }
3520
3521 static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
3522 {
3523         struct vcpu_svm *svm = to_svm(vcpu);
3524
3525         if (!is_vnmi_enabled(svm))
3526                 return false;
3527
3528         if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
3529                 return false;
3530
3531         svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
3532         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
3533
3534         /*
3535          * Because the pending NMI is serviced by hardware, KVM can't know when
3536          * the NMI is "injected", but for all intents and purposes, passing the
3537          * NMI off to hardware counts as injection.
3538          */
3539         ++vcpu->stat.nmi_injections;
3540
3541         return true;
3542 }
3543
3544 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
3545 {
3546         struct vcpu_svm *svm = to_svm(vcpu);
3547         u32 type;
3548
3549         if (vcpu->arch.interrupt.soft) {
3550                 if (svm_update_soft_interrupt_rip(vcpu))
3551                         return;
3552
3553                 type = SVM_EVTINJ_TYPE_SOFT;
3554         } else {
3555                 type = SVM_EVTINJ_TYPE_INTR;
3556         }
3557
3558         trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
3559                            vcpu->arch.interrupt.soft, reinjected);
3560         ++vcpu->stat.irq_injections;
3561
3562         svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3563                                        SVM_EVTINJ_VALID | type;
3564 }
3565
3566 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3567                                      int trig_mode, int vector)
3568 {
3569         /*
3570          * apic->apicv_active must be read after vcpu->mode.
3571          * Pairs with smp_store_release in vcpu_enter_guest.
3572          */
3573         bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
3574
3575         /* Note, this is called iff the local APIC is in-kernel. */
3576         if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
3577                 /* Process the interrupt via kvm_check_and_inject_events(). */
3578                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3579                 kvm_vcpu_kick(vcpu);
3580                 return;
3581         }
3582
3583         trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3584         if (in_guest_mode) {
3585                 /*
3586                  * Signal the doorbell to tell hardware to inject the IRQ.  If
3587                  * the vCPU exits the guest before the doorbell chimes, hardware
3588                  * will automatically process AVIC interrupts at the next VMRUN.
3589                  */
3590                 avic_ring_doorbell(vcpu);
3591         } else {
3592                 /*
3593                  * Wake the vCPU if it was blocking.  KVM will then detect the
3594                  * pending IRQ when checking if the vCPU has a wake event.
3595                  */
3596                 kvm_vcpu_wake_up(vcpu);
3597         }
3598 }
3599
3600 static void svm_deliver_interrupt(struct kvm_lapic *apic,  int delivery_mode,
3601                                   int trig_mode, int vector)
3602 {
3603         kvm_lapic_set_irr(vector, apic);
3604
3605         /*
3606          * Pairs with the smp_mb_*() after setting vcpu->mode in
3607          * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3608          * the read of vcpu->mode.  This guarantees that either VMRUN will see
3609          * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3610          * will signal the doorbell if the CPU has already entered the guest.
3611          */
3612         smp_mb__after_atomic();
3613         svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3614 }
3615
3616 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3617 {
3618         struct vcpu_svm *svm = to_svm(vcpu);
3619
3620         /*
3621          * SEV-ES guests must always keep the CR intercepts cleared. CR
3622          * tracking is done using the CR write traps.
3623          */
3624         if (sev_es_guest(vcpu->kvm))
3625                 return;
3626
3627         if (nested_svm_virtualize_tpr(vcpu))
3628                 return;
3629
3630         svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3631
3632         if (irr == -1)
3633                 return;
3634
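        /*
         * Re-arm the CR8 write intercept only if the highest pending IRQ is
         * blocked by the current TPR, so that KVM is notified when the guest
         * lowers the TPR enough to unblock delivery.
         */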
3635         if (tpr >= irr)
3636                 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3637 }
3638
3639 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3640 {
3641         struct vcpu_svm *svm = to_svm(vcpu);
3642
3643         if (is_vnmi_enabled(svm))
3644                 return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
3645         else
3646                 return svm->nmi_masked;
3647 }
3648
3649 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3650 {
3651         struct vcpu_svm *svm = to_svm(vcpu);
3652
3653         if (is_vnmi_enabled(svm)) {
3654                 if (masked)
3655                         svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
3656                 else
3657                         svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
3658
3659         } else {
3660                 svm->nmi_masked = masked;
3661                 if (masked)
3662                         svm_set_iret_intercept(svm);
3663                 else
3664                         svm_clr_iret_intercept(svm);
3665         }
3666 }
3667
3668 bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3669 {
3670         struct vcpu_svm *svm = to_svm(vcpu);
3671         struct vmcb *vmcb = svm->vmcb;
3672
3673         if (!gif_set(svm))
3674                 return true;
3675
3676         if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3677                 return false;
3678
3679         if (svm_get_nmi_mask(vcpu))
3680                 return true;
3681
3682         return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
3683 }
3684
3685 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3686 {
3687         struct vcpu_svm *svm = to_svm(vcpu);
3688         if (svm->nested.nested_run_pending)
3689                 return -EBUSY;
3690
3691         if (svm_nmi_blocked(vcpu))
3692                 return 0;
3693
3694         /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
3695         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3696                 return -EBUSY;
3697         return 1;
3698 }
3699
3700 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3701 {
3702         struct vcpu_svm *svm = to_svm(vcpu);
3703         struct vmcb *vmcb = svm->vmcb;
3704
3705         if (!gif_set(svm))
3706                 return true;
3707
3708         if (is_guest_mode(vcpu)) {
3709                 /* As long as interrupts are being delivered...  */
3710                 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3711                     ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
3712                     : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3713                         return true;
3714
3715                 /* ... vmexits aren't blocked by the interrupt shadow  */
3716                 if (nested_exit_on_intr(svm))
3717                         return false;
3718         } else {
3719                 if (!svm_get_if_flag(vcpu))
3720                         return true;
3721         }
3722
3723         return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3724 }
3725
3726 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3727 {
3728         struct vcpu_svm *svm = to_svm(vcpu);
3729
3730         if (svm->nested.nested_run_pending)
3731                 return -EBUSY;
3732
3733         if (svm_interrupt_blocked(vcpu))
3734                 return 0;
3735
3736         /*
3737          * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3738          * e.g. if the IRQ arrived asynchronously after checking nested events.
3739          */
3740         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3741                 return -EBUSY;
3742
3743         return 1;
3744 }
3745
3746 static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3747 {
3748         struct vcpu_svm *svm = to_svm(vcpu);
3749
3750         /*
3751          * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3752          * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3753          * get that intercept, this function will be called again though and
3754          * we'll get the vintr intercept. However, if the vGIF feature is
3755          * enabled, the STGI interception will not occur. Enable the irq
3756          * window under the assumption that the hardware will set the GIF.
3757          */
3758         if (vgif || gif_set(svm)) {
3759                 /*
3760                  * An IRQ window is not needed when AVIC is enabled,
3761                  * unless there is a pending ExtINT, which cannot be injected
3762                  * via AVIC. In that case, KVM needs to temporarily disable AVIC
3763                  * and fall back to injecting the IRQ via V_IRQ.
3764                  *
3765                  * If running nested, AVIC is already locally inhibited
3766                  * on this vCPU, therefore there is no need to request
3767                  * the VM wide AVIC inhibition.
3768                  */
3769                 if (!is_guest_mode(vcpu))
3770                         kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3771
3772                 svm_set_vintr(svm);
3773         }
3774 }
3775
3776 static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
3777 {
3778         struct vcpu_svm *svm = to_svm(vcpu);
3779
3780         /*
3781          * KVM should never request an NMI window when vNMI is enabled, as KVM
3782          * allows at most one to-be-injected NMI and one pending NMI, i.e. if
3783          * two NMIs arrive simultaneously, KVM will inject one and set
3784          * V_NMI_PENDING for the other.  WARN, but continue with the standard
3785          * single-step approach to try and salvage the pending NMI.
3786          */
3787         WARN_ON_ONCE(is_vnmi_enabled(svm));
3788
3789         if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
3790                 return; /* IRET will cause a vm exit */
3791
3792         /*
3793          * SEV-ES guests are responsible for signaling when a vCPU is ready to
3794          * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
3795          * KVM can't intercept and single-step IRET to detect when NMIs are
3796          * unblocked (architecturally speaking).  See SVM_VMGEXIT_NMI_COMPLETE.
3797          *
3798          * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
3799          * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
3800          * supported NAEs in the GHCB protocol.
3801          */
3802         if (sev_es_guest(vcpu->kvm))
3803                 return;
3804
3805         if (!gif_set(svm)) {
3806                 if (vgif)
3807                         svm_set_intercept(svm, INTERCEPT_STGI);
3808                 return; /* STGI will cause a vm exit */
3809         }
3810
3811         /*
3812          * Something prevents the NMI from being injected.  Single step over
3813          * the possible blocker (IRET, exception injection or interrupt shadow).
3814          */
3815         svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
3816         svm->nmi_singlestep = true;
3817         svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3818 }
3819
3820 static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
3821 {
3822         struct vcpu_svm *svm = to_svm(vcpu);
3823
3824         /*
3825          * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
3826          * A TLB flush for the current ASID flushes both "host" and "guest" TLB
3827          * entries, and thus is a superset of Hyper-V's fine grained flushing.
3828          */
3829         kvm_hv_vcpu_purge_flush_tlb(vcpu);
3830
3831         /*
3832          * Flush only the current ASID even if the TLB flush was invoked via
3833          * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
3834          * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3835          * unconditionally does a TLB flush on both nested VM-Enter and nested
3836          * VM-Exit (via kvm_mmu_reset_context()).
3837          */
3838         if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3839                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3840         else
3841                 svm->current_vmcb->asid_generation--;
3842 }
3843
3844 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
3845 {
3846         hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
3847
3848         /*
3849          * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
3850          * flush the NPT mappings via hypercall as flushing the ASID only
3851          * affects virtual to physical mappings, it does not invalidate guest
3852          * physical to host physical mappings.
3853          */
3854         if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
3855                 hyperv_flush_guest_mapping(root_tdp);
3856
3857         svm_flush_tlb_asid(vcpu);
3858 }
3859
3860 static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
3861 {
3862         /*
3863          * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
3864          * flushes should be routed to hv_flush_remote_tlbs() without requesting
3865          * a "regular" remote flush.  Reaching this point means either there's
3866          * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
3867          * which might be fatal to the guest.  Yell, but try to recover.
3868          */
3869         if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
3870                 hv_flush_remote_tlbs(vcpu->kvm);
3871
3872         svm_flush_tlb_asid(vcpu);
3873 }
3874
3875 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3876 {
3877         struct vcpu_svm *svm = to_svm(vcpu);
3878
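        /* INVLPGA flushes the TLB entry for @gva in the given ASID only. */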
3879         invlpga(gva, svm->vmcb->control.asid);
3880 }
3881
3882 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3883 {
3884         struct vcpu_svm *svm = to_svm(vcpu);
3885
3886         if (nested_svm_virtualize_tpr(vcpu))
3887                 return;
3888
3889         if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3890                 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3891                 kvm_set_cr8(vcpu, cr8);
3892         }
3893 }
3894
3895 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3896 {
3897         struct vcpu_svm *svm = to_svm(vcpu);
3898         u64 cr8;
3899
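        /*
         * No sync is needed when the TPR is virtualized for L2
         * (V_INTR_MASKING) or when AVIC handles the APIC for this vCPU.
         */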
3900         if (nested_svm_virtualize_tpr(vcpu) ||
3901             kvm_vcpu_apicv_active(vcpu))
3902                 return;
3903
3904         cr8 = kvm_get_cr8(vcpu);
3905         svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3906         svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3907 }
3908
3909 static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
3910                                         int type)
3911 {
3912         bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
3913         bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
3914         struct vcpu_svm *svm = to_svm(vcpu);
3915
3916         /*
3917          * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
3918          * associated with the original soft exception/interrupt.  next_rip is
3919          * cleared on all exits that can occur while vectoring an event, so KVM
3920          * needs to manually set next_rip for re-injection.  Unlike the !nrips
3921          * case below, this needs to be done if and only if KVM is re-injecting
3922          * the same event, i.e. if the event is a soft exception/interrupt,
3923          * otherwise next_rip is unused on VMRUN.
3924          */
3925         if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
3926             kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
3927                 svm->vmcb->control.next_rip = svm->soft_int_next_rip;
3928         /*
3929          * If NRIPS isn't enabled, KVM must manually advance RIP prior to
3930          * injecting the soft exception/interrupt.  That advancement needs to
3931          * be unwound if vectoring didn't complete.  Note, the new event may
3932          * not be the injected event, e.g. if KVM injected an INTn, the INTn
3933          * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
3934          * be the reported vectored event, but RIP still needs to be unwound.
3935          */
3936         else if (!nrips && (is_soft || is_exception) &&
3937                  kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
3938                 kvm_rip_write(vcpu, svm->soft_int_old_rip);
3939 }
3940
3941 static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
3942 {
3943         struct vcpu_svm *svm = to_svm(vcpu);
3944         u8 vector;
3945         int type;
3946         u32 exitintinfo = svm->vmcb->control.exit_int_info;
3947         bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
3948         bool soft_int_injected = svm->soft_int_injected;
3949
3950         svm->nmi_l1_to_l2 = false;
3951         svm->soft_int_injected = false;
3952
3953         /*
3954          * If we've made progress since setting awaiting_iret_completion, we've
3955          * executed an IRET and can allow NMI injection.
3956          */
3957         if (svm->awaiting_iret_completion &&
3958             kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
3959                 svm->awaiting_iret_completion = false;
3960                 svm->nmi_masked = false;
3961                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3962         }
3963
3964         vcpu->arch.nmi_injected = false;
3965         kvm_clear_exception_queue(vcpu);
3966         kvm_clear_interrupt_queue(vcpu);
3967
3968         if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3969                 return;
3970
3971         kvm_make_request(KVM_REQ_EVENT, vcpu);
3972
3973         vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3974         type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3975
3976         if (soft_int_injected)
3977                 svm_complete_soft_interrupt(vcpu, vector, type);
3978
3979         switch (type) {
3980         case SVM_EXITINTINFO_TYPE_NMI:
3981                 vcpu->arch.nmi_injected = true;
3982                 svm->nmi_l1_to_l2 = nmi_l1_to_l2;
3983                 break;
3984         case SVM_EXITINTINFO_TYPE_EXEPT:
3985                 /*
3986                  * Never re-inject a #VC exception.
3987                  */
3988                 if (vector == X86_TRAP_VC)
3989                         break;
3990
3991                 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3992                         u32 err = svm->vmcb->control.exit_int_info_err;
3993
3994                         kvm_requeue_exception_e(vcpu, vector, err);
3995                 } else
3996                         kvm_requeue_exception(vcpu, vector);
3997                 break;
3998         case SVM_EXITINTINFO_TYPE_INTR:
3999                 kvm_queue_interrupt(vcpu, vector, false);
4000                 break;
4001         case SVM_EXITINTINFO_TYPE_SOFT:
4002                 kvm_queue_interrupt(vcpu, vector, true);
4003                 break;
4004         default:
4005                 break;
4006         }
4007
4008 }
4009
4010 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
4011 {
4012         struct vcpu_svm *svm = to_svm(vcpu);
4013         struct vmcb_control_area *control = &svm->vmcb->control;
4014
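        /*
         * Move the event that was about to be injected into the exit
         * interrupt info so that svm_complete_interrupts() re-queues it,
         * e.g. when the VM-Enter was aborted after injection was set up.
         */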
4015         control->exit_int_info = control->event_inj;
4016         control->exit_int_info_err = control->event_inj_err;
4017         control->event_inj = 0;
4018         svm_complete_interrupts(vcpu);
4019 }
4020
4021 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
4022 {
4023         return 1;
4024 }
4025
4026 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
4027 {
4028         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
4029
4030         /*
4031          * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
4032          * can't read guest memory (dereference memslots) to decode the WRMSR.
4033          */
4034         if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
4035             nrips && control->next_rip)
4036                 return handle_fastpath_set_msr_irqoff(vcpu);
4037
4038         return EXIT_FASTPATH_NONE;
4039 }
4040
4041 static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
4042 {
4043         struct vcpu_svm *svm = to_svm(vcpu);
4044
4045         guest_state_enter_irqoff();
4046
4047         if (sev_es_guest(vcpu->kvm))
4048                 __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
4049         else
4050                 __svm_vcpu_run(svm, spec_ctrl_intercepted);
4051
4052         guest_state_exit_irqoff();
4053 }
4054
4055 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
4056 {
4057         struct vcpu_svm *svm = to_svm(vcpu);
4058         bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
4059
4060         trace_kvm_entry(vcpu);
4061
4062         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4063         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4064         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4065
4066         /*
4067          * Disable singlestep if we're injecting an interrupt/exception.
4068          * We don't want our modified rflags to be pushed on the stack where
4069          * we might not be able to easily reset them if we disabled NMI
4070          * singlestep later.
4071          */
4072         if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
4073                 /*
4074                  * Event injection happens before external interrupts cause a
4075                  * vmexit and interrupts are disabled here, so smp_send_reschedule
4076                  * is enough to force an immediate vmexit.
4077                  */
4078                 disable_nmi_singlestep(svm);
4079                 smp_send_reschedule(vcpu->cpu);
4080         }
4081
4082         pre_svm_run(vcpu);
4083
4084         sync_lapic_to_cr8(vcpu);
4085
4086         if (unlikely(svm->asid != svm->vmcb->control.asid)) {
4087                 svm->vmcb->control.asid = svm->asid;
4088                 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
4089         }
4090         svm->vmcb->save.cr2 = vcpu->arch.cr2;
4091
4092         svm_hv_update_vp_id(svm->vmcb, vcpu);
4093
4094         /*
4095          * Run with all-zero DR6 unless needed, so that we can get the exact cause
4096          * of a #DB.
4097          */
4098         if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
4099                 svm_set_dr6(svm, vcpu->arch.dr6);
4100         else
4101                 svm_set_dr6(svm, DR6_ACTIVE_LOW);
4102
4103         clgi();
4104         kvm_load_guest_xsave_state(vcpu);
4105
4106         kvm_wait_lapic_expire(vcpu);
4107
4108         /*
4109          * If this vCPU has touched SPEC_CTRL, restore the guest's value if
4110          * it's non-zero. Since vmentry is serialising on affected CPUs, there
4111          * is no need to worry about the conditional branch over the wrmsr
4112          * being speculatively taken.
4113          */
4114         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4115                 x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
4116
4117         svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
4118
4119         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4120                 x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
4121
4122         if (!sev_es_guest(vcpu->kvm)) {
4123                 vcpu->arch.cr2 = svm->vmcb->save.cr2;
4124                 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4125                 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4126                 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4127         }
4128         vcpu->arch.regs_dirty = 0;
4129
4130         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4131                 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
4132
4133         kvm_load_host_xsave_state(vcpu);
4134         stgi();
4135
4136         /* Any pending NMI will be delivered here, now that GIF is set */
4137
4138         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4139                 kvm_after_interrupt(vcpu);
4140
4141         sync_cr8_to_lapic(vcpu);
4142
4143         svm->next_rip = 0;
4144         if (is_guest_mode(vcpu)) {
4145                 nested_sync_control_from_vmcb02(svm);
4146
4147                 /* Track VMRUNs that have made it past consistency checking */
4148                 if (svm->nested.nested_run_pending &&
4149                     svm->vmcb->control.exit_code != SVM_EXIT_ERR)
4150                         ++vcpu->stat.nested_run;
4151
4152                 svm->nested.nested_run_pending = 0;
4153         }
4154
4155         svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4156         vmcb_mark_all_clean(svm->vmcb);
4157
4158         /* if exit due to PF check for async PF */
4159         if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4160                 vcpu->arch.apf.host_apf_flags =
4161                         kvm_read_and_reset_apf_flags();
4162
4163         vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
4164
4165         /*
4166          * We need to handle MC intercepts here before the vcpu has a chance to
4167          * change the physical cpu
4168          */
4169         if (unlikely(svm->vmcb->control.exit_code ==
4170                      SVM_EXIT_EXCP_BASE + MC_VECTOR))
4171                 svm_handle_mce(vcpu);
4172
4173         trace_kvm_exit(vcpu, KVM_ISA_SVM);
4174
4175         svm_complete_interrupts(vcpu);
4176
4177         if (is_guest_mode(vcpu))
4178                 return EXIT_FASTPATH_NONE;
4179
4180         return svm_exit_handlers_fastpath(vcpu);
4181 }
4182
4183 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
4184                              int root_level)
4185 {
4186         struct vcpu_svm *svm = to_svm(vcpu);
4187         unsigned long cr3;
4188
4189         if (npt_enabled) {
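                /*
                 * With NPT, root_hpa is the NPT root; the guest manages its
                 * own CR3, which is simply refreshed from vcpu->arch.cr3
                 * below.
                 */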
4190                 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
4191                 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
4192
4193                 hv_track_root_tdp(vcpu, root_hpa);
4194
4195                 cr3 = vcpu->arch.cr3;
4196         } else if (root_level >= PT64_ROOT_4LEVEL) {
4197                 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
4198         } else {
4199                 /* PCID in the guest should be impossible with a 32-bit MMU. */
4200                 WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
4201                 cr3 = root_hpa;
4202         }
4203
4204         svm->vmcb->save.cr3 = cr3;
4205         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
4206 }
4207
4208 static void
4209 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4210 {
4211         /*
4212          * Patch in the VMMCALL instruction:
4213          */
4214         hypercall[0] = 0x0f;
4215         hypercall[1] = 0x01;
4216         hypercall[2] = 0xd9;
4217 }
4218
4219 /*
4220  * The kvm parameter can be NULL (module initialization, or invocation before
4221  * VM creation). Be sure to check the kvm parameter before using it.
4222  */
4223 static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
4224 {
4225         switch (index) {
4226         case MSR_IA32_MCG_EXT_CTL:
4227         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
4228                 return false;
4229         case MSR_IA32_SMBASE:
4230                 if (!IS_ENABLED(CONFIG_KVM_SMM))
4231                         return false;
4232                 /* SEV-ES guests do not support SMM, so report false */
4233                 if (kvm && sev_es_guest(kvm))
4234                         return false;
4235                 break;
4236         default:
4237                 break;
4238         }
4239
4240         return true;
4241 }
4242
4243 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
4244 {
4245         struct vcpu_svm *svm = to_svm(vcpu);
4246         struct kvm_cpuid_entry2 *best;
4247
4248         vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4249                                     boot_cpu_has(X86_FEATURE_XSAVE) &&
4250                                     boot_cpu_has(X86_FEATURE_XSAVES);
4251
4252         /* Update nrips enabled cache */
4253         svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
4254                              guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
4255
4256         svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
4257         svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
4258
4259         svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4260
4261         svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
4262                         guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
4263
4264         svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
4265                         guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
4266
4267         svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
4268
4269         svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI);
4270
4271         svm_recalc_instruction_intercepts(vcpu, svm);
4272
4273         if (boot_cpu_has(X86_FEATURE_IBPB))
4274                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
4275                                      !!guest_has_pred_cmd_msr(vcpu));
4276
4277         if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
4278                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
4279                                      !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
4280
4281         /* For SEV guests, the memory encryption bit is not reserved in CR3.  */
4282         if (sev_guest(vcpu->kvm)) {
4283                 best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
4284                 if (best)
4285                         vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4286         }
4287
4288         init_vmcb_after_set_cpuid(vcpu);
4289 }
4290
4291 static bool svm_has_wbinvd_exit(void)
4292 {
4293         return true;
4294 }
4295
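/*
 * Map each emulated x86 instruction intercept to the SVM exit code used to
 * check it and the emulation stage (pre-exception, post-exception, or
 * post-memory-access) at which the intercept is evaluated.
 */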
4296 #define PRE_EX(exit)  { .exit_code = (exit), \
4297                         .stage = X86_ICPT_PRE_EXCEPT, }
4298 #define POST_EX(exit) { .exit_code = (exit), \
4299                         .stage = X86_ICPT_POST_EXCEPT, }
4300 #define POST_MEM(exit) { .exit_code = (exit), \
4301                         .stage = X86_ICPT_POST_MEMACCESS, }
4302
4303 static const struct __x86_intercept {
4304         u32 exit_code;
4305         enum x86_intercept_stage stage;
4306 } x86_intercept_map[] = {
4307         [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
4308         [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
4309         [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
4310         [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
4311         [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
4312         [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
4313         [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
4314         [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
4315         [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
4316         [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
4317         [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
4318         [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
4319         [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
4320         [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
4321         [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
4322         [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
4323         [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
4324         [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
4325         [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
4326         [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
4327         [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
4328         [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
4329         [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
4330         [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
4331         [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
4332         [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
4333         [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
4334         [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
4335         [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
4336         [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
4337         [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
4338         [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
4339         [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
4340         [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
4341         [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
4342         [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
4343         [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
4344         [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
4345         [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
4346         [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
4347         [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
4348         [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
4349         [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
4350         [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
4351         [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
4352         [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
4353         [x86_intercept_xsetbv]          = PRE_EX(SVM_EXIT_XSETBV),
4354 };
4355
4356 #undef PRE_EX
4357 #undef POST_EX
4358 #undef POST_MEM
4359
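/*
 * Check whether an instruction that KVM is emulating on behalf of L2 would
 * have been intercepted by the L1 hypervisor.  The exit code from the table
 * above is refined for CR/DR accesses, MSR and I/O instructions, and the
 * result of nested_svm_exit_handled() determines whether emulation continues
 * or is reported as intercepted (i.e. a nested exit is taken instead).
 */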
4360 static int svm_check_intercept(struct kvm_vcpu *vcpu,
4361                                struct x86_instruction_info *info,
4362                                enum x86_intercept_stage stage,
4363                                struct x86_exception *exception)
4364 {
4365         struct vcpu_svm *svm = to_svm(vcpu);
4366         int vmexit, ret = X86EMUL_CONTINUE;
4367         struct __x86_intercept icpt_info;
4368         struct vmcb *vmcb = svm->vmcb;
4369
4370         if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4371                 goto out;
4372
4373         icpt_info = x86_intercept_map[info->intercept];
4374
4375         if (stage != icpt_info.stage)
4376                 goto out;
4377
4378         switch (icpt_info.exit_code) {
4379         case SVM_EXIT_READ_CR0:
4380                 if (info->intercept == x86_intercept_cr_read)
4381                         icpt_info.exit_code += info->modrm_reg;
4382                 break;
4383         case SVM_EXIT_WRITE_CR0: {
4384                 unsigned long cr0, val;
4385
4386                 if (info->intercept == x86_intercept_cr_write)
4387                         icpt_info.exit_code += info->modrm_reg;
4388
4389                 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4390                     info->intercept == x86_intercept_clts)
4391                         break;
4392
4393                 if (!(vmcb12_is_intercept(&svm->nested.ctl,
4394                                         INTERCEPT_SELECTIVE_CR0)))
4395                         break;
4396
4397                 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4398                 val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4399
4400                 if (info->intercept == x86_intercept_lmsw) {
4401                         cr0 &= 0xfUL;
4402                         val &= 0xfUL;
4403                         /* lmsw can't clear PE - catch this here */
4404                         if (cr0 & X86_CR0_PE)
4405                                 val |= X86_CR0_PE;
4406                 }
4407
4408                 if (cr0 ^ val)
4409                         icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4410
4411                 break;
4412         }
4413         case SVM_EXIT_READ_DR0:
4414         case SVM_EXIT_WRITE_DR0:
4415                 icpt_info.exit_code += info->modrm_reg;
4416                 break;
4417         case SVM_EXIT_MSR:
4418                 if (info->intercept == x86_intercept_wrmsr)
4419                         vmcb->control.exit_info_1 = 1;
4420                 else
4421                         vmcb->control.exit_info_1 = 0;
4422                 break;
4423         case SVM_EXIT_PAUSE:
4424                 /*
4425                  * This intercept is reached for NOP only; PAUSE is encoded
4426                  * as REP NOP (F3 90), so check for the REP prefix here.
4427                  */
4428                 if (info->rep_prefix != REPE_PREFIX)
4429                         goto out;
4430                 break;
4431         case SVM_EXIT_IOIO: {
4432                 u64 exit_info;
4433                 u32 bytes;
4434
4435                 if (info->intercept == x86_intercept_in ||
4436                     info->intercept == x86_intercept_ins) {
4437                         exit_info = ((info->src_val & 0xffff) << 16) |
4438                                 SVM_IOIO_TYPE_MASK;
4439                         bytes = info->dst_bytes;
4440                 } else {
4441                         exit_info = (info->dst_val & 0xffff) << 16;
4442                         bytes = info->src_bytes;
4443                 }
4444
4445                 if (info->intercept == x86_intercept_outs ||
4446                     info->intercept == x86_intercept_ins)
4447                         exit_info |= SVM_IOIO_STR_MASK;
4448
4449                 if (info->rep_prefix)
4450                         exit_info |= SVM_IOIO_REP_MASK;
4451
4452                 bytes = min(bytes, 4u);
4453
4454                 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4455
4456                 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4457
4458                 vmcb->control.exit_info_1 = exit_info;
4459                 vmcb->control.exit_info_2 = info->next_rip;
4460
4461                 break;
4462         }
4463         default:
4464                 break;
4465         }
4466
4467         /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4468         if (static_cpu_has(X86_FEATURE_NRIPS))
4469                 vmcb->control.next_rip  = info->next_rip;
4470         vmcb->control.exit_code = icpt_info.exit_code;
4471         vmexit = nested_svm_exit_handled(svm);
4472
4473         ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4474                                            : X86EMUL_CONTINUE;
4475
4476 out:
4477         return ret;
4478 }
4479
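/*
 * Called with IRQs disabled immediately after VM-Exit: record that the vCPU
 * exited at an instruction boundary when the exit was due to a host interrupt
 * (SVM_EXIT_INTR).
 */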
4480 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4481 {
4482         if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
4483                 vcpu->arch.at_instruction_boundary = true;
4484 }
4485
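/*
 * Shrink the PAUSE filter window (grown on PAUSE exits) each time the vCPU is
 * scheduled back in, unless PAUSE exiting is disabled for this VM.
 */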
4486 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4487 {
4488         if (!kvm_pause_in_guest(vcpu->kvm))
4489                 shrink_ple_window(vcpu);
4490 }
4491
4492 static void svm_setup_mce(struct kvm_vcpu *vcpu)
4493 {
4494         /* [63:9] are reserved. */
4495         vcpu->arch.mcg_cap &= 0x1ff;
4496 }
4497
4498 #ifdef CONFIG_KVM_SMM
4499 bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4500 {
4501         struct vcpu_svm *svm = to_svm(vcpu);
4502
4503         /* Per APM Vol.2 15.22.2 "Response to SMI" */
4504         if (!gif_set(svm))
4505                 return true;
4506
4507         return is_smm(vcpu);
4508 }
4509
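/*
 * Returns -EBUSY if the SMI needs to be retried (a nested VMRUN is pending, or
 * the SMI must first trigger a nested VM-Exit), 0 if SMIs are blocked, and 1
 * if an SMI can be injected.
 */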
4510 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4511 {
4512         struct vcpu_svm *svm = to_svm(vcpu);
4513         if (svm->nested.nested_run_pending)
4514                 return -EBUSY;
4515
4516         if (svm_smi_blocked(vcpu))
4517                 return 0;
4518
4519         /* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
4520         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4521                 return -EBUSY;
4522
4523         return 1;
4524 }
4525
4526 static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
4527 {
4528         struct vcpu_svm *svm = to_svm(vcpu);
4529         struct kvm_host_map map_save;
4530         int ret;
4531
4532         if (!is_guest_mode(vcpu))
4533                 return 0;
4534
4535         /*
4536          * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
4537          * responsible for ensuring nested SVM and SMIs are mutually exclusive.
4538          */
4539
4540         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4541                 return 1;
4542
4543         smram->smram64.svm_guest_flag = 1;
4544         smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
4545
4546         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4547         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4548         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4549
4550         ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
4551         if (ret)
4552                 return ret;
4553
4554         /*
4555          * KVM uses VMCB01 to store L1 host state while L2 runs, but
4556          * VMCB01 is going to be used during SMM and thus the state will
4557          * be lost.  Temporarily save the non-VMLOAD/VMSAVE state to the
4558          * host save area pointed to by MSR_VM_HSAVE_PA.  The APM
4559          * guarantees that the format of the area is identical to the
4560          * guest save area offset by 0x400 (matching the offset of
4561          * 'struct vmcb_save_area' within 'struct vmcb').  Note: the HSAVE
4562          * area may also be used by the L1 hypervisor to save additional
4563          * host context (e.g. KVM does that, see
4564          * svm_prepare_switch_to_guest()), which must be preserved.
4565          */
4566         if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4567                 return 1;
4568
4569         BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4570
4571         svm_copy_vmrun_state(map_save.hva + 0x400,
4572                              &svm->vmcb01.ptr->save);
4573
4574         kvm_vcpu_unmap(vcpu, &map_save, true);
4575         return 0;
4576 }
4577
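/*
 * On RSM, if the SMI interrupted L2 (as recorded in the 64-bit SMRAM state by
 * svm_enter_smm()), restore the L1 host state that was stashed in the HSAVE
 * area and re-enter guest mode from the saved vmcb12.
 */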
4578 static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
4579 {
4580         struct vcpu_svm *svm = to_svm(vcpu);
4581         struct kvm_host_map map, map_save;
4582         struct vmcb *vmcb12;
4583         int ret;
4584
4585         const struct kvm_smram_state_64 *smram64 = &smram->smram64;
4586
4587         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4588                 return 0;
4589
4590         /* Non-zero if the SMI arrived while the vCPU was in guest mode. */
4591         if (!smram64->svm_guest_flag)
4592                 return 0;
4593
4594         if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4595                 return 1;
4596
4597         if (!(smram64->efer & EFER_SVME))
4598                 return 1;
4599
4600         if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
4601                 return 1;
4602
4603         ret = 1;
4604         if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4605                 goto unmap_map;
4606
4607         if (svm_allocate_nested(svm))
4608                 goto unmap_save;
4609
4610         /*
4611          * Restore L1 host state from L1 HSAVE area as VMCB01 was
4612          * used during SMM (see svm_enter_smm())
4613          */
4614
4615         svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4616
4617         /*
4618          * Enter the nested guest now
4619          */
4620
4621         vmcb_mark_all_dirty(svm->vmcb01.ptr);
4622
4623         vmcb12 = map.hva;
4624         nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4625         nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4626         ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
4627
4628         if (ret)
4629                 goto unmap_save;
4630
4631         svm->nested.nested_run_pending = 1;
4632
4633 unmap_save:
4634         kvm_vcpu_unmap(vcpu, &map_save, true);
4635 unmap_map:
4636         kvm_vcpu_unmap(vcpu, &map, true);
4637         return ret;
4638 }
4639
4640 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4641 {
4642         struct vcpu_svm *svm = to_svm(vcpu);
4643
4644         if (!gif_set(svm)) {
4645                 if (vgif)
4646                         svm_set_intercept(svm, INTERCEPT_STGI);
4647                 /* STGI will cause a vm exit */
4648         } else {
4649                 /* We must be in SMM; RSM will cause a vmexit anyway.  */
4650         }
4651 }
4652 #endif
4653
4654 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4655                                         void *insn, int insn_len)
4656 {
4657         bool smep, smap, is_user;
4658         u64 error_code;
4659
4660         /* Emulation is always possible when KVM has access to all guest state. */
4661         if (!sev_guest(vcpu->kvm))
4662                 return true;
4663
4664         /* #UD and #GP should never be intercepted for SEV guests. */
4665         WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4666                                   EMULTYPE_TRAP_UD_FORCED |
4667                                   EMULTYPE_VMWARE_GP));
4668
4669         /*
4670          * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4671          * to guest register state.
4672          */
4673         if (sev_es_guest(vcpu->kvm))
4674                 return false;
4675
4676         /*
4677          * Emulation is possible if the instruction is already decoded, e.g.
4678          * when completing I/O after returning from userspace.
4679          */
4680         if (emul_type & EMULTYPE_NO_DECODE)
4681                 return true;
4682
4683         /*
4684          * Emulation is possible for SEV guests if and only if a prefilled
4685          * buffer containing the bytes of the intercepted instruction is
4686          * available. SEV guest memory is encrypted with a guest specific key
4687          * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
4688          * decode garbage.
4689          *
4690          * If KVM is NOT trying to simply skip an instruction, inject #UD if
4691          * KVM reached this point without an instruction buffer.  In practice,
4692          * this path should never be hit by a well-behaved guest, e.g. KVM
4693          * doesn't intercept #UD or #GP for SEV guests, but this path is still
4694          * theoretically reachable, e.g. via unaccelerated fault-like AVIC
4695          * access, and needs to be handled by KVM to avoid putting the guest
4696          * into an infinite loop.  Injecting #UD is somewhat arbitrary, but
4697          * it's the least awful option given the lack of insight into the guest.
4698          *
4699          * If KVM is trying to skip an instruction, simply resume the guest.
4700          * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
4701          * will attempt to re-inject the INT3/INTO and skip the instruction.
4702          * In that scenario, retrying the INT3/INTO and hoping the guest will
4703          * make forward progress is the only option that has a chance of
4704          * success (and in practice it will work the vast majority of the time).
4705          */
4706         if (unlikely(!insn)) {
4707                 if (!(emul_type & EMULTYPE_SKIP))
4708                         kvm_queue_exception(vcpu, UD_VECTOR);
4709                 return false;
4710         }
4711
4712         /*
4713          * Emulate for SEV guests if the insn buffer is not empty.  The buffer
4714          * will be empty if the DecodeAssist microcode cannot fetch bytes for
4715          * the faulting instruction because the code fetch itself faulted, e.g.
4716          * the guest attempted to fetch from emulated MMIO or a guest page
4717          * table used to translate CS:RIP resides in emulated MMIO.
4718          */
4719         if (likely(insn_len))
4720                 return true;
4721
4722         /*
4723          * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4724          *
4725          * Errata:
4726          * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4727          * possible that CPU microcode implementing DecodeAssist will fail to
4728          * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4729          * be '0'.  This happens because microcode reads CS:RIP using a _data_
4730          * load uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
4731          * gives up and does not fill the instruction bytes buffer.
4732          *
4733          * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4734          * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4735          * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4736          * GuestIntrBytes field of the VMCB.
4737          *
4738          * This does _not_ mean that the erratum has been encountered, as the
4739          * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4740          * #PF, e.g. if the guest attempted to execute from emulated MMIO and
4741          * encountered a reserved/not-present #PF.
4742          *
4743          * To hit the erratum, the following conditions must be true:
4744          *    1. CR4.SMAP=1 (obviously).
4745          *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
4746          *       have been hit as the guest would have encountered a SMEP
4747          *       violation #PF, not a #NPF.
4748          *    3. The #NPF is not due to a code fetch, in which case failure to
4749          *       retrieve the instruction bytes is legitimate (see above).
4750          *
4751          * In addition, don't apply the erratum workaround if the #NPF occurred
4752          * while translating guest page tables (see below).
4753          */
4754         error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4755         if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4756                 goto resume_guest;
4757
4758         smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
4759         smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
4760         is_user = svm_get_cpl(vcpu) == 3;
4761         if (smap && (!smep || is_user)) {
4762                 pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
4763
4764                 /*
4765                  * If the fault occurred in userspace, arbitrarily inject #GP
4766                  * to avoid killing the guest and to hopefully avoid confusing
4767                  * the guest kernel too much, e.g. injecting #PF would not be
4768                  * coherent with respect to the guest's page tables.  Request
4769                  * triple fault if the fault occurred in the kernel as there's
4770                  * no fault that KVM can inject without confusing the guest.
4771                  * In practice, the triple fault is moot as no sane SEV kernel
4772                  * will execute from user memory while also running with SMAP=1.
4773                  */
4774                 if (is_user)
4775                         kvm_inject_gp(vcpu, 0);
4776                 else
4777                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4778         }
4779
4780 resume_guest:
4781         /*
4782          * If the erratum was not hit, simply resume the guest and let it fault
4783          * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
4784          * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
4785          * userspace will kill the guest, and letting the emulator read garbage
4786          * will yield random behavior and potentially corrupt the guest.
4787          *
4788          * Simply resuming the guest is technically not a violation of the SEV
4789          * architecture.  AMD's APM states that all code fetches and page table
4790          * accesses for SEV guest are encrypted, regardless of the C-Bit.  The
4791          * APM also states that encrypted accesses to MMIO are "ignored", but
4792          * doesn't explicitly define "ignored", i.e. doing nothing and letting
4793          * the guest spin is technically "ignoring" the access.
4794          */
4795         return false;
4796 }
4797
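/* INIT delivery is blocked whenever GIF is clear. */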
4798 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4799 {
4800         struct vcpu_svm *svm = to_svm(vcpu);
4801
4802         return !gif_set(svm);
4803 }
4804
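/*
 * SEV-ES guests need dedicated SIPI handling since KVM can't directly access
 * the encrypted register state of the target vCPU.
 */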
4805 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4806 {
4807         if (!sev_es_guest(vcpu->kvm))
4808                 return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4809
4810         sev_vcpu_deliver_sipi_vector(vcpu, vector);
4811 }
4812
4813 static void svm_vm_destroy(struct kvm *kvm)
4814 {
4815         avic_vm_destroy(kvm);
4816         sev_vm_destroy(kvm);
4817 }
4818
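/*
 * Per-VM init: opt out of PAUSE interception if pause filtering is disabled
 * via module params, and set up AVIC state when APICv is enabled.
 */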
4819 static int svm_vm_init(struct kvm *kvm)
4820 {
4821         if (!pause_filter_count || !pause_filter_thresh)
4822                 kvm->arch.pause_in_guest = true;
4823
4824         if (enable_apicv) {
4825                 int ret = avic_vm_init(kvm);
4826                 if (ret)
4827                         return ret;
4828         }
4829
4830         return 0;
4831 }
4832
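/*
 * Vendor callbacks handed to the common x86 code.  Some entries are cleared
 * or adjusted at runtime by svm_hardware_setup() based on the features that
 * are actually available (e.g. the AVIC and vNMI hooks).
 */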
4833 static struct kvm_x86_ops svm_x86_ops __initdata = {
4834         .name = KBUILD_MODNAME,
4835
4836         .check_processor_compatibility = svm_check_processor_compat,
4837
4838         .hardware_unsetup = svm_hardware_unsetup,
4839         .hardware_enable = svm_hardware_enable,
4840         .hardware_disable = svm_hardware_disable,
4841         .has_emulated_msr = svm_has_emulated_msr,
4842
4843         .vcpu_create = svm_vcpu_create,
4844         .vcpu_free = svm_vcpu_free,
4845         .vcpu_reset = svm_vcpu_reset,
4846
4847         .vm_size = sizeof(struct kvm_svm),
4848         .vm_init = svm_vm_init,
4849         .vm_destroy = svm_vm_destroy,
4850
4851         .prepare_switch_to_guest = svm_prepare_switch_to_guest,
4852         .vcpu_load = svm_vcpu_load,
4853         .vcpu_put = svm_vcpu_put,
4854         .vcpu_blocking = avic_vcpu_blocking,
4855         .vcpu_unblocking = avic_vcpu_unblocking,
4856
4857         .update_exception_bitmap = svm_update_exception_bitmap,
4858         .get_msr_feature = svm_get_msr_feature,
4859         .get_msr = svm_get_msr,
4860         .set_msr = svm_set_msr,
4861         .get_segment_base = svm_get_segment_base,
4862         .get_segment = svm_get_segment,
4863         .set_segment = svm_set_segment,
4864         .get_cpl = svm_get_cpl,
4865         .get_cs_db_l_bits = svm_get_cs_db_l_bits,
4866         .set_cr0 = svm_set_cr0,
4867         .post_set_cr3 = sev_post_set_cr3,
4868         .is_valid_cr4 = svm_is_valid_cr4,
4869         .set_cr4 = svm_set_cr4,
4870         .set_efer = svm_set_efer,
4871         .get_idt = svm_get_idt,
4872         .set_idt = svm_set_idt,
4873         .get_gdt = svm_get_gdt,
4874         .set_gdt = svm_set_gdt,
4875         .set_dr7 = svm_set_dr7,
4876         .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4877         .cache_reg = svm_cache_reg,
4878         .get_rflags = svm_get_rflags,
4879         .set_rflags = svm_set_rflags,
4880         .get_if_flag = svm_get_if_flag,
4881
4882         .flush_tlb_all = svm_flush_tlb_all,
4883         .flush_tlb_current = svm_flush_tlb_current,
4884         .flush_tlb_gva = svm_flush_tlb_gva,
4885         .flush_tlb_guest = svm_flush_tlb_asid,
4886
4887         .vcpu_pre_run = svm_vcpu_pre_run,
4888         .vcpu_run = svm_vcpu_run,
4889         .handle_exit = svm_handle_exit,
4890         .skip_emulated_instruction = svm_skip_emulated_instruction,
4891         .update_emulated_instruction = NULL,
4892         .set_interrupt_shadow = svm_set_interrupt_shadow,
4893         .get_interrupt_shadow = svm_get_interrupt_shadow,
4894         .patch_hypercall = svm_patch_hypercall,
4895         .inject_irq = svm_inject_irq,
4896         .inject_nmi = svm_inject_nmi,
4897         .is_vnmi_pending = svm_is_vnmi_pending,
4898         .set_vnmi_pending = svm_set_vnmi_pending,
4899         .inject_exception = svm_inject_exception,
4900         .cancel_injection = svm_cancel_injection,
4901         .interrupt_allowed = svm_interrupt_allowed,
4902         .nmi_allowed = svm_nmi_allowed,
4903         .get_nmi_mask = svm_get_nmi_mask,
4904         .set_nmi_mask = svm_set_nmi_mask,
4905         .enable_nmi_window = svm_enable_nmi_window,
4906         .enable_irq_window = svm_enable_irq_window,
4907         .update_cr8_intercept = svm_update_cr8_intercept,
4908         .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
4909         .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
4910         .apicv_post_state_restore = avic_apicv_post_state_restore,
4911         .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
4912
4913         .get_exit_info = svm_get_exit_info,
4914
4915         .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4916
4917         .has_wbinvd_exit = svm_has_wbinvd_exit,
4918
4919         .get_l2_tsc_offset = svm_get_l2_tsc_offset,
4920         .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
4921         .write_tsc_offset = svm_write_tsc_offset,
4922         .write_tsc_multiplier = svm_write_tsc_multiplier,
4923
4924         .load_mmu_pgd = svm_load_mmu_pgd,
4925
4926         .check_intercept = svm_check_intercept,
4927         .handle_exit_irqoff = svm_handle_exit_irqoff,
4928
4929         .request_immediate_exit = __kvm_request_immediate_exit,
4930
4931         .sched_in = svm_sched_in,
4932
4933         .nested_ops = &svm_nested_ops,
4934
4935         .deliver_interrupt = svm_deliver_interrupt,
4936         .pi_update_irte = avic_pi_update_irte,
4937         .setup_mce = svm_setup_mce,
4938
4939 #ifdef CONFIG_KVM_SMM
4940         .smi_allowed = svm_smi_allowed,
4941         .enter_smm = svm_enter_smm,
4942         .leave_smm = svm_leave_smm,
4943         .enable_smi_window = svm_enable_smi_window,
4944 #endif
4945
4946         .mem_enc_ioctl = sev_mem_enc_ioctl,
4947         .mem_enc_register_region = sev_mem_enc_register_region,
4948         .mem_enc_unregister_region = sev_mem_enc_unregister_region,
4949         .guest_memory_reclaimed = sev_guest_memory_reclaimed,
4950
4951         .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
4952         .vm_move_enc_context_from = sev_vm_move_enc_context_from,
4953
4954         .can_emulate_instruction = svm_can_emulate_instruction,
4955
4956         .apic_init_signal_blocked = svm_apic_init_signal_blocked,
4957
4958         .msr_filter_changed = svm_msr_filter_changed,
4959         .complete_emulated_msr = svm_complete_emulated_msr,
4960
4961         .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
4962         .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
4963 };
4964
4965 /*
4966  * The default MMIO mask is a single bit (excluding the present bit),
4967  * which could conflict with the memory encryption bit. Check for
4968  * memory encryption support and override the default MMIO mask if
4969  * memory encryption is enabled.
4970  */
4971 static __init void svm_adjust_mmio_mask(void)
4972 {
4973         unsigned int enc_bit, mask_bit;
4974         u64 msr, mask;
4975
4976         /* If there is no memory encryption support, use existing mask */
4977         if (cpuid_eax(0x80000000) < 0x8000001f)
4978                 return;
4979
4980         /* If memory encryption is not enabled, use existing mask */
4981         rdmsrl(MSR_AMD64_SYSCFG, msr);
4982         if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
4983                 return;
4984
4985         enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
4986         mask_bit = boot_cpu_data.x86_phys_bits;
4987
4988         /* Increment the mask bit if it is the same as the encryption bit */
4989         if (enc_bit == mask_bit)
4990                 mask_bit++;
4991
4992         /*
4993          * If the mask bit location is below 52, then some bits above the
4994          * physical addressing limit will always be reserved, so use the
4995          * rsvd_bits() function to generate the mask. This mask, along with
4996          * the present bit, will be used to generate a page fault with
4997          * PFER.RSV = 1.
4998          *
4999          * If the mask bit location is 52 (or above), then clear the mask.
5000          */
5001         mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
5002
5003         kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
5004 }
5005
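/*
 * Adjust KVM's supported CPU caps: nested SVM features are advertised only
 * when the "nested" module param is set and the underlying feature is usable,
 * and the PMU and SEV related leaves are trimmed to what the host provides.
 */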
5006 static __init void svm_set_cpu_caps(void)
5007 {
5008         kvm_set_cpu_caps();
5009
5010         kvm_caps.supported_perf_cap = 0;
5011         kvm_caps.supported_xss = 0;
5012
5013         /* CPUID 0x80000001 and 0x8000000A (SVM features) */
5014         if (nested) {
5015                 kvm_cpu_cap_set(X86_FEATURE_SVM);
5016                 kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
5017
5018                 if (nrips)
5019                         kvm_cpu_cap_set(X86_FEATURE_NRIPS);
5020
5021                 if (npt_enabled)
5022                         kvm_cpu_cap_set(X86_FEATURE_NPT);
5023
5024                 if (tsc_scaling)
5025                         kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
5026
5027                 if (vls)
5028                         kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
5029                 if (lbrv)
5030                         kvm_cpu_cap_set(X86_FEATURE_LBRV);
5031
5032                 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
5033                         kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
5034
5035                 if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
5036                         kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
5037
5038                 if (vgif)
5039                         kvm_cpu_cap_set(X86_FEATURE_VGIF);
5040
5041                 if (vnmi)
5042                         kvm_cpu_cap_set(X86_FEATURE_VNMI);
5043
5044                 /* Nested VM can receive #VMEXIT instead of triggering #GP */
5045                 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
5046         }
5047
5048         /* CPUID 0x80000008 */
5049         if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
5050             boot_cpu_has(X86_FEATURE_AMD_SSBD))
5051                 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
5052
5053         if (enable_pmu) {
5054                 /*
5055                  * Enumerate support for PERFCTR_CORE if and only if KVM has
5056                  * access to enough counters to virtualize "core" support,
5057                  * otherwise limit vPMU support to the legacy number of counters.
5058                  */
5059                 if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
5060                         kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
5061                                                           kvm_pmu_cap.num_counters_gp);
5062                 else
5063                         kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);
5064
5065                 if (kvm_pmu_cap.version != 2 ||
5066                     !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
5067                         kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
5068         }
5069
5070         /* CPUID 0x8000001F (SME/SEV features) */
5071         sev_set_cpu_caps();
5072 }
5073
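/*
 * One-time setup at module load: allocate the I/O permission bitmap,
 * initialize the MSR permission map offsets, then probe the optional SVM
 * features (NPT, NRIPS, TSC scaling, pause filtering, AVIC, VLS, vGIF, vNMI,
 * LBRV, SEV) and disable any that the host lacks or that were turned off via
 * module parameters.
 */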
5074 static __init int svm_hardware_setup(void)
5075 {
5076         int cpu;
5077         struct page *iopm_pages;
5078         void *iopm_va;
5079         int r;
5080         unsigned int order = get_order(IOPM_SIZE);
5081
5082         /*
5083          * NX is required for shadow paging and for NPT if the NX huge pages
5084          * mitigation is enabled.
5085          */
5086         if (!boot_cpu_has(X86_FEATURE_NX)) {
5087                 pr_err_ratelimited("NX (Execute Disable) not supported\n");
5088                 return -EOPNOTSUPP;
5089         }
5090         kvm_enable_efer_bits(EFER_NX);
5091
5092         iopm_pages = alloc_pages(GFP_KERNEL, order);
5093
5094         if (!iopm_pages)
5095                 return -ENOMEM;
5096
5097         iopm_va = page_address(iopm_pages);
5098         memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
5099         iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
5100
5101         init_msrpm_offsets();
5102
5103         kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
5104                                      XFEATURE_MASK_BNDCSR);
5105
5106         if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
5107                 kvm_enable_efer_bits(EFER_FFXSR);
5108
5109         if (tsc_scaling) {
5110                 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
5111                         tsc_scaling = false;
5112                 } else {
5113                         pr_info("TSC scaling supported\n");
5114                         kvm_caps.has_tsc_control = true;
5115                 }
5116         }
5117         kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
5118         kvm_caps.tsc_scaling_ratio_frac_bits = 32;
5119
5120         tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
5121
5122         if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
5123                 kvm_enable_efer_bits(EFER_AUTOIBRS);
5124
5125         /* Check for pause filtering support */
5126         if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
5127                 pause_filter_count = 0;
5128                 pause_filter_thresh = 0;
5129         } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
5130                 pause_filter_thresh = 0;
5131         }
5132
5133         if (nested) {
5134                 pr_info("Nested Virtualization enabled\n");
5135                 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
5136         }
5137
5138         /*
5139          * KVM's MMU doesn't support using 2-level paging for itself, and thus
5140          * NPT isn't supported if the host is using 2-level paging since host
5141          * CR4 is unchanged on VMRUN.
5142          */
5143         if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
5144                 npt_enabled = false;
5145
5146         if (!boot_cpu_has(X86_FEATURE_NPT))
5147                 npt_enabled = false;
5148
5149         /* Force VM NPT level equal to the host's paging level */
5150         kvm_configure_mmu(npt_enabled, get_npt_level(),
5151                           get_npt_level(), PG_LEVEL_1G);
5152         pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
5153
5154         /* Setup shadow_me_value and shadow_me_mask */
5155         kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
5156
5157         svm_adjust_mmio_mask();
5158
5159         nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
5160
5161         /*
5162          * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
5163          * may be modified by svm_adjust_mmio_mask()), as well as nrips.
5164          */
5165         sev_hardware_setup();
5166
5167         svm_hv_hardware_setup();
5168
5169         for_each_possible_cpu(cpu) {
5170                 r = svm_cpu_init(cpu);
5171                 if (r)
5172                         goto err;
5173         }
5174
5175         enable_apicv = avic = avic && avic_hardware_setup();
5176
5177         if (!enable_apicv) {
5178                 svm_x86_ops.vcpu_blocking = NULL;
5179                 svm_x86_ops.vcpu_unblocking = NULL;
5180                 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
5181         } else if (!x2avic_enabled) {
5182                 svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
5183         }
5184
5185         if (vls) {
5186                 if (!npt_enabled ||
5187                     !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
5188                     !IS_ENABLED(CONFIG_X86_64)) {
5189                         vls = false;
5190                 } else {
5191                         pr_info("Virtual VMLOAD VMSAVE supported\n");
5192                 }
5193         }
5194
5195         if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
5196                 svm_gp_erratum_intercept = false;
5197
5198         if (vgif) {
5199                 if (!boot_cpu_has(X86_FEATURE_VGIF))
5200                         vgif = false;
5201                 else
5202                         pr_info("Virtual GIF supported\n");
5203         }
5204
5205         vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
5206         if (vnmi)
5207                 pr_info("Virtual NMI enabled\n");
5208
5209         if (!vnmi) {
5210                 svm_x86_ops.is_vnmi_pending = NULL;
5211                 svm_x86_ops.set_vnmi_pending = NULL;
5212         }
5213
5214
5215         if (lbrv) {
5216                 if (!boot_cpu_has(X86_FEATURE_LBRV))
5217                         lbrv = false;
5218                 else
5219                         pr_info("LBR virtualization supported\n");
5220         }
5221
5222         if (!enable_pmu)
5223                 pr_info("PMU virtualization is disabled\n");
5224
5225         svm_set_cpu_caps();
5226
5227         /*
5228          * It seems that on AMD processors the PTE's accessed bit is
5229          * set by the CPU hardware before the NPF vmexit.  This is not
5230          * the expected behaviour, and our tests fail because of it.
5231          * The workaround here is to disable support for
5232          * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.  In that
5233          * case, userspace can query the KVM_CAP_SMALLER_MAXPHYADDR
5234          * extension to learn whether such support exists and decide how
5235          * to handle it.
5236          * If future AMD CPU models change the behaviour described
5237          * above, this variable can be changed accordingly.
5238          */
5239
5240         allow_smaller_maxphyaddr = !npt_enabled;
5241
5242         return 0;
5243
5244 err:
5245         svm_hardware_unsetup();
5246         return r;
5247 }
5248
5249
5250 static struct kvm_x86_init_ops svm_init_ops __initdata = {
5251         .hardware_setup = svm_hardware_setup,
5252
5253         .runtime_ops = &svm_x86_ops,
5254         .pmu_ops = &amd_pmu_ops,
5255 };
5256
5257 static int __init svm_init(void)
5258 {
5259         int r;
5260
5261         __unused_size_checks();
5262
5263         if (!kvm_is_svm_supported())
5264                 return -EOPNOTSUPP;
5265
5266         r = kvm_x86_vendor_init(&svm_init_ops);
5267         if (r)
5268                 return r;
5269
5270         /*
5271          * Common KVM initialization _must_ come last; after this point,
5272          * /dev/kvm is exposed to userspace!
5273          */
5274         r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
5275                      THIS_MODULE);
5276         if (r)
5277                 goto err_kvm_init;
5278
5279         return 0;
5280
5281 err_kvm_init:
5282         kvm_x86_vendor_exit();
5283         return r;
5284 }
5285
5286 static void __exit svm_exit(void)
5287 {
5288         kvm_exit();
5289         kvm_x86_vendor_exit();
5290 }
5291
5292 module_init(svm_init)
5293 module_exit(svm_exit)