arch/x86/kvm/svm/svm.c

   1 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   2
   3 #include <linux/kvm_host.h>
   4
   5 #include "irq.h"
   6 #include "mmu.h"
   7 #include "kvm_cache_regs.h"
   8 #include "x86.h"
   9 #include "smm.h"
  10 #include "cpuid.h"
  11 #include "pmu.h"
  12
  13 #include <linux/module.h>
  14 #include <linux/mod_devicetable.h>
  15 #include <linux/kernel.h>
  16 #include <linux/vmalloc.h>
  17 #include <linux/highmem.h>
  18 #include <linux/amd-iommu.h>
  19 #include <linux/sched.h>
  20 #include <linux/trace_events.h>
  21 #include <linux/slab.h>
  22 #include <linux/hashtable.h>
  23 #include <linux/objtool.h>
  24 #include <linux/psp-sev.h>
  25 #include <linux/file.h>
  26 #include <linux/pagemap.h>
  27 #include <linux/swap.h>
  28 #include <linux/rwsem.h>
  29 #include <linux/cc_platform.h>
  30 #include <linux/smp.h>
  31
  32 #include <asm/apic.h>
  33 #include <asm/perf_event.h>
  34 #include <asm/tlbflush.h>
  35 #include <asm/desc.h>
  36 #include <asm/debugreg.h>
  37 #include <asm/kvm_para.h>
  38 #include <asm/irq_remapping.h>
  39 #include <asm/spec-ctrl.h>
  40 #include <asm/cpu_device_id.h>
  41 #include <asm/traps.h>
  42 #include <asm/reboot.h>
  43 #include <asm/fpu/api.h>
  44
  45 #include <asm/virtext.h>
  46
  47 #include <trace/events/ipi.h>
  48
  49 #include "trace.h"
  50
  51 #include "svm.h"
  52 #include "svm_ops.h"
  53
  54 #include "kvm_onhyperv.h"
  55 #include "svm_onhyperv.h"
  56
  57 MODULE_AUTHOR("Qumranet");
  58 MODULE_LICENSE("GPL");
  59
  60 #ifdef MODULE
  61 static const struct x86_cpu_id svm_cpu_id[] = {
  62         X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
  63         {}
  64 };
  65 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  66 #endif
  67
  68 #define SEG_TYPE_LDT 2
  69 #define SEG_TYPE_BUSY_TSS16 3
  70
  71 static bool erratum_383_found __read_mostly;
  72
  73 u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
  74
  75 /*
  76  * Set osvw_len to higher value when updated Revision Guides
  77  * are published and we know what the new status bits are
  78  */
  79 static uint64_t osvw_len = 4, osvw_status;
  80
  81 static DEFINE_PER_CPU(u64, current_tsc_ratio);
  82
  83 #define X2APIC_MSR(x)   (APIC_BASE_MSR + (x >> 4))
  84
  85 static const struct svm_direct_access_msrs {
  86         u32 index;   /* Index of the MSR */
  87         bool always; /* True if intercept is initially cleared */
  88 } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
  89         { .index = MSR_STAR,                            .always = true  },
  90         { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
  91         { .index = MSR_IA32_SYSENTER_EIP,               .always = false },
  92         { .index = MSR_IA32_SYSENTER_ESP,               .always = false },
  93 #ifdef CONFIG_X86_64
  94         { .index = MSR_GS_BASE,                         .always = true  },
  95         { .index = MSR_FS_BASE,                         .always = true  },
  96         { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
  97         { .index = MSR_LSTAR,                           .always = true  },
  98         { .index = MSR_CSTAR,                           .always = true  },
  99         { .index = MSR_SYSCALL_MASK,                    .always = true  },
 100 #endif
 101         { .index = MSR_IA32_SPEC_CTRL,                  .always = false },
 102         { .index = MSR_IA32_PRED_CMD,                   .always = false },
 103         { .index = MSR_IA32_FLUSH_CMD,                  .always = false },
 104         { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
 105         { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
 106         { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
 107         { .index = MSR_IA32_LASTINTTOIP,                .always = false },
 108         { .index = MSR_EFER,                            .always = false },
 109         { .index = MSR_IA32_CR_PAT,                     .always = false },
 110         { .index = MSR_AMD64_SEV_ES_GHCB,               .always = true  },
 111         { .index = MSR_TSC_AUX,                         .always = false },
 112         { .index = X2APIC_MSR(APIC_ID),                 .always = false },
 113         { .index = X2APIC_MSR(APIC_LVR),                .always = false },
 114         { .index = X2APIC_MSR(APIC_TASKPRI),            .always = false },
 115         { .index = X2APIC_MSR(APIC_ARBPRI),             .always = false },
 116         { .index = X2APIC_MSR(APIC_PROCPRI),            .always = false },
 117         { .index = X2APIC_MSR(APIC_EOI),                .always = false },
 118         { .index = X2APIC_MSR(APIC_RRR),                .always = false },
 119         { .index = X2APIC_MSR(APIC_LDR),                .always = false },
 120         { .index = X2APIC_MSR(APIC_DFR),                .always = false },
 121         { .index = X2APIC_MSR(APIC_SPIV),               .always = false },
 122         { .index = X2APIC_MSR(APIC_ISR),                .always = false },
 123         { .index = X2APIC_MSR(APIC_TMR),                .always = false },
 124         { .index = X2APIC_MSR(APIC_IRR),                .always = false },
 125         { .index = X2APIC_MSR(APIC_ESR),                .always = false },
 126         { .index = X2APIC_MSR(APIC_ICR),                .always = false },
 127         { .index = X2APIC_MSR(APIC_ICR2),               .always = false },
 128
 129         /*
 130          * Note:
 131          * AMD does not virtualize APIC TSC-deadline timer mode, but it is
 132          * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
 133          * the AVIC hardware would generate GP fault. Therefore, always
 134          * intercept the MSR 0x832, and do not setup direct_access_msr.
 135          */
 136         { .index = X2APIC_MSR(APIC_LVTTHMR),            .always = false },
 137         { .index = X2APIC_MSR(APIC_LVTPC),              .always = false },
 138         { .index = X2APIC_MSR(APIC_LVT0),               .always = false },
 139         { .index = X2APIC_MSR(APIC_LVT1),               .always = false },
 140         { .index = X2APIC_MSR(APIC_LVTERR),             .always = false },
 141         { .index = X2APIC_MSR(APIC_TMICT),              .always = false },
 142         { .index = X2APIC_MSR(APIC_TMCCT),              .always = false },
 143         { .index = X2APIC_MSR(APIC_TDCR),               .always = false },
 144         { .index = MSR_INVALID,                         .always = false },
 145 };
 146
 147 /*
 148  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 149  * pause_filter_count: On processors that support Pause filtering(indicated
 150  *      by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 151  *      count value. On VMRUN this value is loaded into an internal counter.
 152  *      Each time a pause instruction is executed, this counter is decremented
 153  *      until it reaches zero at which time a #VMEXIT is generated if pause
 154  *      intercept is enabled. Refer to  AMD APM Vol 2 Section 15.14.4 Pause
 155  *      Intercept Filtering for more details.
 156  *      This also indicate if ple logic enabled.
 157  *
 158  * pause_filter_thresh: In addition, some processor families support advanced
 159  *      pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
 160  *      the amount of time a guest is allowed to execute in a pause loop.
 161  *      In this mode, a 16-bit pause filter threshold field is added in the
 162  *      VMCB. The threshold value is a cycle count that is used to reset the
 163  *      pause counter. As with simple pause filtering, VMRUN loads the pause
 164  *      count value from VMCB into an internal counter. Then, on each pause
 165  *      instruction the hardware checks the elapsed number of cycles since
 166  *      the most recent pause instruction against the pause filter threshold.
 167  *      If the elapsed cycle count is greater than the pause filter threshold,
 168  *      then the internal pause count is reloaded from the VMCB and execution
 169  *      continues. If the elapsed cycle count is less than the pause filter
 170  *      threshold, then the internal pause count is decremented. If the count
 171  *      value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 172  *      triggered. If advanced pause filtering is supported and pause filter
 173  *      threshold field is set to zero, the filter will operate in the simpler,
 174  *      count only mode.
 175  */
 176
 177 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
 178 module_param(pause_filter_thresh, ushort, 0444);
 179
 180 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
 181 module_param(pause_filter_count, ushort, 0444);
 182
 183 /* Default doubles per-vcpu window every exit. */
 184 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 185 module_param(pause_filter_count_grow, ushort, 0444);
 186
 187 /* Default resets per-vcpu window every exit to pause_filter_count. */
 188 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 189 module_param(pause_filter_count_shrink, ushort, 0444);
 190
 191 /* Default is to compute the maximum so we can never overflow. */
 192 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
 193 module_param(pause_filter_count_max, ushort, 0444);
 194
 195 /*
 196  * Use nested page tables by default.  Note, NPT may get forced off by
 197  * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 198  */
 199 bool npt_enabled = true;
 200 module_param_named(npt, npt_enabled, bool, 0444);
 201
 202 /* allow nested virtualization in KVM/SVM */
 203 static int nested = true;
 204 module_param(nested, int, S_IRUGO);
 205
 206 /* enable/disable Next RIP Save */
 207 static int nrips = true;
 208 module_param(nrips, int, 0444);
 209
 210 /* enable/disable Virtual VMLOAD VMSAVE */
 211 static int vls = true;
 212 module_param(vls, int, 0444);
 213
 214 /* enable/disable Virtual GIF */
 215 int vgif = true;
 216 module_param(vgif, int, 0444);
 217
 218 /* enable/disable LBR virtualization */
 219 static int lbrv = true;
 220 module_param(lbrv, int, 0444);
 221
 222 static int tsc_scaling = true;
 223 module_param(tsc_scaling, int, 0444);
 224
 225 /*
 226  * enable / disable AVIC.  Because the defaults differ for APICv
 227  * support between VMX and SVM we cannot use module_param_named.
 228  */
 229 static bool avic;
 230 module_param(avic, bool, 0444);
 231
 232 bool __read_mostly dump_invalid_vmcb;
 233 module_param(dump_invalid_vmcb, bool, 0644);
 234
 235
 236 bool intercept_smi = true;
 237 module_param(intercept_smi, bool, 0444);
 238
 239 bool vnmi = true;
 240 module_param(vnmi, bool, 0444);
 241
 242 static bool svm_gp_erratum_intercept = true;
 243
 244 static u8 rsm_ins_bytes[] = "\x0f\xaa";
 245
 246 static unsigned long iopm_base;
 247
 248 DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
 249
 250 /*
 251  * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 252  * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 253  *
 254  * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 255  * defer the restoration of TSC_AUX until the CPU returns to userspace.
 256  */
 257 static int tsc_aux_uret_slot __read_mostly = -1;
 258
 259 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 260
 261 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 262 #define MSRS_RANGE_SIZE 2048
 263 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 264
 265 u32 svm_msrpm_offset(u32 msr)
 266 {
 267         u32 offset;
 268         int i;
 269
 270         for (i = 0; i < NUM_MSR_MAPS; i++) {
 271                 if (msr < msrpm_ranges[i] ||
 272                     msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 273                         continue;
 274
 275                 offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 276                 offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 277
 278                 /* Now we have the u8 offset - but need the u32 offset */
 279                 return offset / 4;
 280         }
 281
 282         /* MSR not in any range */
 283         return MSR_INVALID;
 284 }
 285
 286 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
 287
 288 static int get_npt_level(void)
 289 {
 290 #ifdef CONFIG_X86_64
 291         return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 292 #else
 293         return PT32E_ROOT_LEVEL;
 294 #endif
 295 }
 296
 297 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 298 {
 299         struct vcpu_svm *svm = to_svm(vcpu);
 300         u64 old_efer = vcpu->arch.efer;
 301         vcpu->arch.efer = efer;
 302
 303         if (!npt_enabled) {
 304                 /* Shadow paging assumes NX to be available.  */
 305                 efer |= EFER_NX;
 306
 307                 if (!(efer & EFER_LMA))
 308                         efer &= ~EFER_LME;
 309         }
 310
 311         if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
 312                 if (!(efer & EFER_SVME)) {
 313                         svm_leave_nested(vcpu);
 314                         svm_set_gif(svm, true);
 315                         /* #GP intercept is still needed for vmware backdoor */
 316                         if (!enable_vmware_backdoor)
 317                                 clr_exception_intercept(svm, GP_VECTOR);
 318
 319                         /*
 320                          * Free the nested guest state, unless we are in SMM.
 321                          * In this case we will return to the nested guest
 322                          * as soon as we leave SMM.
 323                          */
 324                         if (!is_smm(vcpu))
 325                                 svm_free_nested(svm);
 326
 327                 } else {
 328                         int ret = svm_allocate_nested(svm);
 329
 330                         if (ret) {
 331                                 vcpu->arch.efer = old_efer;
 332                                 return ret;
 333                         }
 334
 335                         /*
 336                          * Never intercept #GP for SEV guests, KVM can't
 337                          * decrypt guest memory to workaround the erratum.
 338                          */
 339                         if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
 340                                 set_exception_intercept(svm, GP_VECTOR);
 341                 }
 342         }
 343
 344         svm->vmcb->save.efer = efer | EFER_SVME;
 345         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
 346         return 0;
 347 }
 348
 349 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 350 {
 351         struct vcpu_svm *svm = to_svm(vcpu);
 352         u32 ret = 0;
 353
 354         if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 355                 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 356         return ret;
 357 }
 358
 359 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 360 {
 361         struct vcpu_svm *svm = to_svm(vcpu);
 362
 363         if (mask == 0)
 364                 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 365         else
 366                 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 367
 368 }
 369
 370 static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
 371                                            bool commit_side_effects)
 372 {
 373         struct vcpu_svm *svm = to_svm(vcpu);
 374         unsigned long old_rflags;
 375
 376         /*
 377          * SEV-ES does not expose the next RIP. The RIP update is controlled by
 378          * the type of exit and the #VC handler in the guest.
 379          */
 380         if (sev_es_guest(vcpu->kvm))
 381                 goto done;
 382
 383         if (nrips && svm->vmcb->control.next_rip != 0) {
 384                 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
 385                 svm->next_rip = svm->vmcb->control.next_rip;
 386         }
 387
 388         if (!svm->next_rip) {
 389                 if (unlikely(!commit_side_effects))
 390                         old_rflags = svm->vmcb->save.rflags;
 391
 392                 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
 393                         return 0;
 394
 395                 if (unlikely(!commit_side_effects))
 396                         svm->vmcb->save.rflags = old_rflags;
 397         } else {
 398                 kvm_rip_write(vcpu, svm->next_rip);
 399         }
 400
 401 done:
 402         if (likely(commit_side_effects))
 403                 svm_set_interrupt_shadow(vcpu, 0);
 404
 405         return 1;
 406 }
 407
 408 static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 409 {
 410         return __svm_skip_emulated_instruction(vcpu, true);
 411 }
 412
 413 static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
 414 {
 415         unsigned long rip, old_rip = kvm_rip_read(vcpu);
 416         struct vcpu_svm *svm = to_svm(vcpu);
 417
 418         /*
 419          * Due to architectural shortcomings, the CPU doesn't always provide
 420          * NextRIP, e.g. if KVM intercepted an exception that occurred while
 421          * the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
 422          * the instruction even if NextRIP is supported to acquire the next
 423          * RIP so that it can be shoved into the NextRIP field, otherwise
 424          * hardware will fail to advance guest RIP during event injection.
 425          * Drop the exception/interrupt if emulation fails and effectively
 426          * retry the instruction, it's the least awful option.  If NRIPS is
 427          * in use, the skip must not commit any side effects such as clearing
 428          * the interrupt shadow or RFLAGS.RF.
 429          */
 430         if (!__svm_skip_emulated_instruction(vcpu, !nrips))
 431                 return -EIO;
 432
 433         rip = kvm_rip_read(vcpu);
 434
 435         /*
 436          * Save the injection information, even when using next_rip, as the
 437          * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
 438          * doesn't complete due to a VM-Exit occurring while the CPU is
 439          * vectoring the event.   Decoding the instruction isn't guaranteed to
 440          * work as there may be no backing instruction, e.g. if the event is
 441          * being injected by L1 for L2, or if the guest is patching INT3 into
 442          * a different instruction.
 443          */
 444         svm->soft_int_injected = true;
 445         svm->soft_int_csbase = svm->vmcb->save.cs.base;
 446         svm->soft_int_old_rip = old_rip;
 447         svm->soft_int_next_rip = rip;
 448
 449         if (nrips)
 450                 kvm_rip_write(vcpu, old_rip);
 451
 452         if (static_cpu_has(X86_FEATURE_NRIPS))
 453                 svm->vmcb->control.next_rip = rip;
 454
 455         return 0;
 456 }
 457
 458 static void svm_inject_exception(struct kvm_vcpu *vcpu)
 459 {
 460         struct kvm_queued_exception *ex = &vcpu->arch.exception;
 461         struct vcpu_svm *svm = to_svm(vcpu);
 462
 463         kvm_deliver_exception_payload(vcpu, ex);
 464
 465         if (kvm_exception_is_soft(ex->vector) &&
 466             svm_update_soft_interrupt_rip(vcpu))
 467                 return;
 468
 469         svm->vmcb->control.event_inj = ex->vector
 470                 | SVM_EVTINJ_VALID
 471                 | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 472                 | SVM_EVTINJ_TYPE_EXEPT;
 473         svm->vmcb->control.event_inj_err = ex->error_code;
 474 }
 475
 476 static void svm_init_erratum_383(void)
 477 {
 478         u32 low, high;
 479         int err;
 480         u64 val;
 481
 482         if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 483                 return;
 484
 485         /* Use _safe variants to not break nested virtualization */
 486         val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 487         if (err)
 488                 return;
 489
 490         val |= (1ULL << 47);
 491
 492         low  = lower_32_bits(val);
 493         high = upper_32_bits(val);
 494
 495         native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 496
 497         erratum_383_found = true;
 498 }
 499
 500 static void svm_init_osvw(struct kvm_vcpu *vcpu)
 501 {
 502         /*
 503          * Guests should see errata 400 and 415 as fixed (assuming that
 504          * HLT and IO instructions are intercepted).
 505          */
 506         vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 507         vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 508
 509         /*
 510          * By increasing VCPU's osvw.length to 3 we are telling the guest that
 511          * all osvw.status bits inside that length, including bit 0 (which is
 512          * reserved for erratum 298), are valid. However, if host processor's
 513          * osvw_len is 0 then osvw_status[0] carries no information. We need to
 514          * be conservative here and therefore we tell the guest that erratum 298
 515          * is present (because we really don't know).
 516          */
 517         if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 518                 vcpu->arch.osvw.status |= 1;
 519 }
 520
 521 static bool __kvm_is_svm_supported(void)
 522 {
 523         int cpu = smp_processor_id();
 524         struct cpuinfo_x86 *c = &cpu_data(cpu);
 525
 526         u64 vm_cr;
 527
 528         if (c->x86_vendor != X86_VENDOR_AMD &&
 529             c->x86_vendor != X86_VENDOR_HYGON) {
 530                 pr_err("CPU %d isn't AMD or Hygon\n", cpu);
 531                 return false;
 532         }
 533
 534         if (!cpu_has(c, X86_FEATURE_SVM)) {
 535                 pr_err("SVM not supported by CPU %d\n", cpu);
 536                 return false;
 537         }
 538
 539         if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
 540                 pr_info("KVM is unsupported when running as an SEV guest\n");
 541                 return false;
 542         }
 543
 544         rdmsrl(MSR_VM_CR, vm_cr);
 545         if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
 546                 pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
 547                 return false;
 548         }
 549
 550         return true;
 551 }
 552
 553 static bool kvm_is_svm_supported(void)
 554 {
 555         bool supported;
 556
 557         migrate_disable();
 558         supported = __kvm_is_svm_supported();
 559         migrate_enable();
 560
 561         return supported;
 562 }
 563
 564 static int svm_check_processor_compat(void)
 565 {
 566         if (!__kvm_is_svm_supported())
 567                 return -EIO;
 568
 569         return 0;
 570 }
 571
 572 void __svm_write_tsc_multiplier(u64 multiplier)
 573 {
 574         preempt_disable();
 575
 576         if (multiplier == __this_cpu_read(current_tsc_ratio))
 577                 goto out;
 578
 579         wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
 580         __this_cpu_write(current_tsc_ratio, multiplier);
 581 out:
 582         preempt_enable();
 583 }
 584
 585 static void svm_emergency_disable(void)
 586 {
 587         cpu_svm_disable();
 588 }
 589
 590 static void svm_hardware_disable(void)
 591 {
 592         /* Make sure we clean up behind us */
 593         if (tsc_scaling)
 594                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 595
 596         cpu_svm_disable();
 597
 598         amd_pmu_disable_virt();
 599 }
 600
 601 static int svm_hardware_enable(void)
 602 {
 603
 604         struct svm_cpu_data *sd;
 605         uint64_t efer;
 606         int me = raw_smp_processor_id();
 607
 608         rdmsrl(MSR_EFER, efer);
 609         if (efer & EFER_SVME)
 610                 return -EBUSY;
 611
 612         sd = per_cpu_ptr(&svm_data, me);
 613         sd->asid_generation = 1;
 614         sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 615         sd->next_asid = sd->max_asid + 1;
 616         sd->min_asid = max_sev_asid + 1;
 617
 618         wrmsrl(MSR_EFER, efer | EFER_SVME);
 619
 620         wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
 621
 622         if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 623                 /*
 624                  * Set the default value, even if we don't use TSC scaling
 625                  * to avoid having stale value in the msr
 626                  */
 627                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 628         }
 629
 630
 631         /*
 632          * Get OSVW bits.
 633          *
 634          * Note that it is possible to have a system with mixed processor
 635          * revisions and therefore different OSVW bits. If bits are not the same
 636          * on different processors then choose the worst case (i.e. if erratum
 637          * is present on one processor and not on another then assume that the
 638          * erratum is present everywhere).
 639          */
 640         if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 641                 uint64_t len, status = 0;
 642                 int err;
 643
 644                 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 645                 if (!err)
 646                         status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 647                                                       &err);
 648
 649                 if (err)
 650                         osvw_status = osvw_len = 0;
 651                 else {
 652                         if (len < osvw_len)
 653                                 osvw_len = len;
 654                         osvw_status |= status;
 655                         osvw_status &= (1ULL << osvw_len) - 1;
 656                 }
 657         } else
 658                 osvw_status = osvw_len = 0;
 659
 660         svm_init_erratum_383();
 661
 662         amd_pmu_enable_virt();
 663
 664         return 0;
 665 }
 666
 667 static void svm_cpu_uninit(int cpu)
 668 {
 669         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 670
 671         if (!sd->save_area)
 672                 return;
 673
 674         kfree(sd->sev_vmcbs);
 675         __free_page(sd->save_area);
 676         sd->save_area_pa = 0;
 677         sd->save_area = NULL;
 678 }
 679
 680 static int svm_cpu_init(int cpu)
 681 {
 682         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 683         int ret = -ENOMEM;
 684
 685         memset(sd, 0, sizeof(struct svm_cpu_data));
 686         sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
 687         if (!sd->save_area)
 688                 return ret;
 689
 690         ret = sev_cpu_init(sd);
 691         if (ret)
 692                 goto free_save_area;
 693
 694         sd->save_area_pa = __sme_page_pa(sd->save_area);
 695         return 0;
 696
 697 free_save_area:
 698         __free_page(sd->save_area);
 699         sd->save_area = NULL;
 700         return ret;
 701
 702 }
 703
 704 static int direct_access_msr_slot(u32 msr)
 705 {
 706         u32 i;
 707
 708         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 709                 if (direct_access_msrs[i].index == msr)
 710                         return i;
 711
 712         return -ENOENT;
 713 }
 714
 715 static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
 716                                      int write)
 717 {
 718         struct vcpu_svm *svm = to_svm(vcpu);
 719         int slot = direct_access_msr_slot(msr);
 720
 721         if (slot == -ENOENT)
 722                 return;
 723
 724         /* Set the shadow bitmaps to the desired intercept states */
 725         if (read)
 726                 set_bit(slot, svm->shadow_msr_intercept.read);
 727         else
 728                 clear_bit(slot, svm->shadow_msr_intercept.read);
 729
 730         if (write)
 731                 set_bit(slot, svm->shadow_msr_intercept.write);
 732         else
 733                 clear_bit(slot, svm->shadow_msr_intercept.write);
 734 }
 735
 736 static bool valid_msr_intercept(u32 index)
 737 {
 738         return direct_access_msr_slot(index) != -ENOENT;
 739 }
 740
 741 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 742 {
 743         u8 bit_write;
 744         unsigned long tmp;
 745         u32 offset;
 746         u32 *msrpm;
 747
 748         /*
 749          * For non-nested case:
 750          * If the L01 MSR bitmap does not intercept the MSR, then we need to
 751          * save it.
 752          *
 753          * For nested case:
 754          * If the L02 MSR bitmap does not intercept the MSR, then we need to
 755          * save it.
 756          */
 757         msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
 758                                       to_svm(vcpu)->msrpm;
 759
 760         offset    = svm_msrpm_offset(msr);
 761         bit_write = 2 * (msr & 0x0f) + 1;
 762         tmp       = msrpm[offset];
 763
 764         BUG_ON(offset == MSR_INVALID);
 765
 766         return test_bit(bit_write, &tmp);
 767 }
 768
 769 static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
 770                                         u32 msr, int read, int write)
 771 {
 772         struct vcpu_svm *svm = to_svm(vcpu);
 773         u8 bit_read, bit_write;
 774         unsigned long tmp;
 775         u32 offset;
 776
 777         /*
 778          * If this warning triggers extend the direct_access_msrs list at the
 779          * beginning of the file
 780          */
 781         WARN_ON(!valid_msr_intercept(msr));
 782
 783         /* Enforce non allowed MSRs to trap */
 784         if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
 785                 read = 0;
 786
 787         if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
 788                 write = 0;
 789
 790         offset    = svm_msrpm_offset(msr);
 791         bit_read  = 2 * (msr & 0x0f);
 792         bit_write = 2 * (msr & 0x0f) + 1;
 793         tmp       = msrpm[offset];
 794
 795         BUG_ON(offset == MSR_INVALID);
 796
 797         read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 798         write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 799
 800         msrpm[offset] = tmp;
 801
 802         svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
 803         svm->nested.force_msr_bitmap_recalc = true;
 804 }
 805
 806 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
 807                           int read, int write)
 808 {
 809         set_shadow_msr_intercept(vcpu, msr, read, write);
 810         set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
 811 }
 812
 813 u32 *svm_vcpu_alloc_msrpm(void)
 814 {
 815         unsigned int order = get_order(MSRPM_SIZE);
 816         struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
 817         u32 *msrpm;
 818
 819         if (!pages)
 820                 return NULL;
 821
 822         msrpm = page_address(pages);
 823         memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
 824
 825         return msrpm;
 826 }
 827
 828 void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
 829 {
 830         int i;
 831
 832         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 833                 if (!direct_access_msrs[i].always)
 834                         continue;
 835                 set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
 836         }
 837 }
 838
 839 void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
 840 {
 841         int i;
 842
 843         if (intercept == svm->x2avic_msrs_intercepted)
 844                 return;
 845
 846         if (!x2avic_enabled ||
 847             !apic_x2apic_mode(svm->vcpu.arch.apic))
 848                 return;
 849
 850         for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
 851                 int index = direct_access_msrs[i].index;
 852
 853                 if ((index < APIC_BASE_MSR) ||
 854                     (index > APIC_BASE_MSR + 0xff))
 855                         continue;
 856                 set_msr_interception(&svm->vcpu, svm->msrpm, index,
 857                                      !intercept, !intercept);
 858         }
 859
 860         svm->x2avic_msrs_intercepted = intercept;
 861 }
 862
 863 void svm_vcpu_free_msrpm(u32 *msrpm)
 864 {
 865         __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
 866 }
 867
 868 static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
 869 {
 870         struct vcpu_svm *svm = to_svm(vcpu);
 871         u32 i;
 872
 873         /*
 874          * Set intercept permissions for all direct access MSRs again. They
 875          * will automatically get filtered through the MSR filter, so we are
 876          * back in sync after this.
 877          */
 878         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 879                 u32 msr = direct_access_msrs[i].index;
 880                 u32 read = test_bit(i, svm->shadow_msr_intercept.read);
 881                 u32 write = test_bit(i, svm->shadow_msr_intercept.write);
 882
 883                 set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
 884         }
 885 }
 886
 887 static void add_msr_offset(u32 offset)
 888 {
 889         int i;
 890
 891         for (i = 0; i < MSRPM_OFFSETS; ++i) {
 892
 893                 /* Offset already in list? */
 894                 if (msrpm_offsets[i] == offset)
 895                         return;
 896
 897                 /* Slot used by another offset? */
 898                 if (msrpm_offsets[i] != MSR_INVALID)
 899                         continue;
 900
 901                 /* Add offset to list */
 902                 msrpm_offsets[i] = offset;
 903
 904                 return;
 905         }
 906
 907         /*
 908          * If this BUG triggers the msrpm_offsets table has an overflow. Just
 909          * increase MSRPM_OFFSETS in this case.
 910          */
 911         BUG();
 912 }
 913
 914 static void init_msrpm_offsets(void)
 915 {
 916         int i;
 917
 918         memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 919
 920         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 921                 u32 offset;
 922
 923                 offset = svm_msrpm_offset(direct_access_msrs[i].index);
 924                 BUG_ON(offset == MSR_INVALID);
 925
 926                 add_msr_offset(offset);
 927         }
 928 }
 929
 930 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
 931 {
 932         to_vmcb->save.dbgctl            = from_vmcb->save.dbgctl;
 933         to_vmcb->save.br_from           = from_vmcb->save.br_from;
 934         to_vmcb->save.br_to             = from_vmcb->save.br_to;
 935         to_vmcb->save.last_excp_from    = from_vmcb->save.last_excp_from;
 936         to_vmcb->save.last_excp_to      = from_vmcb->save.last_excp_to;
 937
 938         vmcb_mark_dirty(to_vmcb, VMCB_LBR);
 939 }
 940
 941 static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
 942 {
 943         struct vcpu_svm *svm = to_svm(vcpu);
 944
 945         svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
 946         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
 947         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
 948         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
 949         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 950
 951         /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
 952         if (is_guest_mode(vcpu))
 953                 svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
 954 }
 955
 956 static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
 957 {
 958         struct vcpu_svm *svm = to_svm(vcpu);
 959
 960         svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
 961         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
 962         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
 963         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
 964         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 965
 966         /*
 967          * Move the LBR msrs back to the vmcb01 to avoid copying them
 968          * on nested guest entries.
 969          */
 970         if (is_guest_mode(vcpu))
 971                 svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
 972 }
 973
 974 static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
 975 {
 976         /*
 977          * If the LBR virtualization is disabled, the LBR msrs are always
 978          * kept in the vmcb01 to avoid copying them on nested guest entries.
 979          *
 980          * If nested, and the LBR virtualization is enabled/disabled, the msrs
 981          * are moved between the vmcb01 and vmcb02 as needed.
 982          */
 983         struct vmcb *vmcb =
 984                 (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
 985                         svm->vmcb : svm->vmcb01.ptr;
 986
 987         switch (index) {
 988         case MSR_IA32_DEBUGCTLMSR:
 989                 return vmcb->save.dbgctl;
 990         case MSR_IA32_LASTBRANCHFROMIP:
 991                 return vmcb->save.br_from;
 992         case MSR_IA32_LASTBRANCHTOIP:
 993                 return vmcb->save.br_to;
 994         case MSR_IA32_LASTINTFROMIP:
 995                 return vmcb->save.last_excp_from;
 996         case MSR_IA32_LASTINTTOIP:
 997                 return vmcb->save.last_excp_to;
 998         default:
 999                 KVM_BUG(false, svm->vcpu.kvm,
1000                         "%s: Unknown MSR 0x%x", __func__, index);
1001                 return 0;
1002         }
1003 }
1004
1005 void svm_update_lbrv(struct kvm_vcpu *vcpu)
1006 {
1007         struct vcpu_svm *svm = to_svm(vcpu);
1008
1009         bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
1010                                            DEBUGCTLMSR_LBR;
1011
1012         bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
1013                                       LBR_CTL_ENABLE_MASK);
1014
1015         if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
1016                 if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
1017                         enable_lbrv = true;
1018
1019         if (enable_lbrv == current_enable_lbrv)
1020                 return;
1021
1022         if (enable_lbrv)
1023                 svm_enable_lbrv(vcpu);
1024         else
1025                 svm_disable_lbrv(vcpu);
1026 }
1027
1028 void disable_nmi_singlestep(struct vcpu_svm *svm)
1029 {
1030         svm->nmi_singlestep = false;
1031
1032         if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
1033                 /* Clear our flags if they were not set by the guest */
1034                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1035                         svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
1036                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1037                         svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
1038         }
1039 }
1040
1041 static void grow_ple_window(struct kvm_vcpu *vcpu)
1042 {
1043         struct vcpu_svm *svm = to_svm(vcpu);
1044         struct vmcb_control_area *control = &svm->vmcb->control;
1045         int old = control->pause_filter_count;
1046
1047         if (kvm_pause_in_guest(vcpu->kvm))
1048                 return;
1049
1050         control->pause_filter_count = __grow_ple_window(old,
1051                                                         pause_filter_count,
1052                                                         pause_filter_count_grow,
1053                                                         pause_filter_count_max);
1054
1055         if (control->pause_filter_count != old) {
1056                 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1057                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1058                                             control->pause_filter_count, old);
1059         }
1060 }
1061
1062 static void shrink_ple_window(struct kvm_vcpu *vcpu)
1063 {
1064         struct vcpu_svm *svm = to_svm(vcpu);
1065         struct vmcb_control_area *control = &svm->vmcb->control;
1066         int old = control->pause_filter_count;
1067
1068         if (kvm_pause_in_guest(vcpu->kvm))
1069                 return;
1070
1071         control->pause_filter_count =
1072                                 __shrink_ple_window(old,
1073                                                     pause_filter_count,
1074                                                     pause_filter_count_shrink,
1075                                                     pause_filter_count);
1076         if (control->pause_filter_count != old) {
1077                 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1078                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1079                                             control->pause_filter_count, old);
1080         }
1081 }
1082
1083 static void svm_hardware_unsetup(void)
1084 {
1085         int cpu;
1086
1087         sev_hardware_unsetup();
1088
1089         for_each_possible_cpu(cpu)
1090                 svm_cpu_uninit(cpu);
1091
1092         __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
1093         get_order(IOPM_SIZE));
1094         iopm_base = 0;
1095 }
1096
1097 static void init_seg(struct vmcb_seg *seg)
1098 {
1099         seg->selector = 0;
1100         seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1101                       SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1102         seg->limit = 0xffff;
1103         seg->base = 0;
1104 }
1105
1106 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1107 {
1108         seg->selector = 0;
1109         seg->attrib = SVM_SELECTOR_P_MASK | type;
1110         seg->limit = 0xffff;
1111         seg->base = 0;
1112 }
1113
1114 static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1115 {
1116         struct vcpu_svm *svm = to_svm(vcpu);
1117
1118         return svm->nested.ctl.tsc_offset;
1119 }
1120
1121 static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1122 {
1123         struct vcpu_svm *svm = to_svm(vcpu);
1124
1125         return svm->tsc_ratio_msr;
1126 }
1127
1128 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1129 {
1130         struct vcpu_svm *svm = to_svm(vcpu);
1131
1132         svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1133         svm->vmcb->control.tsc_offset = offset;
1134         vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1135 }
1136
1137 static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1138 {
1139         __svm_write_tsc_multiplier(multiplier);
1140 }
1141
1142
1143 /* Evaluate instruction intercepts that depend on guest CPUID features. */
1144 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1145                                               struct vcpu_svm *svm)
1146 {
1147         /*
1148          * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1149          * roots, or if INVPCID is disabled in the guest to inject #UD.
1150          */
1151         if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1152                 if (!npt_enabled ||
1153                     !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1154                         svm_set_intercept(svm, INTERCEPT_INVPCID);
1155                 else
1156                         svm_clr_intercept(svm, INTERCEPT_INVPCID);
1157         }
1158
1159         if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1160                 if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1161                         svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1162                 else
1163                         svm_set_intercept(svm, INTERCEPT_RDTSCP);
1164         }
1165 }
1166
1167 static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1168 {
1169         struct vcpu_svm *svm = to_svm(vcpu);
1170
1171         if (guest_cpuid_is_intel(vcpu)) {
1172                 /*
1173                  * We must intercept SYSENTER_EIP and SYSENTER_ESP
1174                  * accesses because the processor only stores 32 bits.
1175                  * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1176                  */
1177                 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1178                 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1179                 svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1180
1181                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1182                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
1183
1184                 svm->v_vmload_vmsave_enabled = false;
1185         } else {
1186                 /*
1187                  * If hardware supports Virtual VMLOAD VMSAVE then enable it
1188                  * in VMCB and clear intercepts to avoid #VMEXIT.
1189                  */
1190                 if (vls) {
1191                         svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1192                         svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1193                         svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1194                 }
1195                 /* No need to intercept these MSRs */
1196                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1197                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1198         }
1199 }
1200
1201 static void init_vmcb(struct kvm_vcpu *vcpu)
1202 {
1203         struct vcpu_svm *svm = to_svm(vcpu);
1204         struct vmcb *vmcb = svm->vmcb01.ptr;
1205         struct vmcb_control_area *control = &vmcb->control;
1206         struct vmcb_save_area *save = &vmcb->save;
1207
1208         svm_set_intercept(svm, INTERCEPT_CR0_READ);
1209         svm_set_intercept(svm, INTERCEPT_CR3_READ);
1210         svm_set_intercept(svm, INTERCEPT_CR4_READ);
1211         svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1212         svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1213         svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
1214         if (!kvm_vcpu_apicv_active(vcpu))
1215                 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
1216
1217         set_dr_intercepts(svm);
1218
1219         set_exception_intercept(svm, PF_VECTOR);
1220         set_exception_intercept(svm, UD_VECTOR);
1221         set_exception_intercept(svm, MC_VECTOR);
1222         set_exception_intercept(svm, AC_VECTOR);
1223         set_exception_intercept(svm, DB_VECTOR);
1224         /*
1225          * Guest access to VMware backdoor ports could legitimately
1226          * trigger #GP because of TSS I/O permission bitmap.
1227          * We intercept those #GP and allow access to them anyway
1228          * as VMware does.  Don't intercept #GP for SEV guests as KVM can't
1229          * decrypt guest memory to decode the faulting instruction.
1230          */
1231         if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
1232                 set_exception_intercept(svm, GP_VECTOR);
1233
1234         svm_set_intercept(svm, INTERCEPT_INTR);
1235         svm_set_intercept(svm, INTERCEPT_NMI);
1236
1237         if (intercept_smi)
1238                 svm_set_intercept(svm, INTERCEPT_SMI);
1239
1240         svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1241         svm_set_intercept(svm, INTERCEPT_RDPMC);
1242         svm_set_intercept(svm, INTERCEPT_CPUID);
1243         svm_set_intercept(svm, INTERCEPT_INVD);
1244         svm_set_intercept(svm, INTERCEPT_INVLPG);
1245         svm_set_intercept(svm, INTERCEPT_INVLPGA);
1246         svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1247         svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1248         svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1249         svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1250         svm_set_intercept(svm, INTERCEPT_VMRUN);
1251         svm_set_intercept(svm, INTERCEPT_VMMCALL);
1252         svm_set_intercept(svm, INTERCEPT_VMLOAD);
1253         svm_set_intercept(svm, INTERCEPT_VMSAVE);
1254         svm_set_intercept(svm, INTERCEPT_STGI);
1255         svm_set_intercept(svm, INTERCEPT_CLGI);
1256         svm_set_intercept(svm, INTERCEPT_SKINIT);
1257         svm_set_intercept(svm, INTERCEPT_WBINVD);
1258         svm_set_intercept(svm, INTERCEPT_XSETBV);
1259         svm_set_intercept(svm, INTERCEPT_RDPRU);
1260         svm_set_intercept(svm, INTERCEPT_RSM);
1261
1262         if (!kvm_mwait_in_guest(vcpu->kvm)) {
1263                 svm_set_intercept(svm, INTERCEPT_MONITOR);
1264                 svm_set_intercept(svm, INTERCEPT_MWAIT);
1265         }
1266
1267         if (!kvm_hlt_in_guest(vcpu->kvm))
1268                 svm_set_intercept(svm, INTERCEPT_HLT);
1269
1270         control->iopm_base_pa = __sme_set(iopm_base);
1271         control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1272         control->int_ctl = V_INTR_MASKING_MASK;
1273
1274         init_seg(&save->es);
1275         init_seg(&save->ss);
1276         init_seg(&save->ds);
1277         init_seg(&save->fs);
1278         init_seg(&save->gs);
1279
1280         save->cs.selector = 0xf000;
1281         save->cs.base = 0xffff0000;
1282         /* Executable/Readable Code Segment */
1283         save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1284                 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1285         save->cs.limit = 0xffff;
1286
1287         save->gdtr.base = 0;
1288         save->gdtr.limit = 0xffff;
1289         save->idtr.base = 0;
1290         save->idtr.limit = 0xffff;
1291
1292         init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1293         init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1294
1295         if (npt_enabled) {
1296                 /* Setup VMCB for Nested Paging */
1297                 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1298                 svm_clr_intercept(svm, INTERCEPT_INVLPG);
1299                 clr_exception_intercept(svm, PF_VECTOR);
1300                 svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1301                 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
1302                 save->g_pat = vcpu->arch.pat;
1303                 save->cr3 = 0;
1304         }
1305         svm->current_vmcb->asid_generation = 0;
1306         svm->asid = 0;
1307
1308         svm->nested.vmcb12_gpa = INVALID_GPA;
1309         svm->nested.last_vmcb12_gpa = INVALID_GPA;
1310
1311         if (!kvm_pause_in_guest(vcpu->kvm)) {
1312                 control->pause_filter_count = pause_filter_count;
1313                 if (pause_filter_thresh)
1314                         control->pause_filter_thresh = pause_filter_thresh;
1315                 svm_set_intercept(svm, INTERCEPT_PAUSE);
1316         } else {
1317                 svm_clr_intercept(svm, INTERCEPT_PAUSE);
1318         }
1319
1320         svm_recalc_instruction_intercepts(vcpu, svm);
1321
1322         /*
1323          * If the host supports V_SPEC_CTRL then disable the interception
1324          * of MSR_IA32_SPEC_CTRL.
1325          */
1326         if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1327                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1328
1329         if (kvm_vcpu_apicv_active(vcpu))
1330                 avic_init_vmcb(svm, vmcb);
1331
1332         if (vnmi)
1333                 svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
1334
1335         if (vgif) {
1336                 svm_clr_intercept(svm, INTERCEPT_STGI);
1337                 svm_clr_intercept(svm, INTERCEPT_CLGI);
1338                 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1339         }
1340
1341         if (sev_guest(vcpu->kvm))
1342                 sev_init_vmcb(svm);
1343
1344         svm_hv_init_vmcb(vmcb);
1345         init_vmcb_after_set_cpuid(vcpu);
1346
1347         vmcb_mark_all_dirty(vmcb);
1348
1349         enable_gif(svm);
1350 }
1351
1352 static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1353 {
1354         struct vcpu_svm *svm = to_svm(vcpu);
1355
1356         svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1357
1358         svm_init_osvw(vcpu);
1359         vcpu->arch.microcode_version = 0x01000065;
1360         svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
1361
1362         svm->nmi_masked = false;
1363         svm->awaiting_iret_completion = false;
1364
1365         if (sev_es_guest(vcpu->kvm))
1366                 sev_es_vcpu_reset(svm);
1367 }
1368
1369 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1370 {
1371         struct vcpu_svm *svm = to_svm(vcpu);
1372
1373         svm->spec_ctrl = 0;
1374         svm->virt_spec_ctrl = 0;
1375
1376         init_vmcb(vcpu);
1377
1378         if (!init_event)
1379                 __svm_vcpu_reset(vcpu);
1380 }
1381
1382 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1383 {
1384         svm->current_vmcb = target_vmcb;
1385         svm->vmcb = target_vmcb->ptr;
1386 }
1387
1388 static int svm_vcpu_create(struct kvm_vcpu *vcpu)
1389 {
1390         struct vcpu_svm *svm;
1391         struct page *vmcb01_page;
1392         struct page *vmsa_page = NULL;
1393         int err;
1394
1395         BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1396         svm = to_svm(vcpu);
1397
1398         err = -ENOMEM;
1399         vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1400         if (!vmcb01_page)
1401                 goto out;
1402
1403         if (sev_es_guest(vcpu->kvm)) {
1404                 /*
1405                  * SEV-ES guests require a separate VMSA page used to contain
1406                  * the encrypted register state of the guest.
1407                  */
1408                 vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1409                 if (!vmsa_page)
1410                         goto error_free_vmcb_page;
1411
1412                 /*
1413                  * SEV-ES guests maintain an encrypted version of their FPU
1414                  * state which is restored and saved on VMRUN and VMEXIT.
1415                  * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1416                  * do xsave/xrstor on it.
1417                  */
1418                 fpstate_set_confidential(&vcpu->arch.guest_fpu);
1419         }
1420
1421         err = avic_init_vcpu(svm);
1422         if (err)
1423                 goto error_free_vmsa_page;
1424
1425         svm->msrpm = svm_vcpu_alloc_msrpm();
1426         if (!svm->msrpm) {
1427                 err = -ENOMEM;
1428                 goto error_free_vmsa_page;
1429         }
1430
1431         svm->x2avic_msrs_intercepted = true;
1432
1433         svm->vmcb01.ptr = page_address(vmcb01_page);
1434         svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
1435         svm_switch_vmcb(svm, &svm->vmcb01);
1436
1437         if (vmsa_page)
1438                 svm->sev_es.vmsa = page_address(vmsa_page);
1439
1440         svm->guest_state_loaded = false;
1441
1442         return 0;
1443
1444 error_free_vmsa_page:
1445         if (vmsa_page)
1446                 __free_page(vmsa_page);
1447 error_free_vmcb_page:
1448         __free_page(vmcb01_page);
1449 out:
1450         return err;
1451 }
1452
1453 static void svm_clear_current_vmcb(struct vmcb *vmcb)
1454 {
1455         int i;
1456
1457         for_each_online_cpu(i)
1458                 cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
1459 }
1460
1461 static void svm_vcpu_free(struct kvm_vcpu *vcpu)
1462 {
1463         struct vcpu_svm *svm = to_svm(vcpu);
1464
1465         /*
1466          * The vmcb page can be recycled, causing a false negative in
1467          * svm_vcpu_load(). So, ensure that no logical CPU has this
1468          * vmcb page recorded as its current vmcb.
1469          */
1470         svm_clear_current_vmcb(svm->vmcb);
1471
1472         svm_leave_nested(vcpu);
1473         svm_free_nested(svm);
1474
1475         sev_free_vcpu(vcpu);
1476
1477         __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
1478         __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
1479 }
1480
1481 static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1482 {
1483         struct vcpu_svm *svm = to_svm(vcpu);
1484         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
1485
1486         if (sev_es_guest(vcpu->kvm))
1487                 sev_es_unmap_ghcb(svm);
1488
1489         if (svm->guest_state_loaded)
1490                 return;
1491
1492         /*
1493          * Save additional host state that will be restored on VMEXIT (sev-es)
1494          * or subsequent vmload of host save area.
1495          */
1496         vmsave(sd->save_area_pa);
1497         if (sev_es_guest(vcpu->kvm)) {
1498                 struct sev_es_save_area *hostsa;
1499                 hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
1500
1501                 sev_es_prepare_switch_to_guest(hostsa);
1502         }
1503
1504         if (tsc_scaling)
1505                 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1506
1507         if (likely(tsc_aux_uret_slot >= 0))
1508                 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
1509
1510         svm->guest_state_loaded = true;
1511 }
1512
1513 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1514 {
1515         to_svm(vcpu)->guest_state_loaded = false;
1516 }
1517
1518 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1519 {
1520         struct vcpu_svm *svm = to_svm(vcpu);
1521         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
1522
1523         if (sd->current_vmcb != svm->vmcb) {
1524                 sd->current_vmcb = svm->vmcb;
1525                 indirect_branch_prediction_barrier();
1526         }
1527         if (kvm_vcpu_apicv_active(vcpu))
1528                 avic_vcpu_load(vcpu, cpu);
1529 }
1530
1531 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1532 {
1533         if (kvm_vcpu_apicv_active(vcpu))
1534                 avic_vcpu_put(vcpu);
1535
1536         svm_prepare_host_switch(vcpu);
1537
1538         ++vcpu->stat.host_state_reload;
1539 }
1540
1541 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1542 {
1543         struct vcpu_svm *svm = to_svm(vcpu);
1544         unsigned long rflags = svm->vmcb->save.rflags;
1545
1546         if (svm->nmi_singlestep) {
1547                 /* Hide our flags if they were not set by the guest */
1548                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1549                         rflags &= ~X86_EFLAGS_TF;
1550                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1551                         rflags &= ~X86_EFLAGS_RF;
1552         }
1553         return rflags;
1554 }
1555
1556 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1557 {
1558         if (to_svm(vcpu)->nmi_singlestep)
1559                 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1560
1561        /*
1562         * Any change of EFLAGS.VM is accompanied by a reload of SS
1563         * (caused by either a task switch or an inter-privilege IRET),
1564         * so we do not need to update the CPL here.
1565         */
1566         to_svm(vcpu)->vmcb->save.rflags = rflags;
1567 }
1568
1569 static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1570 {
1571         struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1572
1573         return sev_es_guest(vcpu->kvm)
1574                 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1575                 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1576 }
1577
1578 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1579 {
1580         kvm_register_mark_available(vcpu, reg);
1581
1582         switch (reg) {
1583         case VCPU_EXREG_PDPTR:
1584                 /*
1585                  * When !npt_enabled, mmu->pdptrs[] is already available since
1586                  * it is always updated per SDM when moving to CRs.
1587                  */
1588                 if (npt_enabled)
1589                         load_pdptrs(vcpu, kvm_read_cr3(vcpu));
1590                 break;
1591         default:
1592                 KVM_BUG_ON(1, vcpu->kvm);
1593         }
1594 }
1595
1596 static void svm_set_vintr(struct vcpu_svm *svm)
1597 {
1598         struct vmcb_control_area *control;
1599
1600         /*
1601          * The following fields are ignored when AVIC is enabled
1602          */
1603         WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
1604
1605         svm_set_intercept(svm, INTERCEPT_VINTR);
1606
1607         /*
1608          * Recalculating intercepts may have cleared the VINTR intercept.  If
1609          * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
1610          * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
1611          * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
1612          * interrupts will never be unblocked while L2 is running.
1613          */
1614         if (!svm_is_intercept(svm, INTERCEPT_VINTR))
1615                 return;
1616
1617         /*
1618          * This is just a dummy VINTR to actually cause a vmexit to happen.
1619          * Actual injection of virtual interrupts happens through EVENTINJ.
1620          */
1621         control = &svm->vmcb->control;
1622         control->int_vector = 0x0;
1623         control->int_ctl &= ~V_INTR_PRIO_MASK;
1624         control->int_ctl |= V_IRQ_MASK |
1625                 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1626         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1627 }
1628
1629 static void svm_clear_vintr(struct vcpu_svm *svm)
1630 {
1631         svm_clr_intercept(svm, INTERCEPT_VINTR);
1632
1633         /* Drop int_ctl fields related to VINTR injection.  */
1634         svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1635         if (is_guest_mode(&svm->vcpu)) {
1636                 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1637
1638                 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1639                         (svm->nested.ctl.int_ctl & V_TPR_MASK));
1640
1641                 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1642                         V_IRQ_INJECTION_BITS_MASK;
1643
1644                 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1645         }
1646
1647         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1648 }
1649
1650 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1651 {
1652         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1653         struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1654
1655         switch (seg) {
1656         case VCPU_SREG_CS: return &save->cs;
1657         case VCPU_SREG_DS: return &save->ds;
1658         case VCPU_SREG_ES: return &save->es;
1659         case VCPU_SREG_FS: return &save01->fs;
1660         case VCPU_SREG_GS: return &save01->gs;
1661         case VCPU_SREG_SS: return &save->ss;
1662         case VCPU_SREG_TR: return &save01->tr;
1663         case VCPU_SREG_LDTR: return &save01->ldtr;
1664         }
1665         BUG();
1666         return NULL;
1667 }
1668
1669 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1670 {
1671         struct vmcb_seg *s = svm_seg(vcpu, seg);
1672
1673         return s->base;
1674 }
1675
1676 static void svm_get_segment(struct kvm_vcpu *vcpu,
1677                             struct kvm_segment *var, int seg)
1678 {
1679         struct vmcb_seg *s = svm_seg(vcpu, seg);
1680
1681         var->base = s->base;
1682         var->limit = s->limit;
1683         var->selector = s->selector;
1684         var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1685         var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1686         var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1687         var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1688         var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1689         var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1690         var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1691
1692         /*
1693          * AMD CPUs circa 2014 track the G bit for all segments except CS.
1694          * However, the SVM spec states that the G bit is not observed by the
1695          * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1696          * So let's synthesize a legal G bit for all segments, this helps
1697          * running KVM nested. It also helps cross-vendor migration, because
1698          * Intel's vmentry has a check on the 'G' bit.
1699          */
1700         var->g = s->limit > 0xfffff;
1701
1702         /*
1703          * AMD's VMCB does not have an explicit unusable field, so emulate it
1704          * for cross vendor migration purposes by "not present"
1705          */
1706         var->unusable = !var->present;
1707
1708         switch (seg) {
1709         case VCPU_SREG_TR:
1710                 /*
1711                  * Work around a bug where the busy flag in the tr selector
1712                  * isn't exposed
1713                  */
1714                 var->type |= 0x2;
1715                 break;
1716         case VCPU_SREG_DS:
1717         case VCPU_SREG_ES:
1718         case VCPU_SREG_FS:
1719         case VCPU_SREG_GS:
1720                 /*
1721                  * The accessed bit must always be set in the segment
1722                  * descriptor cache, although it can be cleared in the
1723                  * descriptor, the cached bit always remains at 1. Since
1724                  * Intel has a check on this, set it here to support
1725                  * cross-vendor migration.
1726                  */
1727                 if (!var->unusable)
1728                         var->type |= 0x1;
1729                 break;
1730         case VCPU_SREG_SS:
1731                 /*
1732                  * On AMD CPUs sometimes the DB bit in the segment
1733                  * descriptor is left as 1, although the whole segment has
1734                  * been made unusable. Clear it here to pass an Intel VMX
1735                  * entry check when cross vendor migrating.
1736                  */
1737                 if (var->unusable)
1738                         var->db = 0;
1739                 /* This is symmetric with svm_set_segment() */
1740                 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1741                 break;
1742         }
1743 }
1744
1745 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1746 {
1747         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1748
1749         return save->cpl;
1750 }
1751
1752 static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1753 {
1754         struct kvm_segment cs;
1755
1756         svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1757         *db = cs.db;
1758         *l = cs.l;
1759 }
1760
1761 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1762 {
1763         struct vcpu_svm *svm = to_svm(vcpu);
1764
1765         dt->size = svm->vmcb->save.idtr.limit;
1766         dt->address = svm->vmcb->save.idtr.base;
1767 }
1768
1769 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1770 {
1771         struct vcpu_svm *svm = to_svm(vcpu);
1772
1773         svm->vmcb->save.idtr.limit = dt->size;
1774         svm->vmcb->save.idtr.base = dt->address ;
1775         vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1776 }
1777
1778 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1779 {
1780         struct vcpu_svm *svm = to_svm(vcpu);
1781
1782         dt->size = svm->vmcb->save.gdtr.limit;
1783         dt->address = svm->vmcb->save.gdtr.base;
1784 }
1785
1786 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1787 {
1788         struct vcpu_svm *svm = to_svm(vcpu);
1789
1790         svm->vmcb->save.gdtr.limit = dt->size;
1791         svm->vmcb->save.gdtr.base = dt->address ;
1792         vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1793 }
1794
1795 static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1796 {
1797         struct vcpu_svm *svm = to_svm(vcpu);
1798
1799         /*
1800          * For guests that don't set guest_state_protected, the cr3 update is
1801          * handled via kvm_mmu_load() while entering the guest. For guests
1802          * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1803          * VMCB save area now, since the save area will become the initial
1804          * contents of the VMSA, and future VMCB save area updates won't be
1805          * seen.
1806          */
1807         if (sev_es_guest(vcpu->kvm)) {
1808                 svm->vmcb->save.cr3 = cr3;
1809                 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1810         }
1811 }
1812
1813 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1814 {
1815         struct vcpu_svm *svm = to_svm(vcpu);
1816         u64 hcr0 = cr0;
1817         bool old_paging = is_paging(vcpu);
1818
1819 #ifdef CONFIG_X86_64
1820         if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
1821                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1822                         vcpu->arch.efer |= EFER_LMA;
1823                         svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1824                 }
1825
1826                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1827                         vcpu->arch.efer &= ~EFER_LMA;
1828                         svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1829                 }
1830         }
1831 #endif
1832         vcpu->arch.cr0 = cr0;
1833
1834         if (!npt_enabled) {
1835                 hcr0 |= X86_CR0_PG | X86_CR0_WP;
1836                 if (old_paging != is_paging(vcpu))
1837                         svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1838         }
1839
1840         /*
1841          * re-enable caching here because the QEMU bios
1842          * does not do it - this results in some delay at
1843          * reboot
1844          */
1845         if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1846                 hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1847
1848         svm->vmcb->save.cr0 = hcr0;
1849         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1850
1851         /*
1852          * SEV-ES guests must always keep the CR intercepts cleared. CR
1853          * tracking is done using the CR write traps.
1854          */
1855         if (sev_es_guest(vcpu->kvm))
1856                 return;
1857
1858         if (hcr0 == cr0) {
1859                 /* Selective CR0 write remains on.  */
1860                 svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1861                 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1862         } else {
1863                 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1864                 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1865         }
1866 }
1867
1868 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1869 {
1870         return true;
1871 }
1872
1873 void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1874 {
1875         unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1876         unsigned long old_cr4 = vcpu->arch.cr4;
1877
1878         if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1879                 svm_flush_tlb_current(vcpu);
1880
1881         vcpu->arch.cr4 = cr4;
1882         if (!npt_enabled) {
1883                 cr4 |= X86_CR4_PAE;
1884
1885                 if (!is_paging(vcpu))
1886                         cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1887         }
1888         cr4 |= host_cr4_mce;
1889         to_svm(vcpu)->vmcb->save.cr4 = cr4;
1890         vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1891
1892         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1893                 kvm_update_cpuid_runtime(vcpu);
1894 }
1895
1896 static void svm_set_segment(struct kvm_vcpu *vcpu,
1897                             struct kvm_segment *var, int seg)
1898 {
1899         struct vcpu_svm *svm = to_svm(vcpu);
1900         struct vmcb_seg *s = svm_seg(vcpu, seg);
1901
1902         s->base = var->base;
1903         s->limit = var->limit;
1904         s->selector = var->selector;
1905         s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1906         s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1907         s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1908         s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1909         s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1910         s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1911         s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1912         s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1913
1914         /*
1915          * This is always accurate, except if SYSRET returned to a segment
1916          * with SS.DPL != 3.  Intel does not have this quirk, and always
1917          * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1918          * would entail passing the CPL to userspace and back.
1919          */
1920         if (seg == VCPU_SREG_SS)
1921                 /* This is symmetric with svm_get_segment() */
1922                 svm->vmcb->save.cpl = (var->dpl & 3);
1923
1924         vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1925 }
1926
1927 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1928 {
1929         struct vcpu_svm *svm = to_svm(vcpu);
1930
1931         clr_exception_intercept(svm, BP_VECTOR);
1932
1933         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1934                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1935                         set_exception_intercept(svm, BP_VECTOR);
1936         }
1937 }
1938
1939 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1940 {
1941         if (sd->next_asid > sd->max_asid) {
1942                 ++sd->asid_generation;
1943                 sd->next_asid = sd->min_asid;
1944                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1945                 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1946         }
1947
1948         svm->current_vmcb->asid_generation = sd->asid_generation;
1949         svm->asid = sd->next_asid++;
1950 }
1951
1952 static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
1953 {
1954         struct vmcb *vmcb = svm->vmcb;
1955
1956         if (svm->vcpu.arch.guest_state_protected)
1957                 return;
1958
1959         if (unlikely(value != vmcb->save.dr6)) {
1960                 vmcb->save.dr6 = value;
1961                 vmcb_mark_dirty(vmcb, VMCB_DR);
1962         }
1963 }
1964
1965 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1966 {
1967         struct vcpu_svm *svm = to_svm(vcpu);
1968
1969         if (vcpu->arch.guest_state_protected)
1970                 return;
1971
1972         get_debugreg(vcpu->arch.db[0], 0);
1973         get_debugreg(vcpu->arch.db[1], 1);
1974         get_debugreg(vcpu->arch.db[2], 2);
1975         get_debugreg(vcpu->arch.db[3], 3);
1976         /*
1977          * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
1978          * because db_interception might need it.  We can do it before vmentry.
1979          */
1980         vcpu->arch.dr6 = svm->vmcb->save.dr6;
1981         vcpu->arch.dr7 = svm->vmcb->save.dr7;
1982         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1983         set_dr_intercepts(svm);
1984 }
1985
1986 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1987 {
1988         struct vcpu_svm *svm = to_svm(vcpu);
1989
1990         if (vcpu->arch.guest_state_protected)
1991                 return;
1992
1993         svm->vmcb->save.dr7 = value;
1994         vmcb_mark_dirty(svm->vmcb, VMCB_DR);
1995 }
1996
1997 static int pf_interception(struct kvm_vcpu *vcpu)
1998 {
1999         struct vcpu_svm *svm = to_svm(vcpu);
2000
2001         u64 fault_address = svm->vmcb->control.exit_info_2;
2002         u64 error_code = svm->vmcb->control.exit_info_1;
2003
2004         return kvm_handle_page_fault(vcpu, error_code, fault_address,
2005                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2006                         svm->vmcb->control.insn_bytes : NULL,
2007                         svm->vmcb->control.insn_len);
2008 }
2009
2010 static int npf_interception(struct kvm_vcpu *vcpu)
2011 {
2012         struct vcpu_svm *svm = to_svm(vcpu);
2013
2014         u64 fault_address = svm->vmcb->control.exit_info_2;
2015         u64 error_code = svm->vmcb->control.exit_info_1;
2016
2017         trace_kvm_page_fault(vcpu, fault_address, error_code);
2018         return kvm_mmu_page_fault(vcpu, fault_address, error_code,
2019                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2020                         svm->vmcb->control.insn_bytes : NULL,
2021                         svm->vmcb->control.insn_len);
2022 }
2023
2024 static int db_interception(struct kvm_vcpu *vcpu)
2025 {
2026         struct kvm_run *kvm_run = vcpu->run;
2027         struct vcpu_svm *svm = to_svm(vcpu);
2028
2029         if (!(vcpu->guest_debug &
2030               (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
2031                 !svm->nmi_singlestep) {
2032                 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
2033                 kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
2034                 return 1;
2035         }
2036
2037         if (svm->nmi_singlestep) {
2038                 disable_nmi_singlestep(svm);
2039                 /* Make sure we check for pending NMIs upon entry */
2040                 kvm_make_request(KVM_REQ_EVENT, vcpu);
2041         }
2042
2043         if (vcpu->guest_debug &
2044             (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
2045                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2046                 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
2047                 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
2048                 kvm_run->debug.arch.pc =
2049                         svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2050                 kvm_run->debug.arch.exception = DB_VECTOR;
2051                 return 0;
2052         }
2053
2054         return 1;
2055 }
2056
2057 static int bp_interception(struct kvm_vcpu *vcpu)
2058 {
2059         struct vcpu_svm *svm = to_svm(vcpu);
2060         struct kvm_run *kvm_run = vcpu->run;
2061
2062         kvm_run->exit_reason = KVM_EXIT_DEBUG;
2063         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2064         kvm_run->debug.arch.exception = BP_VECTOR;
2065         return 0;
2066 }
2067
2068 static int ud_interception(struct kvm_vcpu *vcpu)
2069 {
2070         return handle_ud(vcpu);
2071 }
2072
2073 static int ac_interception(struct kvm_vcpu *vcpu)
2074 {
2075         kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
2076         return 1;
2077 }
2078
2079 static bool is_erratum_383(void)
2080 {
2081         int err, i;
2082         u64 value;
2083
2084         if (!erratum_383_found)
2085                 return false;
2086
2087         value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2088         if (err)
2089                 return false;
2090
2091         /* Bit 62 may or may not be set for this mce */
2092         value &= ~(1ULL << 62);
2093
2094         if (value != 0xb600000000010015ULL)
2095                 return false;
2096
2097         /* Clear MCi_STATUS registers */
2098         for (i = 0; i < 6; ++i)
2099                 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2100
2101         value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2102         if (!err) {
2103                 u32 low, high;
2104
2105                 value &= ~(1ULL << 2);
2106                 low    = lower_32_bits(value);
2107                 high   = upper_32_bits(value);
2108
2109                 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2110         }
2111
2112         /* Flush tlb to evict multi-match entries */
2113         __flush_tlb_all();
2114
2115         return true;
2116 }
2117
2118 static void svm_handle_mce(struct kvm_vcpu *vcpu)
2119 {
2120         if (is_erratum_383()) {
2121                 /*
2122                  * Erratum 383 triggered. Guest state is corrupt so kill the
2123                  * guest.
2124                  */
2125                 pr_err("Guest triggered AMD Erratum 383\n");
2126
2127                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2128
2129                 return;
2130         }
2131
2132         /*
2133          * On an #MC intercept the MCE handler is not called automatically in
2134          * the host. So do it by hand here.
2135          */
2136         kvm_machine_check();
2137 }
2138
2139 static int mc_interception(struct kvm_vcpu *vcpu)
2140 {
2141         return 1;
2142 }
2143
2144 static int shutdown_interception(struct kvm_vcpu *vcpu)
2145 {
2146         struct kvm_run *kvm_run = vcpu->run;
2147         struct vcpu_svm *svm = to_svm(vcpu);
2148
2149         /*
2150          * The VM save area has already been encrypted so it
2151          * cannot be reinitialized - just terminate.
2152          */
2153         if (sev_es_guest(vcpu->kvm))
2154                 return -EINVAL;
2155
2156         /*
2157          * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
2158          * the VMCB in a known good state.  Unfortuately, KVM doesn't have
2159          * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2160          * userspace.  At a platform view, INIT is acceptable behavior as
2161          * there exist bare metal platforms that automatically INIT the CPU
2162          * in response to shutdown.
2163          */
2164         clear_page(svm->vmcb);
2165         kvm_vcpu_reset(vcpu, true);
2166
2167         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2168         return 0;
2169 }
2170
2171 static int io_interception(struct kvm_vcpu *vcpu)
2172 {
2173         struct vcpu_svm *svm = to_svm(vcpu);
2174         u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2175         int size, in, string;
2176         unsigned port;
2177
2178         ++vcpu->stat.io_exits;
2179         string = (io_info & SVM_IOIO_STR_MASK) != 0;
2180         in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2181         port = io_info >> 16;
2182         size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2183
2184         if (string) {
2185                 if (sev_es_guest(vcpu->kvm))
2186                         return sev_es_string_io(svm, size, port, in);
2187                 else
2188                         return kvm_emulate_instruction(vcpu, 0);
2189         }
2190
2191         svm->next_rip = svm->vmcb->control.exit_info_2;
2192
2193         return kvm_fast_pio(vcpu, size, port, in);
2194 }
2195
2196 static int nmi_interception(struct kvm_vcpu *vcpu)
2197 {
2198         return 1;
2199 }
2200
2201 static int smi_interception(struct kvm_vcpu *vcpu)
2202 {
2203         return 1;
2204 }
2205
2206 static int intr_interception(struct kvm_vcpu *vcpu)
2207 {
2208         ++vcpu->stat.irq_exits;
2209         return 1;
2210 }
2211
2212 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
2213 {
2214         struct vcpu_svm *svm = to_svm(vcpu);
2215         struct vmcb *vmcb12;
2216         struct kvm_host_map map;
2217         int ret;
2218
2219         if (nested_svm_check_permissions(vcpu))
2220                 return 1;
2221
2222         ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2223         if (ret) {
2224                 if (ret == -EINVAL)
2225                         kvm_inject_gp(vcpu, 0);
2226                 return 1;
2227         }
2228
2229         vmcb12 = map.hva;
2230
2231         ret = kvm_skip_emulated_instruction(vcpu);
2232
2233         if (vmload) {
2234                 svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
2235                 svm->sysenter_eip_hi = 0;
2236                 svm->sysenter_esp_hi = 0;
2237         } else {
2238                 svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
2239         }
2240
2241         kvm_vcpu_unmap(vcpu, &map, true);
2242
2243         return ret;
2244 }
2245
2246 static int vmload_interception(struct kvm_vcpu *vcpu)
2247 {
2248         return vmload_vmsave_interception(vcpu, true);
2249 }
2250
2251 static int vmsave_interception(struct kvm_vcpu *vcpu)
2252 {
2253         return vmload_vmsave_interception(vcpu, false);
2254 }
2255
2256 static int vmrun_interception(struct kvm_vcpu *vcpu)
2257 {
2258         if (nested_svm_check_permissions(vcpu))
2259                 return 1;
2260
2261         return nested_svm_vmrun(vcpu);
2262 }
2263
2264 enum {
2265         NONE_SVM_INSTR,
2266         SVM_INSTR_VMRUN,
2267         SVM_INSTR_VMLOAD,
2268         SVM_INSTR_VMSAVE,
2269 };
2270
2271 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2272 static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2273 {
2274         struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2275
2276         if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2277                 return NONE_SVM_INSTR;
2278
2279         switch (ctxt->modrm) {
2280         case 0xd8: /* VMRUN */
2281                 return SVM_INSTR_VMRUN;
2282         case 0xda: /* VMLOAD */
2283                 return SVM_INSTR_VMLOAD;
2284         case 0xdb: /* VMSAVE */
2285                 return SVM_INSTR_VMSAVE;
2286         default:
2287                 break;
2288         }
2289
2290         return NONE_SVM_INSTR;
2291 }
2292
2293 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2294 {
2295         const int guest_mode_exit_codes[] = {
2296                 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2297                 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2298                 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2299         };
2300         int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
2301                 [SVM_INSTR_VMRUN] = vmrun_interception,
2302                 [SVM_INSTR_VMLOAD] = vmload_interception,
2303                 [SVM_INSTR_VMSAVE] = vmsave_interception,
2304         };
2305         struct vcpu_svm *svm = to_svm(vcpu);
2306         int ret;
2307
2308         if (is_guest_mode(vcpu)) {
2309                 /* Returns '1' or -errno on failure, '0' on success. */
2310                 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2311                 if (ret)
2312                         return ret;
2313                 return 1;
2314         }
2315         return svm_instr_handlers[opcode](vcpu);
2316 }
2317
2318 /*
2319  * #GP handling code. Note that #GP can be triggered under the following two
2320  * cases:
2321  *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2322  *      some AMD CPUs when EAX of these instructions are in the reserved memory
2323  *      regions (e.g. SMM memory on host).
2324  *   2) VMware backdoor
2325  */
2326 static int gp_interception(struct kvm_vcpu *vcpu)
2327 {
2328         struct vcpu_svm *svm = to_svm(vcpu);
2329         u32 error_code = svm->vmcb->control.exit_info_1;
2330         int opcode;
2331
2332         /* Both #GP cases have zero error_code */
2333         if (error_code)
2334                 goto reinject;
2335
2336         /* Decode the instruction for usage later */
2337         if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2338                 goto reinject;
2339
2340         opcode = svm_instr_opcode(vcpu);
2341
2342         if (opcode == NONE_SVM_INSTR) {
2343                 if (!enable_vmware_backdoor)
2344                         goto reinject;
2345
2346                 /*
2347                  * VMware backdoor emulation on #GP interception only handles
2348                  * IN{S}, OUT{S}, and RDPMC.
2349                  */
2350                 if (!is_guest_mode(vcpu))
2351                         return kvm_emulate_instruction(vcpu,
2352                                 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2353         } else {
2354                 /* All SVM instructions expect page aligned RAX */
2355                 if (svm->vmcb->save.rax & ~PAGE_MASK)
2356                         goto reinject;
2357
2358                 return emulate_svm_instr(vcpu, opcode);
2359         }
2360
2361 reinject:
2362         kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2363         return 1;
2364 }
2365
2366 void svm_set_gif(struct vcpu_svm *svm, bool value)
2367 {
2368         if (value) {
2369                 /*
2370                  * If VGIF is enabled, the STGI intercept is only added to
2371                  * detect the opening of the SMI/NMI window; remove it now.
2372                  * Likewise, clear the VINTR intercept, we will set it
2373                  * again while processing KVM_REQ_EVENT if needed.
2374                  */
2375                 if (vgif)
2376                         svm_clr_intercept(svm, INTERCEPT_STGI);
2377                 if (svm_is_intercept(svm, INTERCEPT_VINTR))
2378                         svm_clear_vintr(svm);
2379
2380                 enable_gif(svm);
2381                 if (svm->vcpu.arch.smi_pending ||
2382                     svm->vcpu.arch.nmi_pending ||
2383                     kvm_cpu_has_injectable_intr(&svm->vcpu) ||
2384                     kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
2385                         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2386         } else {
2387                 disable_gif(svm);
2388
2389                 /*
2390                  * After a CLGI no interrupts should come.  But if vGIF is
2391                  * in use, we still rely on the VINTR intercept (rather than
2392                  * STGI) to detect an open interrupt window.
2393                 */
2394                 if (!vgif)
2395                         svm_clear_vintr(svm);
2396         }
2397 }
2398
2399 static int stgi_interception(struct kvm_vcpu *vcpu)
2400 {
2401         int ret;
2402
2403         if (nested_svm_check_permissions(vcpu))
2404                 return 1;
2405
2406         ret = kvm_skip_emulated_instruction(vcpu);
2407         svm_set_gif(to_svm(vcpu), true);
2408         return ret;
2409 }
2410
2411 static int clgi_interception(struct kvm_vcpu *vcpu)
2412 {
2413         int ret;
2414
2415         if (nested_svm_check_permissions(vcpu))
2416                 return 1;
2417
2418         ret = kvm_skip_emulated_instruction(vcpu);
2419         svm_set_gif(to_svm(vcpu), false);
2420         return ret;
2421 }
2422
2423 static int invlpga_interception(struct kvm_vcpu *vcpu)
2424 {
2425         gva_t gva = kvm_rax_read(vcpu);
2426         u32 asid = kvm_rcx_read(vcpu);
2427
2428         /* FIXME: Handle an address size prefix. */
2429         if (!is_long_mode(vcpu))
2430                 gva = (u32)gva;
2431
2432         trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
2433
2434         /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2435         kvm_mmu_invlpg(vcpu, gva);
2436
2437         return kvm_skip_emulated_instruction(vcpu);
2438 }
2439
2440 static int skinit_interception(struct kvm_vcpu *vcpu)
2441 {
2442         trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
2443
2444         kvm_queue_exception(vcpu, UD_VECTOR);
2445         return 1;
2446 }
2447
2448 static int task_switch_interception(struct kvm_vcpu *vcpu)
2449 {
2450         struct vcpu_svm *svm = to_svm(vcpu);
2451         u16 tss_selector;
2452         int reason;
2453         int int_type = svm->vmcb->control.exit_int_info &
2454                 SVM_EXITINTINFO_TYPE_MASK;
2455         int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2456         uint32_t type =
2457                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2458         uint32_t idt_v =
2459                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2460         bool has_error_code = false;
2461         u32 error_code = 0;
2462
2463         tss_selector = (u16)svm->vmcb->control.exit_info_1;
2464
2465         if (svm->vmcb->control.exit_info_2 &
2466             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2467                 reason = TASK_SWITCH_IRET;
2468         else if (svm->vmcb->control.exit_info_2 &
2469                  (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2470                 reason = TASK_SWITCH_JMP;
2471         else if (idt_v)
2472                 reason = TASK_SWITCH_GATE;
2473         else
2474                 reason = TASK_SWITCH_CALL;
2475
2476         if (reason == TASK_SWITCH_GATE) {
2477                 switch (type) {
2478                 case SVM_EXITINTINFO_TYPE_NMI:
2479                         vcpu->arch.nmi_injected = false;
2480                         break;
2481                 case SVM_EXITINTINFO_TYPE_EXEPT:
2482                         if (svm->vmcb->control.exit_info_2 &
2483                             (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2484                                 has_error_code = true;
2485                                 error_code =
2486                                         (u32)svm->vmcb->control.exit_info_2;
2487                         }
2488                         kvm_clear_exception_queue(vcpu);
2489                         break;
2490                 case SVM_EXITINTINFO_TYPE_INTR:
2491                 case SVM_EXITINTINFO_TYPE_SOFT:
2492                         kvm_clear_interrupt_queue(vcpu);
2493                         break;
2494                 default:
2495                         break;
2496                 }
2497         }
2498
2499         if (reason != TASK_SWITCH_GATE ||
2500             int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2501             (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2502              (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2503                 if (!svm_skip_emulated_instruction(vcpu))
2504                         return 0;
2505         }
2506
2507         if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2508                 int_vec = -1;
2509
2510         return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
2511                                has_error_code, error_code);
2512 }
2513
2514 static void svm_clr_iret_intercept(struct vcpu_svm *svm)
2515 {
2516         if (!sev_es_guest(svm->vcpu.kvm))
2517                 svm_clr_intercept(svm, INTERCEPT_IRET);
2518 }
2519
2520 static void svm_set_iret_intercept(struct vcpu_svm *svm)
2521 {
2522         if (!sev_es_guest(svm->vcpu.kvm))
2523                 svm_set_intercept(svm, INTERCEPT_IRET);
2524 }
2525
2526 static int iret_interception(struct kvm_vcpu *vcpu)
2527 {
2528         struct vcpu_svm *svm = to_svm(vcpu);
2529
2530         ++vcpu->stat.nmi_window_exits;
2531         svm->awaiting_iret_completion = true;
2532
2533         svm_clr_iret_intercept(svm);
2534         if (!sev_es_guest(vcpu->kvm))
2535                 svm->nmi_iret_rip = kvm_rip_read(vcpu);
2536
2537         kvm_make_request(KVM_REQ_EVENT, vcpu);
2538         return 1;
2539 }
2540
2541 static int invlpg_interception(struct kvm_vcpu *vcpu)
2542 {
2543         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2544                 return kvm_emulate_instruction(vcpu, 0);
2545
2546         kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2547         return kvm_skip_emulated_instruction(vcpu);
2548 }
2549
2550 static int emulate_on_interception(struct kvm_vcpu *vcpu)
2551 {
2552         return kvm_emulate_instruction(vcpu, 0);
2553 }
2554
2555 static int rsm_interception(struct kvm_vcpu *vcpu)
2556 {
2557         return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2558 }
2559
2560 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2561                                             unsigned long val)
2562 {
2563         struct vcpu_svm *svm = to_svm(vcpu);
2564         unsigned long cr0 = vcpu->arch.cr0;
2565         bool ret = false;
2566
2567         if (!is_guest_mode(vcpu) ||
2568             (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2569                 return false;
2570
2571         cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2572         val &= ~SVM_CR0_SELECTIVE_MASK;
2573
2574         if (cr0 ^ val) {
2575                 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2576                 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2577         }
2578
2579         return ret;
2580 }
2581
2582 #define CR_VALID (1ULL << 63)
2583
2584 static int cr_interception(struct kvm_vcpu *vcpu)
2585 {
2586         struct vcpu_svm *svm = to_svm(vcpu);
2587         int reg, cr;
2588         unsigned long val;
2589         int err;
2590
2591         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2592                 return emulate_on_interception(vcpu);
2593
2594         if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2595                 return emulate_on_interception(vcpu);
2596
2597         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2598         if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2599                 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2600         else
2601                 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2602
2603         err = 0;
2604         if (cr >= 16) { /* mov to cr */
2605                 cr -= 16;
2606                 val = kvm_register_read(vcpu, reg);
2607                 trace_kvm_cr_write(cr, val);
2608                 switch (cr) {
2609                 case 0:
2610                         if (!check_selective_cr0_intercepted(vcpu, val))
2611                                 err = kvm_set_cr0(vcpu, val);
2612                         else
2613                                 return 1;
2614
2615                         break;
2616                 case 3:
2617                         err = kvm_set_cr3(vcpu, val);
2618                         break;
2619                 case 4:
2620                         err = kvm_set_cr4(vcpu, val);
2621                         break;
2622                 case 8:
2623                         err = kvm_set_cr8(vcpu, val);
2624                         break;
2625                 default:
2626                         WARN(1, "unhandled write to CR%d", cr);
2627                         kvm_queue_exception(vcpu, UD_VECTOR);
2628                         return 1;
2629                 }
2630         } else { /* mov from cr */
2631                 switch (cr) {
2632                 case 0:
2633                         val = kvm_read_cr0(vcpu);
2634                         break;
2635                 case 2:
2636                         val = vcpu->arch.cr2;
2637                         break;
2638                 case 3:
2639                         val = kvm_read_cr3(vcpu);
2640                         break;
2641                 case 4:
2642                         val = kvm_read_cr4(vcpu);
2643                         break;
2644                 case 8:
2645                         val = kvm_get_cr8(vcpu);
2646                         break;
2647                 default:
2648                         WARN(1, "unhandled read from CR%d", cr);
2649                         kvm_queue_exception(vcpu, UD_VECTOR);
2650                         return 1;
2651                 }
2652                 kvm_register_write(vcpu, reg, val);
2653                 trace_kvm_cr_read(cr, val);
2654         }
2655         return kvm_complete_insn_gp(vcpu, err);
2656 }
2657
2658 static int cr_trap(struct kvm_vcpu *vcpu)
2659 {
2660         struct vcpu_svm *svm = to_svm(vcpu);
2661         unsigned long old_value, new_value;
2662         unsigned int cr;
2663         int ret = 0;
2664
2665         new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2666
2667         cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2668         switch (cr) {
2669         case 0:
2670                 old_value = kvm_read_cr0(vcpu);
2671                 svm_set_cr0(vcpu, new_value);
2672
2673                 kvm_post_set_cr0(vcpu, old_value, new_value);
2674                 break;
2675         case 4:
2676                 old_value = kvm_read_cr4(vcpu);
2677                 svm_set_cr4(vcpu, new_value);
2678
2679                 kvm_post_set_cr4(vcpu, old_value, new_value);
2680                 break;
2681         case 8:
2682                 ret = kvm_set_cr8(vcpu, new_value);
2683                 break;
2684         default:
2685                 WARN(1, "unhandled CR%d write trap", cr);
2686                 kvm_queue_exception(vcpu, UD_VECTOR);
2687                 return 1;
2688         }
2689
2690         return kvm_complete_insn_gp(vcpu, ret);
2691 }
2692
2693 static int dr_interception(struct kvm_vcpu *vcpu)
2694 {
2695         struct vcpu_svm *svm = to_svm(vcpu);
2696         int reg, dr;
2697         unsigned long val;
2698         int err = 0;
2699
2700         if (vcpu->guest_debug == 0) {
2701                 /*
2702                  * No more DR vmexits; force a reload of the debug registers
2703                  * and reenter on this instruction.  The next vmexit will
2704                  * retrieve the full state of the debug registers.
2705                  */
2706                 clr_dr_intercepts(svm);
2707                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2708                 return 1;
2709         }
2710
2711         if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2712                 return emulate_on_interception(vcpu);
2713
2714         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2715         dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2716         if (dr >= 16) { /* mov to DRn  */
2717                 dr -= 16;
2718                 val = kvm_register_read(vcpu, reg);
2719                 err = kvm_set_dr(vcpu, dr, val);
2720         } else {
2721                 kvm_get_dr(vcpu, dr, &val);
2722                 kvm_register_write(vcpu, reg, val);
2723         }
2724
2725         return kvm_complete_insn_gp(vcpu, err);
2726 }
2727
2728 static int cr8_write_interception(struct kvm_vcpu *vcpu)
2729 {
2730         int r;
2731
2732         u8 cr8_prev = kvm_get_cr8(vcpu);
2733         /* instruction emulation calls kvm_set_cr8() */
2734         r = cr_interception(vcpu);
2735         if (lapic_in_kernel(vcpu))
2736                 return r;
2737         if (cr8_prev <= kvm_get_cr8(vcpu))
2738                 return r;
2739         vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2740         return 0;
2741 }
2742
2743 static int efer_trap(struct kvm_vcpu *vcpu)
2744 {
2745         struct msr_data msr_info;
2746         int ret;
2747
2748         /*
2749          * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2750          * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2751          * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2752          * the guest doesn't have X86_FEATURE_SVM.
2753          */
2754         msr_info.host_initiated = false;
2755         msr_info.index = MSR_EFER;
2756         msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2757         ret = kvm_set_msr_common(vcpu, &msr_info);
2758
2759         return kvm_complete_insn_gp(vcpu, ret);
2760 }
2761
2762 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2763 {
2764         msr->data = 0;
2765
2766         switch (msr->index) {
2767         case MSR_AMD64_DE_CFG:
2768                 if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
2769                         msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
2770                 break;
2771         default:
2772                 return KVM_MSR_RET_INVALID;
2773         }
2774
2775         return 0;
2776 }
2777
2778 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2779 {
2780         struct vcpu_svm *svm = to_svm(vcpu);
2781
2782         switch (msr_info->index) {
2783         case MSR_AMD64_TSC_RATIO:
2784                 if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
2785                         return 1;
2786                 msr_info->data = svm->tsc_ratio_msr;
2787                 break;
2788         case MSR_STAR:
2789                 msr_info->data = svm->vmcb01.ptr->save.star;
2790                 break;
2791 #ifdef CONFIG_X86_64
2792         case MSR_LSTAR:
2793                 msr_info->data = svm->vmcb01.ptr->save.lstar;
2794                 break;
2795         case MSR_CSTAR:
2796                 msr_info->data = svm->vmcb01.ptr->save.cstar;
2797                 break;
2798         case MSR_KERNEL_GS_BASE:
2799                 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
2800                 break;
2801         case MSR_SYSCALL_MASK:
2802                 msr_info->data = svm->vmcb01.ptr->save.sfmask;
2803                 break;
2804 #endif
2805         case MSR_IA32_SYSENTER_CS:
2806                 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
2807                 break;
2808         case MSR_IA32_SYSENTER_EIP:
2809                 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2810                 if (guest_cpuid_is_intel(vcpu))
2811                         msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
2812                 break;
2813         case MSR_IA32_SYSENTER_ESP:
2814                 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2815                 if (guest_cpuid_is_intel(vcpu))
2816                         msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
2817                 break;
2818         case MSR_TSC_AUX:
2819                 msr_info->data = svm->tsc_aux;
2820                 break;
2821         case MSR_IA32_DEBUGCTLMSR:
2822         case MSR_IA32_LASTBRANCHFROMIP:
2823         case MSR_IA32_LASTBRANCHTOIP:
2824         case MSR_IA32_LASTINTFROMIP:
2825         case MSR_IA32_LASTINTTOIP:
2826                 msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
2827                 break;
2828         case MSR_VM_HSAVE_PA:
2829                 msr_info->data = svm->nested.hsave_msr;
2830                 break;
2831         case MSR_VM_CR:
2832                 msr_info->data = svm->nested.vm_cr_msr;
2833                 break;
2834         case MSR_IA32_SPEC_CTRL:
2835                 if (!msr_info->host_initiated &&
2836                     !guest_has_spec_ctrl_msr(vcpu))
2837                         return 1;
2838
2839                 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2840                         msr_info->data = svm->vmcb->save.spec_ctrl;
2841                 else
2842                         msr_info->data = svm->spec_ctrl;
2843                 break;
2844         case MSR_AMD64_VIRT_SPEC_CTRL:
2845                 if (!msr_info->host_initiated &&
2846                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2847                         return 1;
2848
2849                 msr_info->data = svm->virt_spec_ctrl;
2850                 break;
2851         case MSR_F15H_IC_CFG: {
2852
2853                 int family, model;
2854
2855                 family = guest_cpuid_family(vcpu);
2856                 model  = guest_cpuid_model(vcpu);
2857
2858                 if (family < 0 || model < 0)
2859                         return kvm_get_msr_common(vcpu, msr_info);
2860
2861                 msr_info->data = 0;
2862
2863                 if (family == 0x15 &&
2864                     (model >= 0x2 && model < 0x20))
2865                         msr_info->data = 0x1E;
2866                 }
2867                 break;
2868         case MSR_AMD64_DE_CFG:
2869                 msr_info->data = svm->msr_decfg;
2870                 break;
2871         default:
2872                 return kvm_get_msr_common(vcpu, msr_info);
2873         }
2874         return 0;
2875 }
2876
2877 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2878 {
2879         struct vcpu_svm *svm = to_svm(vcpu);
2880         if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2881                 return kvm_complete_insn_gp(vcpu, err);
2882
2883         ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2884         ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
2885                                 X86_TRAP_GP |
2886                                 SVM_EVTINJ_TYPE_EXEPT |
2887                                 SVM_EVTINJ_VALID);
2888         return 1;
2889 }
2890
2891 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2892 {
2893         struct vcpu_svm *svm = to_svm(vcpu);
2894         int svm_dis, chg_mask;
2895
2896         if (data & ~SVM_VM_CR_VALID_MASK)
2897                 return 1;
2898
2899         chg_mask = SVM_VM_CR_VALID_MASK;
2900
2901         if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2902                 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2903
2904         svm->nested.vm_cr_msr &= ~chg_mask;
2905         svm->nested.vm_cr_msr |= (data & chg_mask);
2906
2907         svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2908
2909         /* check for svm_disable while efer.svme is set */
2910         if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2911                 return 1;
2912
2913         return 0;
2914 }
2915
2916 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2917 {
2918         struct vcpu_svm *svm = to_svm(vcpu);
2919         int ret = 0;
2920
2921         u32 ecx = msr->index;
2922         u64 data = msr->data;
2923         switch (ecx) {
2924         case MSR_AMD64_TSC_RATIO:
2925
2926                 if (!svm->tsc_scaling_enabled) {
2927
2928                         if (!msr->host_initiated)
2929                                 return 1;
2930                         /*
2931                          * In case TSC scaling is not enabled, always
2932                          * leave this MSR at the default value.
2933                          *
2934                          * Due to bug in qemu 6.2.0, it would try to set
2935                          * this msr to 0 if tsc scaling is not enabled.
2936                          * Ignore this value as well.
2937                          */
2938                         if (data != 0 && data != svm->tsc_ratio_msr)
2939                                 return 1;
2940                         break;
2941                 }
2942
2943                 if (data & SVM_TSC_RATIO_RSVD)
2944                         return 1;
2945
2946                 svm->tsc_ratio_msr = data;
2947
2948                 if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
2949                         nested_svm_update_tsc_ratio_msr(vcpu);
2950
2951                 break;
2952         case MSR_IA32_CR_PAT:
2953                 ret = kvm_set_msr_common(vcpu, msr);
2954                 if (ret)
2955                         break;
2956
2957                 svm->vmcb01.ptr->save.g_pat = data;
2958                 if (is_guest_mode(vcpu))
2959                         nested_vmcb02_compute_g_pat(svm);
2960                 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
2961                 break;
2962         case MSR_IA32_SPEC_CTRL:
2963                 if (!msr->host_initiated &&
2964                     !guest_has_spec_ctrl_msr(vcpu))
2965                         return 1;
2966
2967                 if (kvm_spec_ctrl_test_value(data))
2968                         return 1;
2969
2970                 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2971                         svm->vmcb->save.spec_ctrl = data;
2972                 else
2973                         svm->spec_ctrl = data;
2974                 if (!data)
2975                         break;
2976
2977                 /*
2978                  * For non-nested:
2979                  * When it's written (to non-zero) for the first time, pass
2980                  * it through.
2981                  *
2982                  * For nested:
2983                  * The handling of the MSR bitmap for L2 guests is done in
2984                  * nested_svm_vmrun_msrpm.
2985                  * We update the L1 MSR bit as well since it will end up
2986                  * touching the MSR anyway now.
2987                  */
2988                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
2989                 break;
2990         case MSR_AMD64_VIRT_SPEC_CTRL:
2991                 if (!msr->host_initiated &&
2992                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2993                         return 1;
2994
2995                 if (data & ~SPEC_CTRL_SSBD)
2996                         return 1;
2997
2998                 svm->virt_spec_ctrl = data;
2999                 break;
3000         case MSR_STAR:
3001                 svm->vmcb01.ptr->save.star = data;
3002                 break;
3003 #ifdef CONFIG_X86_64
3004         case MSR_LSTAR:
3005                 svm->vmcb01.ptr->save.lstar = data;
3006                 break;
3007         case MSR_CSTAR:
3008                 svm->vmcb01.ptr->save.cstar = data;
3009                 break;
3010         case MSR_KERNEL_GS_BASE:
3011                 svm->vmcb01.ptr->save.kernel_gs_base = data;
3012                 break;
3013         case MSR_SYSCALL_MASK:
3014                 svm->vmcb01.ptr->save.sfmask = data;
3015                 break;
3016 #endif
3017         case MSR_IA32_SYSENTER_CS:
3018                 svm->vmcb01.ptr->save.sysenter_cs = data;
3019                 break;
3020         case MSR_IA32_SYSENTER_EIP:
3021                 svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
3022                 /*
3023                  * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
3024                  * when we spoof an Intel vendor ID (for cross vendor migration).
3025                  * In this case we use this intercept to track the high
3026                  * 32 bit part of these msrs to support Intel's
3027                  * implementation of SYSENTER/SYSEXIT.
3028                  */
3029                 svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
3030                 break;
3031         case MSR_IA32_SYSENTER_ESP:
3032                 svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
3033                 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
3034                 break;
3035         case MSR_TSC_AUX:
3036                 /*
3037                  * TSC_AUX is usually changed only during boot and never read
3038                  * directly.  Intercept TSC_AUX instead of exposing it to the
3039                  * guest via direct_access_msrs, and switch it via user return.
3040                  */
3041                 preempt_disable();
3042                 ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
3043                 preempt_enable();
3044                 if (ret)
3045                         break;
3046
3047                 svm->tsc_aux = data;
3048                 break;
3049         case MSR_IA32_DEBUGCTLMSR:
3050                 if (!lbrv) {
3051                         kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3052                         break;
3053                 }
3054                 if (data & DEBUGCTL_RESERVED_BITS)
3055                         return 1;
3056
3057                 if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
3058                         svm->vmcb->save.dbgctl = data;
3059                 else
3060                         svm->vmcb01.ptr->save.dbgctl = data;
3061
3062                 svm_update_lbrv(vcpu);
3063
3064                 break;
3065         case MSR_VM_HSAVE_PA:
3066                 /*
3067                  * Old kernels did not validate the value written to
3068                  * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
3069                  * value to allow live migrating buggy or malicious guests
3070                  * originating from those kernels.
3071                  */
3072                 if (!msr->host_initiated && !page_address_valid(vcpu, data))
3073                         return 1;
3074
3075                 svm->nested.hsave_msr = data & PAGE_MASK;
3076                 break;
3077         case MSR_VM_CR:
3078                 return svm_set_vm_cr(vcpu, data);
3079         case MSR_VM_IGNNE:
3080                 kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3081                 break;
3082         case MSR_AMD64_DE_CFG: {
3083                 struct kvm_msr_entry msr_entry;
3084
3085                 msr_entry.index = msr->index;
3086                 if (svm_get_msr_feature(&msr_entry))
3087                         return 1;
3088
3089                 /* Check the supported bits */
3090                 if (data & ~msr_entry.data)
3091                         return 1;
3092
3093                 /* Don't allow the guest to change a bit, #GP */
3094                 if (!msr->host_initiated && (data ^ msr_entry.data))
3095                         return 1;
3096
3097                 svm->msr_decfg = data;
3098                 break;
3099         }
3100         default:
3101                 return kvm_set_msr_common(vcpu, msr);
3102         }
3103         return ret;
3104 }
3105
3106 static int msr_interception(struct kvm_vcpu *vcpu)
3107 {
3108         if (to_svm(vcpu)->vmcb->control.exit_info_1)
3109                 return kvm_emulate_wrmsr(vcpu);
3110         else
3111                 return kvm_emulate_rdmsr(vcpu);
3112 }
3113
3114 static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3115 {
3116         kvm_make_request(KVM_REQ_EVENT, vcpu);
3117         svm_clear_vintr(to_svm(vcpu));
3118
3119         /*
3120          * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
3121          * In this case AVIC was temporarily disabled for
3122          * requesting the IRQ window and we have to re-enable it.
3123          *
3124          * If running nested, still remove the VM wide AVIC inhibit to
3125          * support case in which the interrupt window was requested when the
3126          * vCPU was not running nested.
3127
3128          * All vCPUs which run still run nested, will remain to have their
3129          * AVIC still inhibited due to per-cpu AVIC inhibition.
3130          */
3131         kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3132
3133         ++vcpu->stat.irq_window_exits;
3134         return 1;
3135 }
3136
3137 static int pause_interception(struct kvm_vcpu *vcpu)
3138 {
3139         bool in_kernel;
3140         /*
3141          * CPL is not made available for an SEV-ES guest, therefore
3142          * vcpu->arch.preempted_in_kernel can never be true.  Just
3143          * set in_kernel to false as well.
3144          */
3145         in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3146
3147         grow_ple_window(vcpu);
3148
3149         kvm_vcpu_on_spin(vcpu, in_kernel);
3150         return kvm_skip_emulated_instruction(vcpu);
3151 }
3152
3153 static int invpcid_interception(struct kvm_vcpu *vcpu)
3154 {
3155         struct vcpu_svm *svm = to_svm(vcpu);
3156         unsigned long type;
3157         gva_t gva;
3158
3159         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3160                 kvm_queue_exception(vcpu, UD_VECTOR);
3161                 return 1;
3162         }
3163
3164         /*
3165          * For an INVPCID intercept:
3166          * EXITINFO1 provides the linear address of the memory operand.
3167          * EXITINFO2 provides the contents of the register operand.
3168          */
3169         type = svm->vmcb->control.exit_info_2;
3170         gva = svm->vmcb->control.exit_info_1;
3171
3172         return kvm_handle_invpcid(vcpu, type, gva);
3173 }
3174
3175 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3176         [SVM_EXIT_READ_CR0]                     = cr_interception,
3177         [SVM_EXIT_READ_CR3]                     = cr_interception,
3178         [SVM_EXIT_READ_CR4]                     = cr_interception,
3179         [SVM_EXIT_READ_CR8]                     = cr_interception,
3180         [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
3181         [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3182         [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3183         [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3184         [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3185         [SVM_EXIT_READ_DR0]                     = dr_interception,
3186         [SVM_EXIT_READ_DR1]                     = dr_interception,
3187         [SVM_EXIT_READ_DR2]                     = dr_interception,
3188         [SVM_EXIT_READ_DR3]                     = dr_interception,
3189         [SVM_EXIT_READ_DR4]                     = dr_interception,
3190         [SVM_EXIT_READ_DR5]                     = dr_interception,
3191         [SVM_EXIT_READ_DR6]                     = dr_interception,
3192         [SVM_EXIT_READ_DR7]                     = dr_interception,
3193         [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3194         [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3195         [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3196         [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3197         [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3198         [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3199         [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3200         [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3201         [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3202         [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3203         [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3204         [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3205         [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3206         [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
3207         [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
3208         [SVM_EXIT_INTR]                         = intr_interception,
3209         [SVM_EXIT_NMI]                          = nmi_interception,
3210         [SVM_EXIT_SMI]                          = smi_interception,
3211         [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3212         [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
3213         [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
3214         [SVM_EXIT_IRET]                         = iret_interception,
3215         [SVM_EXIT_INVD]                         = kvm_emulate_invd,
3216         [SVM_EXIT_PAUSE]                        = pause_interception,
3217         [SVM_EXIT_HLT]                          = kvm_emulate_halt,
3218         [SVM_EXIT_INVLPG]                       = invlpg_interception,
3219         [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3220         [SVM_EXIT_IOIO]                         = io_interception,
3221         [SVM_EXIT_MSR]                          = msr_interception,
3222         [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3223         [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3224         [SVM_EXIT_VMRUN]                        = vmrun_interception,
3225         [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
3226         [SVM_EXIT_VMLOAD]                       = vmload_interception,
3227         [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3228         [SVM_EXIT_STGI]                         = stgi_interception,
3229         [SVM_EXIT_CLGI]                         = clgi_interception,
3230         [SVM_EXIT_SKINIT]                       = skinit_interception,
3231         [SVM_EXIT_RDTSCP]                       = kvm_handle_invalid_op,
3232         [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
3233         [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
3234         [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
3235         [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
3236         [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
3237         [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
3238         [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
3239         [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
3240         [SVM_EXIT_CR8_WRITE_TRAP]               = cr_trap,
3241         [SVM_EXIT_INVPCID]                      = invpcid_interception,
3242         [SVM_EXIT_NPF]                          = npf_interception,
3243         [SVM_EXIT_RSM]                          = rsm_interception,
3244         [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
3245         [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
3246         [SVM_EXIT_VMGEXIT]                      = sev_handle_vmgexit,
3247 };
3248
3249 static void dump_vmcb(struct kvm_vcpu *vcpu)
3250 {
3251         struct vcpu_svm *svm = to_svm(vcpu);
3252         struct vmcb_control_area *control = &svm->vmcb->control;
3253         struct vmcb_save_area *save = &svm->vmcb->save;
3254         struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3255
3256         if (!dump_invalid_vmcb) {
3257                 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3258                 return;
3259         }
3260
3261         pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3262                svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3263         pr_err("VMCB Control Area:\n");
3264         pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3265         pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
3266         pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3267         pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
3268         pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
3269         pr_err("%-20s%08x %08x\n", "intercepts:",
3270               control->intercepts[INTERCEPT_WORD3],
3271                control->intercepts[INTERCEPT_WORD4]);
3272         pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3273         pr_err("%-20s%d\n", "pause filter threshold:",
3274                control->pause_filter_thresh);
3275         pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3276         pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3277         pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3278         pr_err("%-20s%d\n", "asid:", control->asid);
3279         pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3280         pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3281         pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3282         pr_err("%-20s%08x\n", "int_state:", control->int_state);
3283         pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3284         pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3285         pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3286         pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3287         pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3288         pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3289         pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3290         pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3291         pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
3292         pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3293         pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3294         pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3295         pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3296         pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3297         pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3298         pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3299         pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3300         pr_err("VMCB State Save Area:\n");
3301         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3302                "es:",
3303                save->es.selector, save->es.attrib,
3304                save->es.limit, save->es.base);
3305         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3306                "cs:",
3307                save->cs.selector, save->cs.attrib,
3308                save->cs.limit, save->cs.base);
3309         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3310                "ss:",
3311                save->ss.selector, save->ss.attrib,
3312                save->ss.limit, save->ss.base);
3313         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3314                "ds:",
3315                save->ds.selector, save->ds.attrib,
3316                save->ds.limit, save->ds.base);
3317         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3318                "fs:",
3319                save01->fs.selector, save01->fs.attrib,
3320                save01->fs.limit, save01->fs.base);
3321         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3322                "gs:",
3323                save01->gs.selector, save01->gs.attrib,
3324                save01->gs.limit, save01->gs.base);
3325         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3326                "gdtr:",
3327                save->gdtr.selector, save->gdtr.attrib,
3328                save->gdtr.limit, save->gdtr.base);
3329         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3330                "ldtr:",
3331                save01->ldtr.selector, save01->ldtr.attrib,
3332                save01->ldtr.limit, save01->ldtr.base);
3333         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3334                "idtr:",
3335                save->idtr.selector, save->idtr.attrib,
3336                save->idtr.limit, save->idtr.base);
3337         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3338                "tr:",
3339                save01->tr.selector, save01->tr.attrib,
3340                save01->tr.limit, save01->tr.base);
3341         pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
3342                save->vmpl, save->cpl, save->efer);
3343         pr_err("%-15s %016llx %-13s %016llx\n",
3344                "cr0:", save->cr0, "cr2:", save->cr2);
3345         pr_err("%-15s %016llx %-13s %016llx\n",
3346                "cr3:", save->cr3, "cr4:", save->cr4);
3347         pr_err("%-15s %016llx %-13s %016llx\n",
3348                "dr6:", save->dr6, "dr7:", save->dr7);
3349         pr_err("%-15s %016llx %-13s %016llx\n",
3350                "rip:", save->rip, "rflags:", save->rflags);
3351         pr_err("%-15s %016llx %-13s %016llx\n",
3352                "rsp:", save->rsp, "rax:", save->rax);
3353         pr_err("%-15s %016llx %-13s %016llx\n",
3354                "star:", save01->star, "lstar:", save01->lstar);
3355         pr_err("%-15s %016llx %-13s %016llx\n",
3356                "cstar:", save01->cstar, "sfmask:", save01->sfmask);
3357         pr_err("%-15s %016llx %-13s %016llx\n",
3358                "kernel_gs_base:", save01->kernel_gs_base,
3359                "sysenter_cs:", save01->sysenter_cs);
3360         pr_err("%-15s %016llx %-13s %016llx\n",
3361                "sysenter_esp:", save01->sysenter_esp,
3362                "sysenter_eip:", save01->sysenter_eip);
3363         pr_err("%-15s %016llx %-13s %016llx\n",
3364                "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3365         pr_err("%-15s %016llx %-13s %016llx\n",
3366                "br_from:", save->br_from, "br_to:", save->br_to);
3367         pr_err("%-15s %016llx %-13s %016llx\n",
3368                "excp_from:", save->last_excp_from,
3369                "excp_to:", save->last_excp_to);
3370 }
3371
3372 static bool svm_check_exit_valid(u64 exit_code)
3373 {
3374         return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3375                 svm_exit_handlers[exit_code]);
3376 }
3377
3378 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3379 {
3380         vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3381         dump_vmcb(vcpu);
3382         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3383         vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3384         vcpu->run->internal.ndata = 2;
3385         vcpu->run->internal.data[0] = exit_code;
3386         vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3387         return 0;
3388 }
3389
3390 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
3391 {
3392         if (!svm_check_exit_valid(exit_code))
3393                 return svm_handle_invalid_exit(vcpu, exit_code);
3394
3395 #ifdef CONFIG_RETPOLINE
3396         if (exit_code == SVM_EXIT_MSR)
3397                 return msr_interception(vcpu);
3398         else if (exit_code == SVM_EXIT_VINTR)
3399                 return interrupt_window_interception(vcpu);
3400         else if (exit_code == SVM_EXIT_INTR)
3401                 return intr_interception(vcpu);
3402         else if (exit_code == SVM_EXIT_HLT)
3403                 return kvm_emulate_halt(vcpu);
3404         else if (exit_code == SVM_EXIT_NPF)
3405                 return npf_interception(vcpu);
3406 #endif
3407         return svm_exit_handlers[exit_code](vcpu);
3408 }
3409
3410 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3411                               u64 *info1, u64 *info2,
3412                               u32 *intr_info, u32 *error_code)
3413 {
3414         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3415
3416         *reason = control->exit_code;
3417         *info1 = control->exit_info_1;
3418         *info2 = control->exit_info_2;
3419         *intr_info = control->exit_int_info;
3420         if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3421             (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3422                 *error_code = control->exit_int_info_err;
3423         else
3424                 *error_code = 0;
3425 }
3426
3427 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
3428 {
3429         struct vcpu_svm *svm = to_svm(vcpu);
3430         struct kvm_run *kvm_run = vcpu->run;
3431         u32 exit_code = svm->vmcb->control.exit_code;
3432
3433         /* SEV-ES guests must use the CR write traps to track CR registers. */
3434         if (!sev_es_guest(vcpu->kvm)) {
3435                 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3436                         vcpu->arch.cr0 = svm->vmcb->save.cr0;
3437                 if (npt_enabled)
3438                         vcpu->arch.cr3 = svm->vmcb->save.cr3;
3439         }
3440
3441         if (is_guest_mode(vcpu)) {
3442                 int vmexit;
3443
3444                 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
3445
3446                 vmexit = nested_svm_exit_special(svm);
3447
3448                 if (vmexit == NESTED_EXIT_CONTINUE)
3449                         vmexit = nested_svm_exit_handled(svm);
3450
3451                 if (vmexit == NESTED_EXIT_DONE)
3452                         return 1;
3453         }
3454
3455         if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3456                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3457                 kvm_run->fail_entry.hardware_entry_failure_reason
3458                         = svm->vmcb->control.exit_code;
3459                 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3460                 dump_vmcb(vcpu);
3461                 return 0;
3462         }
3463
3464         if (exit_fastpath != EXIT_FASTPATH_NONE)
3465                 return 1;
3466
3467         return svm_invoke_exit_handler(vcpu, exit_code);
3468 }
3469
3470 static void pre_svm_run(struct kvm_vcpu *vcpu)
3471 {
3472         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
3473         struct vcpu_svm *svm = to_svm(vcpu);
3474
3475         /*
3476          * If the previous vmrun of the vmcb occurred on a different physical
3477          * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
3478          * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3479          */
3480         if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
3481                 svm->current_vmcb->asid_generation = 0;
3482                 vmcb_mark_all_dirty(svm->vmcb);
3483                 svm->current_vmcb->cpu = vcpu->cpu;
3484         }
3485
3486         if (sev_guest(vcpu->kvm))
3487                 return pre_sev_run(svm, vcpu->cpu);
3488
3489         /* FIXME: handle wraparound of asid_generation */
3490         if (svm->current_vmcb->asid_generation != sd->asid_generation)
3491                 new_asid(svm, sd);
3492 }
3493
3494 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3495 {
3496         struct vcpu_svm *svm = to_svm(vcpu);
3497
3498         svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3499
3500         if (svm->nmi_l1_to_l2)
3501                 return;
3502
3503         svm->nmi_masked = true;
3504         svm_set_iret_intercept(svm);
3505         ++vcpu->stat.nmi_injections;
3506 }
3507
3508 static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
3509 {
3510         struct vcpu_svm *svm = to_svm(vcpu);
3511
3512         if (!is_vnmi_enabled(svm))
3513                 return false;
3514
3515         return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
3516 }
3517
3518 static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
3519 {
3520         struct vcpu_svm *svm = to_svm(vcpu);
3521
3522         if (!is_vnmi_enabled(svm))
3523                 return false;
3524
3525         if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
3526                 return false;
3527
3528         svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
3529         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
3530
3531         /*
3532          * Because the pending NMI is serviced by hardware, KVM can't know when
3533          * the NMI is "injected", but for all intents and purposes, passing the
3534          * NMI off to hardware counts as injection.
3535          */
3536         ++vcpu->stat.nmi_injections;
3537
3538         return true;
3539 }
3540
3541 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
3542 {
3543         struct vcpu_svm *svm = to_svm(vcpu);
3544         u32 type;
3545
3546         if (vcpu->arch.interrupt.soft) {
3547                 if (svm_update_soft_interrupt_rip(vcpu))
3548                         return;
3549
3550                 type = SVM_EVTINJ_TYPE_SOFT;
3551         } else {
3552                 type = SVM_EVTINJ_TYPE_INTR;
3553         }
3554
3555         trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
3556                            vcpu->arch.interrupt.soft, reinjected);
3557         ++vcpu->stat.irq_injections;
3558
3559         svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3560                                        SVM_EVTINJ_VALID | type;
3561 }
3562
3563 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3564                                      int trig_mode, int vector)
3565 {
3566         /*
3567          * apic->apicv_active must be read after vcpu->mode.
3568          * Pairs with smp_store_release in vcpu_enter_guest.
3569          */
3570         bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
3571
3572         /* Note, this is called iff the local APIC is in-kernel. */
3573         if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
3574                 /* Process the interrupt via kvm_check_and_inject_events(). */
3575                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3576                 kvm_vcpu_kick(vcpu);
3577                 return;
3578         }
3579
3580         trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3581         if (in_guest_mode) {
3582                 /*
3583                  * Signal the doorbell to tell hardware to inject the IRQ.  If
3584                  * the vCPU exits the guest before the doorbell chimes, hardware
3585                  * will automatically process AVIC interrupts at the next VMRUN.
3586                  */
3587                 avic_ring_doorbell(vcpu);
3588         } else {
3589                 /*
3590                  * Wake the vCPU if it was blocking.  KVM will then detect the
3591                  * pending IRQ when checking if the vCPU has a wake event.
3592                  */
3593                 kvm_vcpu_wake_up(vcpu);
3594         }
3595 }
3596
3597 static void svm_deliver_interrupt(struct kvm_lapic *apic,  int delivery_mode,
3598                                   int trig_mode, int vector)
3599 {
3600         kvm_lapic_set_irr(vector, apic);
3601
3602         /*
3603          * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
3604          * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3605          * the read of guest_mode.  This guarantees that either VMRUN will see
3606          * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3607          * will signal the doorbell if the CPU has already entered the guest.
3608          */
3609         smp_mb__after_atomic();
3610         svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3611 }
3612
3613 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3614 {
3615         struct vcpu_svm *svm = to_svm(vcpu);
3616
3617         /*
3618          * SEV-ES guests must always keep the CR intercepts cleared. CR
3619          * tracking is done using the CR write traps.
3620          */
3621         if (sev_es_guest(vcpu->kvm))
3622                 return;
3623
3624         if (nested_svm_virtualize_tpr(vcpu))
3625                 return;
3626
3627         svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3628
3629         if (irr == -1)
3630                 return;
3631
3632         if (tpr >= irr)
3633                 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3634 }
3635
3636 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3637 {
3638         struct vcpu_svm *svm = to_svm(vcpu);
3639
3640         if (is_vnmi_enabled(svm))
3641                 return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
3642         else
3643                 return svm->nmi_masked;
3644 }
3645
3646 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3647 {
3648         struct vcpu_svm *svm = to_svm(vcpu);
3649
3650         if (is_vnmi_enabled(svm)) {
3651                 if (masked)
3652                         svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
3653                 else
3654                         svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
3655
3656         } else {
3657                 svm->nmi_masked = masked;
3658                 if (masked)
3659                         svm_set_iret_intercept(svm);
3660                 else
3661                         svm_clr_iret_intercept(svm);
3662         }
3663 }
3664
3665 bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3666 {
3667         struct vcpu_svm *svm = to_svm(vcpu);
3668         struct vmcb *vmcb = svm->vmcb;
3669
3670         if (!gif_set(svm))
3671                 return true;
3672
3673         if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3674                 return false;
3675
3676         if (svm_get_nmi_mask(vcpu))
3677                 return true;
3678
3679         return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
3680 }
3681
3682 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3683 {
3684         struct vcpu_svm *svm = to_svm(vcpu);
3685         if (svm->nested.nested_run_pending)
3686                 return -EBUSY;
3687
3688         if (svm_nmi_blocked(vcpu))
3689                 return 0;
3690
3691         /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
3692         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3693                 return -EBUSY;
3694         return 1;
3695 }
3696
3697 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3698 {
3699         struct vcpu_svm *svm = to_svm(vcpu);
3700         struct vmcb *vmcb = svm->vmcb;
3701
3702         if (!gif_set(svm))
3703                 return true;
3704
3705         if (is_guest_mode(vcpu)) {
3706                 /* As long as interrupts are being delivered...  */
3707                 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3708                     ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
3709                     : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3710                         return true;
3711
3712                 /* ... vmexits aren't blocked by the interrupt shadow  */
3713                 if (nested_exit_on_intr(svm))
3714                         return false;
3715         } else {
3716                 if (!svm_get_if_flag(vcpu))
3717                         return true;
3718         }
3719
3720         return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3721 }
3722
3723 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3724 {
3725         struct vcpu_svm *svm = to_svm(vcpu);
3726
3727         if (svm->nested.nested_run_pending)
3728                 return -EBUSY;
3729
3730         if (svm_interrupt_blocked(vcpu))
3731                 return 0;
3732
3733         /*
3734          * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3735          * e.g. if the IRQ arrived asynchronously after checking nested events.
3736          */
3737         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3738                 return -EBUSY;
3739
3740         return 1;
3741 }
3742
3743 static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3744 {
3745         struct vcpu_svm *svm = to_svm(vcpu);
3746
3747         /*
3748          * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3749          * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3750          * get that intercept, this function will be called again though and
3751          * we'll get the vintr intercept. However, if the vGIF feature is
3752          * enabled, the STGI interception will not occur. Enable the irq
3753          * window under the assumption that the hardware will set the GIF.
3754          */
3755         if (vgif || gif_set(svm)) {
3756                 /*
3757                  * IRQ window is not needed when AVIC is enabled,
3758                  * unless we have pending ExtINT since it cannot be injected
3759                  * via AVIC. In such case, KVM needs to temporarily disable AVIC,
3760                  * and fallback to injecting IRQ via V_IRQ.
3761                  *
3762                  * If running nested, AVIC is already locally inhibited
3763                  * on this vCPU, therefore there is no need to request
3764                  * the VM wide AVIC inhibition.
3765                  */
3766                 if (!is_guest_mode(vcpu))
3767                         kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3768
3769                 svm_set_vintr(svm);
3770         }
3771 }
3772
3773 static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
3774 {
3775         struct vcpu_svm *svm = to_svm(vcpu);
3776
3777         /*
3778          * KVM should never request an NMI window when vNMI is enabled, as KVM
3779          * allows at most one to-be-injected NMI and one pending NMI, i.e. if
3780          * two NMIs arrive simultaneously, KVM will inject one and set
3781          * V_NMI_PENDING for the other.  WARN, but continue with the standard
3782          * single-step approach to try and salvage the pending NMI.
3783          */
3784         WARN_ON_ONCE(is_vnmi_enabled(svm));
3785
3786         if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
3787                 return; /* IRET will cause a vm exit */
3788
3789         if (!gif_set(svm)) {
3790                 if (vgif)
3791                         svm_set_intercept(svm, INTERCEPT_STGI);
3792                 return; /* STGI will cause a vm exit */
3793         }
3794
3795         /*
3796          * Something prevents NMI from been injected. Single step over possible
3797          * problem (IRET or exception injection or interrupt shadow)
3798          */
3799         svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
3800         svm->nmi_singlestep = true;
3801         svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3802 }
3803
3804 static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
3805 {
3806         struct vcpu_svm *svm = to_svm(vcpu);
3807
3808         /*
3809          * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
3810          * A TLB flush for the current ASID flushes both "host" and "guest" TLB
3811          * entries, and thus is a superset of Hyper-V's fine grained flushing.
3812          */
3813         kvm_hv_vcpu_purge_flush_tlb(vcpu);
3814
3815         /*
3816          * Flush only the current ASID even if the TLB flush was invoked via
3817          * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
3818          * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3819          * unconditionally does a TLB flush on both nested VM-Enter and nested
3820          * VM-Exit (via kvm_mmu_reset_context()).
3821          */
3822         if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3823                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3824         else
3825                 svm->current_vmcb->asid_generation--;
3826 }
3827
3828 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
3829 {
3830         hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
3831
3832         /*
3833          * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
3834          * flush the NPT mappings via hypercall as flushing the ASID only
3835          * affects virtual to physical mappings, it does not invalidate guest
3836          * physical to host physical mappings.
3837          */
3838         if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
3839                 hyperv_flush_guest_mapping(root_tdp);
3840
3841         svm_flush_tlb_asid(vcpu);
3842 }
3843
3844 static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
3845 {
3846         /*
3847          * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
3848          * flushes should be routed to hv_flush_remote_tlbs() without requesting
3849          * a "regular" remote flush.  Reaching this point means either there's
3850          * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
3851          * which might be fatal to the guest.  Yell, but try to recover.
3852          */
3853         if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
3854                 hv_flush_remote_tlbs(vcpu->kvm);
3855
3856         svm_flush_tlb_asid(vcpu);
3857 }
3858
3859 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3860 {
3861         struct vcpu_svm *svm = to_svm(vcpu);
3862
3863         invlpga(gva, svm->vmcb->control.asid);
3864 }
3865
3866 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3867 {
3868         struct vcpu_svm *svm = to_svm(vcpu);
3869
3870         if (nested_svm_virtualize_tpr(vcpu))
3871                 return;
3872
3873         if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3874                 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3875                 kvm_set_cr8(vcpu, cr8);
3876         }
3877 }
3878
3879 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3880 {
3881         struct vcpu_svm *svm = to_svm(vcpu);
3882         u64 cr8;
3883
3884         if (nested_svm_virtualize_tpr(vcpu) ||
3885             kvm_vcpu_apicv_active(vcpu))
3886                 return;
3887
3888         cr8 = kvm_get_cr8(vcpu);
3889         svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3890         svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3891 }
3892
3893 static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
3894                                         int type)
3895 {
3896         bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
3897         bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
3898         struct vcpu_svm *svm = to_svm(vcpu);
3899
3900         /*
3901          * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
3902          * associated with the original soft exception/interrupt.  next_rip is
3903          * cleared on all exits that can occur while vectoring an event, so KVM
3904          * needs to manually set next_rip for re-injection.  Unlike the !nrips
3905          * case below, this needs to be done if and only if KVM is re-injecting
3906          * the same event, i.e. if the event is a soft exception/interrupt,
3907          * otherwise next_rip is unused on VMRUN.
3908          */
3909         if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
3910             kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
3911                 svm->vmcb->control.next_rip = svm->soft_int_next_rip;
3912         /*
3913          * If NRIPS isn't enabled, KVM must manually advance RIP prior to
3914          * injecting the soft exception/interrupt.  That advancement needs to
3915          * be unwound if vectoring didn't complete.  Note, the new event may
3916          * not be the injected event, e.g. if KVM injected an INTn, the INTn
3917          * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
3918          * be the reported vectored event, but RIP still needs to be unwound.
3919          */
3920         else if (!nrips && (is_soft || is_exception) &&
3921                  kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
3922                 kvm_rip_write(vcpu, svm->soft_int_old_rip);
3923 }
3924
3925 static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
3926 {
3927         struct vcpu_svm *svm = to_svm(vcpu);
3928         u8 vector;
3929         int type;
3930         u32 exitintinfo = svm->vmcb->control.exit_int_info;
3931         bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
3932         bool soft_int_injected = svm->soft_int_injected;
3933
3934         svm->nmi_l1_to_l2 = false;
3935         svm->soft_int_injected = false;
3936
3937         /*
3938          * If we've made progress since setting HF_IRET_MASK, we've
3939          * executed an IRET and can allow NMI injection.
3940          */
3941         if (svm->awaiting_iret_completion &&
3942             (sev_es_guest(vcpu->kvm) ||
3943              kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
3944                 svm->awaiting_iret_completion = false;
3945                 svm->nmi_masked = false;
3946                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3947         }
3948
3949         vcpu->arch.nmi_injected = false;
3950         kvm_clear_exception_queue(vcpu);
3951         kvm_clear_interrupt_queue(vcpu);
3952
3953         if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3954                 return;
3955
3956         kvm_make_request(KVM_REQ_EVENT, vcpu);
3957
3958         vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3959         type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3960
3961         if (soft_int_injected)
3962                 svm_complete_soft_interrupt(vcpu, vector, type);
3963
3964         switch (type) {
3965         case SVM_EXITINTINFO_TYPE_NMI:
3966                 vcpu->arch.nmi_injected = true;
3967                 svm->nmi_l1_to_l2 = nmi_l1_to_l2;
3968                 break;
3969         case SVM_EXITINTINFO_TYPE_EXEPT:
3970                 /*
3971                  * Never re-inject a #VC exception.
3972                  */
3973                 if (vector == X86_TRAP_VC)
3974                         break;
3975
3976                 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3977                         u32 err = svm->vmcb->control.exit_int_info_err;
3978                         kvm_requeue_exception_e(vcpu, vector, err);
3979
3980                 } else
3981                         kvm_requeue_exception(vcpu, vector);
3982                 break;
3983         case SVM_EXITINTINFO_TYPE_INTR:
3984                 kvm_queue_interrupt(vcpu, vector, false);
3985                 break;
3986         case SVM_EXITINTINFO_TYPE_SOFT:
3987                 kvm_queue_interrupt(vcpu, vector, true);
3988                 break;
3989         default:
3990                 break;
3991         }
3992
3993 }
3994
3995 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3996 {
3997         struct vcpu_svm *svm = to_svm(vcpu);
3998         struct vmcb_control_area *control = &svm->vmcb->control;
3999
4000         control->exit_int_info = control->event_inj;
4001         control->exit_int_info_err = control->event_inj_err;
4002         control->event_inj = 0;
4003         svm_complete_interrupts(vcpu);
4004 }
4005
4006 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
4007 {
4008         return 1;
4009 }
4010
4011 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
4012 {
4013         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
4014
4015         /*
4016          * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
4017          * can't read guest memory (dereference memslots) to decode the WRMSR.
4018          */
4019         if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
4020             nrips && control->next_rip)
4021                 return handle_fastpath_set_msr_irqoff(vcpu);
4022
4023         return EXIT_FASTPATH_NONE;
4024 }
4025
4026 static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
4027 {
4028         struct vcpu_svm *svm = to_svm(vcpu);
4029
4030         guest_state_enter_irqoff();
4031
4032         if (sev_es_guest(vcpu->kvm))
4033                 __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
4034         else
4035                 __svm_vcpu_run(svm, spec_ctrl_intercepted);
4036
4037         guest_state_exit_irqoff();
4038 }
4039
4040 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
4041 {
4042         struct vcpu_svm *svm = to_svm(vcpu);
4043         bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
4044
4045         trace_kvm_entry(vcpu);
4046
4047         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4048         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4049         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4050
4051         /*
4052          * Disable singlestep if we're injecting an interrupt/exception.
4053          * We don't want our modified rflags to be pushed on the stack where
4054          * we might not be able to easily reset them if we disabled NMI
4055          * singlestep later.
4056          */
4057         if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
4058                 /*
4059                  * Event injection happens before external interrupts cause a
4060                  * vmexit and interrupts are disabled here, so smp_send_reschedule
4061                  * is enough to force an immediate vmexit.
4062                  */
4063                 disable_nmi_singlestep(svm);
4064                 smp_send_reschedule(vcpu->cpu);
4065         }
4066
4067         pre_svm_run(vcpu);
4068
4069         sync_lapic_to_cr8(vcpu);
4070
4071         if (unlikely(svm->asid != svm->vmcb->control.asid)) {
4072                 svm->vmcb->control.asid = svm->asid;
4073                 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
4074         }
4075         svm->vmcb->save.cr2 = vcpu->arch.cr2;
4076
4077         svm_hv_update_vp_id(svm->vmcb, vcpu);
4078
4079         /*
4080          * Run with all-zero DR6 unless needed, so that we can get the exact cause
4081          * of a #DB.
4082          */
4083         if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
4084                 svm_set_dr6(svm, vcpu->arch.dr6);
4085         else
4086                 svm_set_dr6(svm, DR6_ACTIVE_LOW);
4087
4088         clgi();
4089         kvm_load_guest_xsave_state(vcpu);
4090
4091         kvm_wait_lapic_expire(vcpu);
4092
4093         /*
4094          * If this vCPU has touched SPEC_CTRL, restore the guest's value if
4095          * it's non-zero. Since vmentry is serialising on affected CPUs, there
4096          * is no need to worry about the conditional branch over the wrmsr
4097          * being speculatively taken.
4098          */
4099         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4100                 x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
4101
4102         svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
4103
4104         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4105                 x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
4106
4107         if (!sev_es_guest(vcpu->kvm)) {
4108                 vcpu->arch.cr2 = svm->vmcb->save.cr2;
4109                 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4110                 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4111                 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4112         }
4113         vcpu->arch.regs_dirty = 0;
4114
4115         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4116                 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
4117
4118         kvm_load_host_xsave_state(vcpu);
4119         stgi();
4120
4121         /* Any pending NMI will happen here */
4122
4123         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4124                 kvm_after_interrupt(vcpu);
4125
4126         sync_cr8_to_lapic(vcpu);
4127
4128         svm->next_rip = 0;
4129         if (is_guest_mode(vcpu)) {
4130                 nested_sync_control_from_vmcb02(svm);
4131
4132                 /* Track VMRUNs that have made past consistency checking */
4133                 if (svm->nested.nested_run_pending &&
4134                     svm->vmcb->control.exit_code != SVM_EXIT_ERR)
4135                         ++vcpu->stat.nested_run;
4136
4137                 svm->nested.nested_run_pending = 0;
4138         }
4139
4140         svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4141         vmcb_mark_all_clean(svm->vmcb);
4142
4143         /* if exit due to PF check for async PF */
4144         if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4145                 vcpu->arch.apf.host_apf_flags =
4146                         kvm_read_and_reset_apf_flags();
4147
4148         vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
4149
4150         /*
4151          * We need to handle MC intercepts here before the vcpu has a chance to
4152          * change the physical cpu
4153          */
4154         if (unlikely(svm->vmcb->control.exit_code ==
4155                      SVM_EXIT_EXCP_BASE + MC_VECTOR))
4156                 svm_handle_mce(vcpu);
4157
4158         trace_kvm_exit(vcpu, KVM_ISA_SVM);
4159
4160         svm_complete_interrupts(vcpu);
4161
4162         if (is_guest_mode(vcpu))
4163                 return EXIT_FASTPATH_NONE;
4164
4165         return svm_exit_handlers_fastpath(vcpu);
4166 }
4167
4168 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
4169                              int root_level)
4170 {
4171         struct vcpu_svm *svm = to_svm(vcpu);
4172         unsigned long cr3;
4173
4174         if (npt_enabled) {
4175                 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
4176                 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
4177
4178                 hv_track_root_tdp(vcpu, root_hpa);
4179
4180                 cr3 = vcpu->arch.cr3;
4181         } else if (root_level >= PT64_ROOT_4LEVEL) {
4182                 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
4183         } else {
4184                 /* PCID in the guest should be impossible with a 32-bit MMU. */
4185                 WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
4186                 cr3 = root_hpa;
4187         }
4188
4189         svm->vmcb->save.cr3 = cr3;
4190         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
4191 }
4192
4193 static void
4194 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4195 {
4196         /*
4197          * Patch in the VMMCALL instruction:
4198          */
4199         hypercall[0] = 0x0f;
4200         hypercall[1] = 0x01;
4201         hypercall[2] = 0xd9;
4202 }
4203
4204 /*
4205  * The kvm parameter can be NULL (module initialization, or invocation before
4206  * VM creation). Be sure to check the kvm parameter before using it.
4207  */
4208 static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
4209 {
4210         switch (index) {
4211         case MSR_IA32_MCG_EXT_CTL:
4212         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
4213                 return false;
4214         case MSR_IA32_SMBASE:
4215                 if (!IS_ENABLED(CONFIG_KVM_SMM))
4216                         return false;
4217                 /* SEV-ES guests do not support SMM, so report false */
4218                 if (kvm && sev_es_guest(kvm))
4219                         return false;
4220                 break;
4221         default:
4222                 break;
4223         }
4224
4225         return true;
4226 }
4227
4228 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
4229 {
4230         struct vcpu_svm *svm = to_svm(vcpu);
4231         struct kvm_cpuid_entry2 *best;
4232
4233         vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4234                                     boot_cpu_has(X86_FEATURE_XSAVE) &&
4235                                     boot_cpu_has(X86_FEATURE_XSAVES);
4236
4237         /* Update nrips enabled cache */
4238         svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
4239                              guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
4240
4241         svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
4242         svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
4243
4244         svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4245
4246         svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
4247                         guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
4248
4249         svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
4250                         guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
4251
4252         svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
4253
4254         svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI);
4255
4256         svm_recalc_instruction_intercepts(vcpu, svm);
4257
4258         if (boot_cpu_has(X86_FEATURE_IBPB))
4259                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
4260                                      !!guest_has_pred_cmd_msr(vcpu));
4261
4262         if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
4263                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
4264                                      !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
4265
4266         /* For sev guests, the memory encryption bit is not reserved in CR3.  */
4267         if (sev_guest(vcpu->kvm)) {
4268                 best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
4269                 if (best)
4270                         vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4271         }
4272
4273         init_vmcb_after_set_cpuid(vcpu);
4274 }
4275
4276 static bool svm_has_wbinvd_exit(void)
4277 {
4278         return true;
4279 }
4280
4281 #define PRE_EX(exit)  { .exit_code = (exit), \
4282                         .stage = X86_ICPT_PRE_EXCEPT, }
4283 #define POST_EX(exit) { .exit_code = (exit), \
4284                         .stage = X86_ICPT_POST_EXCEPT, }
4285 #define POST_MEM(exit) { .exit_code = (exit), \
4286                         .stage = X86_ICPT_POST_MEMACCESS, }
4287
4288 static const struct __x86_intercept {
4289         u32 exit_code;
4290         enum x86_intercept_stage stage;
4291 } x86_intercept_map[] = {
4292         [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
4293         [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
4294         [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
4295         [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
4296         [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
4297         [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
4298         [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
4299         [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
4300         [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
4301         [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
4302         [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
4303         [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
4304         [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
4305         [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
4306         [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
4307         [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
4308         [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
4309         [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
4310         [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
4311         [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
4312         [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
4313         [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
4314         [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
4315         [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
4316         [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
4317         [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
4318         [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
4319         [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
4320         [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
4321         [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
4322         [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
4323         [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
4324         [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
4325         [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
4326         [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
4327         [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
4328         [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
4329         [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
4330         [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
4331         [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
4332         [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
4333         [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
4334         [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
4335         [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
4336         [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
4337         [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
4338         [x86_intercept_xsetbv]          = PRE_EX(SVM_EXIT_XSETBV),
4339 };
4340
4341 #undef PRE_EX
4342 #undef POST_EX
4343 #undef POST_MEM
4344
4345 static int svm_check_intercept(struct kvm_vcpu *vcpu,
4346                                struct x86_instruction_info *info,
4347                                enum x86_intercept_stage stage,
4348                                struct x86_exception *exception)
4349 {
4350         struct vcpu_svm *svm = to_svm(vcpu);
4351         int vmexit, ret = X86EMUL_CONTINUE;
4352         struct __x86_intercept icpt_info;
4353         struct vmcb *vmcb = svm->vmcb;
4354
4355         if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4356                 goto out;
4357
4358         icpt_info = x86_intercept_map[info->intercept];
4359
4360         if (stage != icpt_info.stage)
4361                 goto out;
4362
4363         switch (icpt_info.exit_code) {
4364         case SVM_EXIT_READ_CR0:
4365                 if (info->intercept == x86_intercept_cr_read)
4366                         icpt_info.exit_code += info->modrm_reg;
4367                 break;
4368         case SVM_EXIT_WRITE_CR0: {
4369                 unsigned long cr0, val;
4370
4371                 if (info->intercept == x86_intercept_cr_write)
4372                         icpt_info.exit_code += info->modrm_reg;
4373
4374                 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4375                     info->intercept == x86_intercept_clts)
4376                         break;
4377
4378                 if (!(vmcb12_is_intercept(&svm->nested.ctl,
4379                                         INTERCEPT_SELECTIVE_CR0)))
4380                         break;
4381
4382                 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4383                 val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4384
4385                 if (info->intercept == x86_intercept_lmsw) {
4386                         cr0 &= 0xfUL;
4387                         val &= 0xfUL;
4388                         /* lmsw can't clear PE - catch this here */
4389                         if (cr0 & X86_CR0_PE)
4390                                 val |= X86_CR0_PE;
4391                 }
4392
4393                 if (cr0 ^ val)
4394                         icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4395
4396                 break;
4397         }
4398         case SVM_EXIT_READ_DR0:
4399         case SVM_EXIT_WRITE_DR0:
4400                 icpt_info.exit_code += info->modrm_reg;
4401                 break;
4402         case SVM_EXIT_MSR:
4403                 if (info->intercept == x86_intercept_wrmsr)
4404                         vmcb->control.exit_info_1 = 1;
4405                 else
4406                         vmcb->control.exit_info_1 = 0;
4407                 break;
4408         case SVM_EXIT_PAUSE:
4409                 /*
4410                  * We get this for NOP only, but pause
4411                  * is rep not, check this here
4412                  */
4413                 if (info->rep_prefix != REPE_PREFIX)
4414                         goto out;
4415                 break;
4416         case SVM_EXIT_IOIO: {
4417                 u64 exit_info;
4418                 u32 bytes;
4419
4420                 if (info->intercept == x86_intercept_in ||
4421                     info->intercept == x86_intercept_ins) {
4422                         exit_info = ((info->src_val & 0xffff) << 16) |
4423                                 SVM_IOIO_TYPE_MASK;
4424                         bytes = info->dst_bytes;
4425                 } else {
4426                         exit_info = (info->dst_val & 0xffff) << 16;
4427                         bytes = info->src_bytes;
4428                 }
4429
4430                 if (info->intercept == x86_intercept_outs ||
4431                     info->intercept == x86_intercept_ins)
4432                         exit_info |= SVM_IOIO_STR_MASK;
4433
4434                 if (info->rep_prefix)
4435                         exit_info |= SVM_IOIO_REP_MASK;
4436
4437                 bytes = min(bytes, 4u);
4438
4439                 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4440
4441                 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4442
4443                 vmcb->control.exit_info_1 = exit_info;
4444                 vmcb->control.exit_info_2 = info->next_rip;
4445
4446                 break;
4447         }
4448         default:
4449                 break;
4450         }
4451
4452         /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4453         if (static_cpu_has(X86_FEATURE_NRIPS))
4454                 vmcb->control.next_rip  = info->next_rip;
4455         vmcb->control.exit_code = icpt_info.exit_code;
4456         vmexit = nested_svm_exit_handled(svm);
4457
4458         ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4459                                            : X86EMUL_CONTINUE;
4460
4461 out:
4462         return ret;
4463 }
4464
4465 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4466 {
4467         if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
4468                 vcpu->arch.at_instruction_boundary = true;
4469 }
4470
4471 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4472 {
4473         if (!kvm_pause_in_guest(vcpu->kvm))
4474                 shrink_ple_window(vcpu);
4475 }
4476
4477 static void svm_setup_mce(struct kvm_vcpu *vcpu)
4478 {
4479         /* [63:9] are reserved. */
4480         vcpu->arch.mcg_cap &= 0x1ff;
4481 }
4482
4483 #ifdef CONFIG_KVM_SMM
4484 bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4485 {
4486         struct vcpu_svm *svm = to_svm(vcpu);
4487
4488         /* Per APM Vol.2 15.22.2 "Response to SMI" */
4489         if (!gif_set(svm))
4490                 return true;
4491
4492         return is_smm(vcpu);
4493 }
4494
4495 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4496 {
4497         struct vcpu_svm *svm = to_svm(vcpu);
4498         if (svm->nested.nested_run_pending)
4499                 return -EBUSY;
4500
4501         if (svm_smi_blocked(vcpu))
4502                 return 0;
4503
4504         /* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
4505         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4506                 return -EBUSY;
4507
4508         return 1;
4509 }
4510
4511 static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
4512 {
4513         struct vcpu_svm *svm = to_svm(vcpu);
4514         struct kvm_host_map map_save;
4515         int ret;
4516
4517         if (!is_guest_mode(vcpu))
4518                 return 0;
4519
4520         /*
4521          * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
4522          * responsible for ensuring nested SVM and SMIs are mutually exclusive.
4523          */
4524
4525         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4526                 return 1;
4527
4528         smram->smram64.svm_guest_flag = 1;
4529         smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
4530
4531         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4532         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4533         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4534
4535         ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
4536         if (ret)
4537                 return ret;
4538
4539         /*
4540          * KVM uses VMCB01 to store L1 host state while L2 runs but
4541          * VMCB01 is going to be used during SMM and thus the state will
4542          * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save
4543          * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
4544          * format of the area is identical to guest save area offsetted
4545          * by 0x400 (matches the offset of 'struct vmcb_save_area'
4546          * within 'struct vmcb'). Note: HSAVE area may also be used by
4547          * L1 hypervisor to save additional host context (e.g. KVM does
4548          * that, see svm_prepare_switch_to_guest()) which must be
4549          * preserved.
4550          */
4551         if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4552                 return 1;
4553
4554         BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4555
4556         svm_copy_vmrun_state(map_save.hva + 0x400,
4557                              &svm->vmcb01.ptr->save);
4558
4559         kvm_vcpu_unmap(vcpu, &map_save, true);
4560         return 0;
4561 }
4562
4563 static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
4564 {
4565         struct vcpu_svm *svm = to_svm(vcpu);
4566         struct kvm_host_map map, map_save;
4567         struct vmcb *vmcb12;
4568         int ret;
4569
4570         const struct kvm_smram_state_64 *smram64 = &smram->smram64;
4571
4572         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4573                 return 0;
4574
4575         /* Non-zero if SMI arrived while vCPU was in guest mode. */
4576         if (!smram64->svm_guest_flag)
4577                 return 0;
4578
4579         if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4580                 return 1;
4581
4582         if (!(smram64->efer & EFER_SVME))
4583                 return 1;
4584
4585         if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
4586                 return 1;
4587
4588         ret = 1;
4589         if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4590                 goto unmap_map;
4591
4592         if (svm_allocate_nested(svm))
4593                 goto unmap_save;
4594
4595         /*
4596          * Restore L1 host state from L1 HSAVE area as VMCB01 was
4597          * used during SMM (see svm_enter_smm())
4598          */
4599
4600         svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4601
4602         /*
4603          * Enter the nested guest now
4604          */
4605
4606         vmcb_mark_all_dirty(svm->vmcb01.ptr);
4607
4608         vmcb12 = map.hva;
4609         nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4610         nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4611         ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
4612
4613         if (ret)
4614                 goto unmap_save;
4615
4616         svm->nested.nested_run_pending = 1;
4617
4618 unmap_save:
4619         kvm_vcpu_unmap(vcpu, &map_save, true);
4620 unmap_map:
4621         kvm_vcpu_unmap(vcpu, &map, true);
4622         return ret;
4623 }
4624
4625 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4626 {
4627         struct vcpu_svm *svm = to_svm(vcpu);
4628
4629         if (!gif_set(svm)) {
4630                 if (vgif)
4631                         svm_set_intercept(svm, INTERCEPT_STGI);
4632                 /* STGI will cause a vm exit */
4633         } else {
4634                 /* We must be in SMM; RSM will cause a vmexit anyway.  */
4635         }
4636 }
4637 #endif
4638
4639 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4640                                         void *insn, int insn_len)
4641 {
4642         bool smep, smap, is_user;
4643         u64 error_code;
4644
4645         /* Emulation is always possible when KVM has access to all guest state. */
4646         if (!sev_guest(vcpu->kvm))
4647                 return true;
4648
4649         /* #UD and #GP should never be intercepted for SEV guests. */
4650         WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4651                                   EMULTYPE_TRAP_UD_FORCED |
4652                                   EMULTYPE_VMWARE_GP));
4653
4654         /*
4655          * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4656          * to guest register state.
4657          */
4658         if (sev_es_guest(vcpu->kvm))
4659                 return false;
4660
4661         /*
4662          * Emulation is possible if the instruction is already decoded, e.g.
4663          * when completing I/O after returning from userspace.
4664          */
4665         if (emul_type & EMULTYPE_NO_DECODE)
4666                 return true;
4667
4668         /*
4669          * Emulation is possible for SEV guests if and only if a prefilled
4670          * buffer containing the bytes of the intercepted instruction is
4671          * available. SEV guest memory is encrypted with a guest specific key
4672          * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
4673          * decode garbage.
4674          *
4675          * Inject #UD if KVM reached this point without an instruction buffer.
4676          * In practice, this path should never be hit by a well-behaved guest,
4677          * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
4678          * is still theoretically reachable, e.g. via unaccelerated fault-like
4679          * AVIC access, and needs to be handled by KVM to avoid putting the
4680          * guest into an infinite loop.   Injecting #UD is somewhat arbitrary,
4681          * but its the least awful option given lack of insight into the guest.
4682          */
4683         if (unlikely(!insn)) {
4684                 kvm_queue_exception(vcpu, UD_VECTOR);
4685                 return false;
4686         }
4687
4688         /*
4689          * Emulate for SEV guests if the insn buffer is not empty.  The buffer
4690          * will be empty if the DecodeAssist microcode cannot fetch bytes for
4691          * the faulting instruction because the code fetch itself faulted, e.g.
4692          * the guest attempted to fetch from emulated MMIO or a guest page
4693          * table used to translate CS:RIP resides in emulated MMIO.
4694          */
4695         if (likely(insn_len))
4696                 return true;
4697
4698         /*
4699          * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4700          *
4701          * Errata:
4702          * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4703          * possible that CPU microcode implementing DecodeAssist will fail to
4704          * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4705          * be '0'.  This happens because microcode reads CS:RIP using a _data_
4706          * loap uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
4707          * gives up and does not fill the instruction bytes buffer.
4708          *
4709          * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4710          * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4711          * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4712          * GuestIntrBytes field of the VMCB.
4713          *
4714          * This does _not_ mean that the erratum has been encountered, as the
4715          * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4716          * #PF, e.g. if the guest attempt to execute from emulated MMIO and
4717          * encountered a reserved/not-present #PF.
4718          *
4719          * To hit the erratum, the following conditions must be true:
4720          *    1. CR4.SMAP=1 (obviously).
4721          *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
4722          *       have been hit as the guest would have encountered a SMEP
4723          *       violation #PF, not a #NPF.
4724          *    3. The #NPF is not due to a code fetch, in which case failure to
4725          *       retrieve the instruction bytes is legitimate (see abvoe).
4726          *
4727          * In addition, don't apply the erratum workaround if the #NPF occurred
4728          * while translating guest page tables (see below).
4729          */
4730         error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4731         if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4732                 goto resume_guest;
4733
4734         smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
4735         smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
4736         is_user = svm_get_cpl(vcpu) == 3;
4737         if (smap && (!smep || is_user)) {
4738                 pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
4739
4740                 /*
4741                  * If the fault occurred in userspace, arbitrarily inject #GP
4742                  * to avoid killing the guest and to hopefully avoid confusing
4743                  * the guest kernel too much, e.g. injecting #PF would not be
4744                  * coherent with respect to the guest's page tables.  Request
4745                  * triple fault if the fault occurred in the kernel as there's
4746                  * no fault that KVM can inject without confusing the guest.
4747                  * In practice, the triple fault is moot as no sane SEV kernel
4748                  * will execute from user memory while also running with SMAP=1.
4749                  */
4750                 if (is_user)
4751                         kvm_inject_gp(vcpu, 0);
4752                 else
4753                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4754         }
4755
4756 resume_guest:
4757         /*
4758          * If the erratum was not hit, simply resume the guest and let it fault
4759          * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
4760          * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
4761          * userspace will kill the guest, and letting the emulator read garbage
4762          * will yield random behavior and potentially corrupt the guest.
4763          *
4764          * Simply resuming the guest is technically not a violation of the SEV
4765          * architecture.  AMD's APM states that all code fetches and page table
4766          * accesses for SEV guest are encrypted, regardless of the C-Bit.  The
4767          * APM also states that encrypted accesses to MMIO are "ignored", but
4768          * doesn't explicitly define "ignored", i.e. doing nothing and letting
4769          * the guest spin is technically "ignoring" the access.
4770          */
4771         return false;
4772 }
4773
4774 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4775 {
4776         struct vcpu_svm *svm = to_svm(vcpu);
4777
4778         return !gif_set(svm);
4779 }
4780
4781 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4782 {
4783         if (!sev_es_guest(vcpu->kvm))
4784                 return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4785
4786         sev_vcpu_deliver_sipi_vector(vcpu, vector);
4787 }
4788
4789 static void svm_vm_destroy(struct kvm *kvm)
4790 {
4791         avic_vm_destroy(kvm);
4792         sev_vm_destroy(kvm);
4793 }
4794
4795 static int svm_vm_init(struct kvm *kvm)
4796 {
4797         if (!pause_filter_count || !pause_filter_thresh)
4798                 kvm->arch.pause_in_guest = true;
4799
4800         if (enable_apicv) {
4801                 int ret = avic_vm_init(kvm);
4802                 if (ret)
4803                         return ret;
4804         }
4805
4806         return 0;
4807 }
4808
4809 static struct kvm_x86_ops svm_x86_ops __initdata = {
4810         .name = KBUILD_MODNAME,
4811
4812         .check_processor_compatibility = svm_check_processor_compat,
4813
4814         .hardware_unsetup = svm_hardware_unsetup,
4815         .hardware_enable = svm_hardware_enable,
4816         .hardware_disable = svm_hardware_disable,
4817         .has_emulated_msr = svm_has_emulated_msr,
4818
4819         .vcpu_create = svm_vcpu_create,
4820         .vcpu_free = svm_vcpu_free,
4821         .vcpu_reset = svm_vcpu_reset,
4822
4823         .vm_size = sizeof(struct kvm_svm),
4824         .vm_init = svm_vm_init,
4825         .vm_destroy = svm_vm_destroy,
4826
4827         .prepare_switch_to_guest = svm_prepare_switch_to_guest,
4828         .vcpu_load = svm_vcpu_load,
4829         .vcpu_put = svm_vcpu_put,
4830         .vcpu_blocking = avic_vcpu_blocking,
4831         .vcpu_unblocking = avic_vcpu_unblocking,
4832
4833         .update_exception_bitmap = svm_update_exception_bitmap,
4834         .get_msr_feature = svm_get_msr_feature,
4835         .get_msr = svm_get_msr,
4836         .set_msr = svm_set_msr,
4837         .get_segment_base = svm_get_segment_base,
4838         .get_segment = svm_get_segment,
4839         .set_segment = svm_set_segment,
4840         .get_cpl = svm_get_cpl,
4841         .get_cs_db_l_bits = svm_get_cs_db_l_bits,
4842         .set_cr0 = svm_set_cr0,
4843         .post_set_cr3 = sev_post_set_cr3,
4844         .is_valid_cr4 = svm_is_valid_cr4,
4845         .set_cr4 = svm_set_cr4,
4846         .set_efer = svm_set_efer,
4847         .get_idt = svm_get_idt,
4848         .set_idt = svm_set_idt,
4849         .get_gdt = svm_get_gdt,
4850         .set_gdt = svm_set_gdt,
4851         .set_dr7 = svm_set_dr7,
4852         .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4853         .cache_reg = svm_cache_reg,
4854         .get_rflags = svm_get_rflags,
4855         .set_rflags = svm_set_rflags,
4856         .get_if_flag = svm_get_if_flag,
4857
4858         .flush_tlb_all = svm_flush_tlb_all,
4859         .flush_tlb_current = svm_flush_tlb_current,
4860         .flush_tlb_gva = svm_flush_tlb_gva,
4861         .flush_tlb_guest = svm_flush_tlb_asid,
4862
4863         .vcpu_pre_run = svm_vcpu_pre_run,
4864         .vcpu_run = svm_vcpu_run,
4865         .handle_exit = svm_handle_exit,
4866         .skip_emulated_instruction = svm_skip_emulated_instruction,
4867         .update_emulated_instruction = NULL,
4868         .set_interrupt_shadow = svm_set_interrupt_shadow,
4869         .get_interrupt_shadow = svm_get_interrupt_shadow,
4870         .patch_hypercall = svm_patch_hypercall,
4871         .inject_irq = svm_inject_irq,
4872         .inject_nmi = svm_inject_nmi,
4873         .is_vnmi_pending = svm_is_vnmi_pending,
4874         .set_vnmi_pending = svm_set_vnmi_pending,
4875         .inject_exception = svm_inject_exception,
4876         .cancel_injection = svm_cancel_injection,
4877         .interrupt_allowed = svm_interrupt_allowed,
4878         .nmi_allowed = svm_nmi_allowed,
4879         .get_nmi_mask = svm_get_nmi_mask,
4880         .set_nmi_mask = svm_set_nmi_mask,
4881         .enable_nmi_window = svm_enable_nmi_window,
4882         .enable_irq_window = svm_enable_irq_window,
4883         .update_cr8_intercept = svm_update_cr8_intercept,
4884         .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
4885         .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
4886         .apicv_post_state_restore = avic_apicv_post_state_restore,
4887         .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
4888
4889         .get_exit_info = svm_get_exit_info,
4890
4891         .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4892
4893         .has_wbinvd_exit = svm_has_wbinvd_exit,
4894
4895         .get_l2_tsc_offset = svm_get_l2_tsc_offset,
4896         .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
4897         .write_tsc_offset = svm_write_tsc_offset,
4898         .write_tsc_multiplier = svm_write_tsc_multiplier,
4899
4900         .load_mmu_pgd = svm_load_mmu_pgd,
4901
4902         .check_intercept = svm_check_intercept,
4903         .handle_exit_irqoff = svm_handle_exit_irqoff,
4904
4905         .request_immediate_exit = __kvm_request_immediate_exit,
4906
4907         .sched_in = svm_sched_in,
4908
4909         .nested_ops = &svm_nested_ops,
4910
4911         .deliver_interrupt = svm_deliver_interrupt,
4912         .pi_update_irte = avic_pi_update_irte,
4913         .setup_mce = svm_setup_mce,
4914
4915 #ifdef CONFIG_KVM_SMM
4916         .smi_allowed = svm_smi_allowed,
4917         .enter_smm = svm_enter_smm,
4918         .leave_smm = svm_leave_smm,
4919         .enable_smi_window = svm_enable_smi_window,
4920 #endif
4921
4922         .mem_enc_ioctl = sev_mem_enc_ioctl,
4923         .mem_enc_register_region = sev_mem_enc_register_region,
4924         .mem_enc_unregister_region = sev_mem_enc_unregister_region,
4925         .guest_memory_reclaimed = sev_guest_memory_reclaimed,
4926
4927         .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
4928         .vm_move_enc_context_from = sev_vm_move_enc_context_from,
4929
4930         .can_emulate_instruction = svm_can_emulate_instruction,
4931
4932         .apic_init_signal_blocked = svm_apic_init_signal_blocked,
4933
4934         .msr_filter_changed = svm_msr_filter_changed,
4935         .complete_emulated_msr = svm_complete_emulated_msr,
4936
4937         .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
4938         .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
4939 };
4940
4941 /*
4942  * The default MMIO mask is a single bit (excluding the present bit),
4943  * which could conflict with the memory encryption bit. Check for
4944  * memory encryption support and override the default MMIO mask if
4945  * memory encryption is enabled.
4946  */
4947 static __init void svm_adjust_mmio_mask(void)
4948 {
4949         unsigned int enc_bit, mask_bit;
4950         u64 msr, mask;
4951
4952         /* If there is no memory encryption support, use existing mask */
4953         if (cpuid_eax(0x80000000) < 0x8000001f)
4954                 return;
4955
4956         /* If memory encryption is not enabled, use existing mask */
4957         rdmsrl(MSR_AMD64_SYSCFG, msr);
4958         if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
4959                 return;
4960
4961         enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
4962         mask_bit = boot_cpu_data.x86_phys_bits;
4963
4964         /* Increment the mask bit if it is the same as the encryption bit */
4965         if (enc_bit == mask_bit)
4966                 mask_bit++;
4967
4968         /*
4969          * If the mask bit location is below 52, then some bits above the
4970          * physical addressing limit will always be reserved, so use the
4971          * rsvd_bits() function to generate the mask. This mask, along with
4972          * the present bit, will be used to generate a page fault with
4973          * PFER.RSV = 1.
4974          *
4975          * If the mask bit location is 52 (or above), then clear the mask.
4976          */
4977         mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
4978
4979         kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
4980 }
4981
4982 static __init void svm_set_cpu_caps(void)
4983 {
4984         kvm_set_cpu_caps();
4985
4986         kvm_caps.supported_perf_cap = 0;
4987         kvm_caps.supported_xss = 0;
4988
4989         /* CPUID 0x80000001 and 0x8000000A (SVM features) */
4990         if (nested) {
4991                 kvm_cpu_cap_set(X86_FEATURE_SVM);
4992                 kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
4993
4994                 if (nrips)
4995                         kvm_cpu_cap_set(X86_FEATURE_NRIPS);
4996
4997                 if (npt_enabled)
4998                         kvm_cpu_cap_set(X86_FEATURE_NPT);
4999
5000                 if (tsc_scaling)
5001                         kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
5002
5003                 if (vls)
5004                         kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
5005                 if (lbrv)
5006                         kvm_cpu_cap_set(X86_FEATURE_LBRV);
5007
5008                 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
5009                         kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
5010
5011                 if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
5012                         kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
5013
5014                 if (vgif)
5015                         kvm_cpu_cap_set(X86_FEATURE_VGIF);
5016
5017                 if (vnmi)
5018                         kvm_cpu_cap_set(X86_FEATURE_VNMI);
5019
5020                 /* Nested VM can receive #VMEXIT instead of triggering #GP */
5021                 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
5022         }
5023
5024         /* CPUID 0x80000008 */
5025         if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
5026             boot_cpu_has(X86_FEATURE_AMD_SSBD))
5027                 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
5028
5029         if (enable_pmu) {
5030                 /*
5031                  * Enumerate support for PERFCTR_CORE if and only if KVM has
5032                  * access to enough counters to virtualize "core" support,
5033                  * otherwise limit vPMU support to the legacy number of counters.
5034                  */
5035                 if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
5036                         kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
5037                                                           kvm_pmu_cap.num_counters_gp);
5038                 else
5039                         kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);
5040
5041                 if (kvm_pmu_cap.version != 2 ||
5042                     !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
5043                         kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
5044         }
5045
5046         /* CPUID 0x8000001F (SME/SEV features) */
5047         sev_set_cpu_caps();
5048 }
5049
5050 static __init int svm_hardware_setup(void)
5051 {
5052         int cpu;
5053         struct page *iopm_pages;
5054         void *iopm_va;
5055         int r;
5056         unsigned int order = get_order(IOPM_SIZE);
5057
5058         /*
5059          * NX is required for shadow paging and for NPT if the NX huge pages
5060          * mitigation is enabled.
5061          */
5062         if (!boot_cpu_has(X86_FEATURE_NX)) {
5063                 pr_err_ratelimited("NX (Execute Disable) not supported\n");
5064                 return -EOPNOTSUPP;
5065         }
5066         kvm_enable_efer_bits(EFER_NX);
5067
5068         iopm_pages = alloc_pages(GFP_KERNEL, order);
5069
5070         if (!iopm_pages)
5071                 return -ENOMEM;
5072
5073         iopm_va = page_address(iopm_pages);
5074         memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
5075         iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
5076
5077         init_msrpm_offsets();
5078
5079         kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
5080                                      XFEATURE_MASK_BNDCSR);
5081
5082         if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
5083                 kvm_enable_efer_bits(EFER_FFXSR);
5084
5085         if (tsc_scaling) {
5086                 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
5087                         tsc_scaling = false;
5088                 } else {
5089                         pr_info("TSC scaling supported\n");
5090                         kvm_caps.has_tsc_control = true;
5091                 }
5092         }
5093         kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
5094         kvm_caps.tsc_scaling_ratio_frac_bits = 32;
5095
5096         tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
5097
5098         if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
5099                 kvm_enable_efer_bits(EFER_AUTOIBRS);
5100
5101         /* Check for pause filtering support */
5102         if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
5103                 pause_filter_count = 0;
5104                 pause_filter_thresh = 0;
5105         } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
5106                 pause_filter_thresh = 0;
5107         }
5108
5109         if (nested) {
5110                 pr_info("Nested Virtualization enabled\n");
5111                 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
5112         }
5113
5114         /*
5115          * KVM's MMU doesn't support using 2-level paging for itself, and thus
5116          * NPT isn't supported if the host is using 2-level paging since host
5117          * CR4 is unchanged on VMRUN.
5118          */
5119         if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
5120                 npt_enabled = false;
5121
5122         if (!boot_cpu_has(X86_FEATURE_NPT))
5123                 npt_enabled = false;
5124
5125         /* Force VM NPT level equal to the host's paging level */
5126         kvm_configure_mmu(npt_enabled, get_npt_level(),
5127                           get_npt_level(), PG_LEVEL_1G);
5128         pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
5129
5130         /* Setup shadow_me_value and shadow_me_mask */
5131         kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
5132
5133         svm_adjust_mmio_mask();
5134
5135         /*
5136          * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
5137          * may be modified by svm_adjust_mmio_mask()).
5138          */
5139         sev_hardware_setup();
5140
5141         svm_hv_hardware_setup();
5142
5143         for_each_possible_cpu(cpu) {
5144                 r = svm_cpu_init(cpu);
5145                 if (r)
5146                         goto err;
5147         }
5148
5149         if (nrips) {
5150                 if (!boot_cpu_has(X86_FEATURE_NRIPS))
5151                         nrips = false;
5152         }
5153
5154         enable_apicv = avic = avic && avic_hardware_setup();
5155
5156         if (!enable_apicv) {
5157                 svm_x86_ops.vcpu_blocking = NULL;
5158                 svm_x86_ops.vcpu_unblocking = NULL;
5159                 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
5160         } else if (!x2avic_enabled) {
5161                 svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
5162         }
5163
5164         if (vls) {
5165                 if (!npt_enabled ||
5166                     !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
5167                     !IS_ENABLED(CONFIG_X86_64)) {
5168                         vls = false;
5169                 } else {
5170                         pr_info("Virtual VMLOAD VMSAVE supported\n");
5171                 }
5172         }
5173
5174         if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
5175                 svm_gp_erratum_intercept = false;
5176
5177         if (vgif) {
5178                 if (!boot_cpu_has(X86_FEATURE_VGIF))
5179                         vgif = false;
5180                 else
5181                         pr_info("Virtual GIF supported\n");
5182         }
5183
5184         vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
5185         if (vnmi)
5186                 pr_info("Virtual NMI enabled\n");
5187
5188         if (!vnmi) {
5189                 svm_x86_ops.is_vnmi_pending = NULL;
5190                 svm_x86_ops.set_vnmi_pending = NULL;
5191         }
5192
5193
5194         if (lbrv) {
5195                 if (!boot_cpu_has(X86_FEATURE_LBRV))
5196                         lbrv = false;
5197                 else
5198                         pr_info("LBR virtualization supported\n");
5199         }
5200
5201         if (!enable_pmu)
5202                 pr_info("PMU virtualization is disabled\n");
5203
5204         svm_set_cpu_caps();
5205
5206         /*
5207          * It seems that on AMD processors PTE's accessed bit is
5208          * being set by the CPU hardware before the NPF vmexit.
5209          * This is not expected behaviour and our tests fail because
5210          * of it.
5211          * A workaround here is to disable support for
5212          * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
5213          * In this case userspace can know if there is support using
5214          * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
5215          * it
5216          * If future AMD CPU models change the behaviour described above,
5217          * this variable can be changed accordingly
5218          */
5219         allow_smaller_maxphyaddr = !npt_enabled;
5220
5221         return 0;
5222
5223 err:
5224         svm_hardware_unsetup();
5225         return r;
5226 }
5227
5228
5229 static struct kvm_x86_init_ops svm_init_ops __initdata = {
5230         .hardware_setup = svm_hardware_setup,
5231
5232         .runtime_ops = &svm_x86_ops,
5233         .pmu_ops = &amd_pmu_ops,
5234 };
5235
5236 static void __svm_exit(void)
5237 {
5238         kvm_x86_vendor_exit();
5239
5240         cpu_emergency_unregister_virt_callback(svm_emergency_disable);
5241 }
5242
5243 static int __init svm_init(void)
5244 {
5245         int r;
5246
5247         __unused_size_checks();
5248
5249         if (!kvm_is_svm_supported())
5250                 return -EOPNOTSUPP;
5251
5252         r = kvm_x86_vendor_init(&svm_init_ops);
5253         if (r)
5254                 return r;
5255
5256         cpu_emergency_register_virt_callback(svm_emergency_disable);
5257
5258         /*
5259          * Common KVM initialization _must_ come last, after this, /dev/kvm is
5260          * exposed to userspace!
5261          */
5262         r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
5263                      THIS_MODULE);
5264         if (r)
5265                 goto err_kvm_init;
5266
5267         return 0;
5268
5269 err_kvm_init:
5270         __svm_exit();
5271         return r;
5272 }
5273
5274 static void __exit svm_exit(void)
5275 {
5276         kvm_exit();
5277         __svm_exit();
5278 }
5279
5280 module_init(svm_init)
5281 module_exit(svm_exit)