arch/x86/kvm/svm/svm.c

   1 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   2
   3 #include <linux/kvm_host.h>
   4
   5 #include "irq.h"
   6 #include "mmu.h"
   7 #include "kvm_cache_regs.h"
   8 #include "x86.h"
   9 #include "smm.h"
  10 #include "cpuid.h"
  11 #include "pmu.h"
  12
  13 #include <linux/module.h>
  14 #include <linux/mod_devicetable.h>
  15 #include <linux/kernel.h>
  16 #include <linux/vmalloc.h>
  17 #include <linux/highmem.h>
  18 #include <linux/amd-iommu.h>
  19 #include <linux/sched.h>
  20 #include <linux/trace_events.h>
  21 #include <linux/slab.h>
  22 #include <linux/hashtable.h>
  23 #include <linux/objtool.h>
  24 #include <linux/psp-sev.h>
  25 #include <linux/file.h>
  26 #include <linux/pagemap.h>
  27 #include <linux/swap.h>
  28 #include <linux/rwsem.h>
  29 #include <linux/cc_platform.h>
  30 #include <linux/smp.h>
  31
  32 #include <asm/apic.h>
  33 #include <asm/perf_event.h>
  34 #include <asm/tlbflush.h>
  35 #include <asm/desc.h>
  36 #include <asm/debugreg.h>
  37 #include <asm/kvm_para.h>
  38 #include <asm/irq_remapping.h>
  39 #include <asm/spec-ctrl.h>
  40 #include <asm/cpu_device_id.h>
  41 #include <asm/traps.h>
  42 #include <asm/reboot.h>
  43 #include <asm/fpu/api.h>
  44
  45 #include <trace/events/ipi.h>
  46
  47 #include "trace.h"
  48
  49 #include "svm.h"
  50 #include "svm_ops.h"
  51
  52 #include "kvm_onhyperv.h"
  53 #include "svm_onhyperv.h"
  54
  55 MODULE_AUTHOR("Qumranet");
  56 MODULE_LICENSE("GPL");
  57
  58 #ifdef MODULE
  59 static const struct x86_cpu_id svm_cpu_id[] = {
  60         X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
  61         {}
  62 };
  63 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  64 #endif
  65
  66 #define SEG_TYPE_LDT 2
  67 #define SEG_TYPE_BUSY_TSS16 3
  68
  69 static bool erratum_383_found __read_mostly;
  70
  71 u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
  72
  73 /*
  74  * Set osvw_len to higher value when updated Revision Guides
  75  * are published and we know what the new status bits are
  76  */
  77 static uint64_t osvw_len = 4, osvw_status;
  78
  79 static DEFINE_PER_CPU(u64, current_tsc_ratio);
  80
  /*
   * Map an APIC register offset (APIC_ID, APIC_LVR, ...) to an MSR index:
   * the offset is shifted right by 4 and added to APIC_BASE_MSR.  The
   * argument is parenthesized so that expression arguments such as "a | b"
   * are shifted as a whole.
   */
  #define X2APIC_MSR(x)   (APIC_BASE_MSR + ((x) >> 4))
  82
  83 static const struct svm_direct_access_msrs {
  84         u32 index;   /* Index of the MSR */
  85         bool always; /* True if intercept is initially cleared */
  86 } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
  87         { .index = MSR_STAR,                            .always = true  },
  88         { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
  89         { .index = MSR_IA32_SYSENTER_EIP,               .always = false },
  90         { .index = MSR_IA32_SYSENTER_ESP,               .always = false },
  91 #ifdef CONFIG_X86_64
  92         { .index = MSR_GS_BASE,                         .always = true  },
  93         { .index = MSR_FS_BASE,                         .always = true  },
  94         { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
  95         { .index = MSR_LSTAR,                           .always = true  },
  96         { .index = MSR_CSTAR,                           .always = true  },
  97         { .index = MSR_SYSCALL_MASK,                    .always = true  },
  98 #endif
  99         { .index = MSR_IA32_SPEC_CTRL,                  .always = false },
 100         { .index = MSR_IA32_PRED_CMD,                   .always = false },
 101         { .index = MSR_IA32_FLUSH_CMD,                  .always = false },
 102         { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
 103         { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
 104         { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
 105         { .index = MSR_IA32_LASTINTTOIP,                .always = false },
 106         { .index = MSR_EFER,                            .always = false },
 107         { .index = MSR_IA32_CR_PAT,                     .always = false },
 108         { .index = MSR_AMD64_SEV_ES_GHCB,               .always = true  },
 109         { .index = MSR_TSC_AUX,                         .always = false },
 110         { .index = X2APIC_MSR(APIC_ID),                 .always = false },
 111         { .index = X2APIC_MSR(APIC_LVR),                .always = false },
 112         { .index = X2APIC_MSR(APIC_TASKPRI),            .always = false },
 113         { .index = X2APIC_MSR(APIC_ARBPRI),             .always = false },
 114         { .index = X2APIC_MSR(APIC_PROCPRI),            .always = false },
 115         { .index = X2APIC_MSR(APIC_EOI),                .always = false },
 116         { .index = X2APIC_MSR(APIC_RRR),                .always = false },
 117         { .index = X2APIC_MSR(APIC_LDR),                .always = false },
 118         { .index = X2APIC_MSR(APIC_DFR),                .always = false },
 119         { .index = X2APIC_MSR(APIC_SPIV),               .always = false },
 120         { .index = X2APIC_MSR(APIC_ISR),                .always = false },
 121         { .index = X2APIC_MSR(APIC_TMR),                .always = false },
 122         { .index = X2APIC_MSR(APIC_IRR),                .always = false },
 123         { .index = X2APIC_MSR(APIC_ESR),                .always = false },
 124         { .index = X2APIC_MSR(APIC_ICR),                .always = false },
 125         { .index = X2APIC_MSR(APIC_ICR2),               .always = false },
 126
 127         /*
 128          * Note:
 129          * AMD does not virtualize APIC TSC-deadline timer mode, but it is
 130          * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
 131          * the AVIC hardware would generate GP fault. Therefore, always
 132          * intercept the MSR 0x832, and do not setup direct_access_msr.
 133          */
 134         { .index = X2APIC_MSR(APIC_LVTTHMR),            .always = false },
 135         { .index = X2APIC_MSR(APIC_LVTPC),              .always = false },
 136         { .index = X2APIC_MSR(APIC_LVT0),               .always = false },
 137         { .index = X2APIC_MSR(APIC_LVT1),               .always = false },
 138         { .index = X2APIC_MSR(APIC_LVTERR),             .always = false },
 139         { .index = X2APIC_MSR(APIC_TMICT),              .always = false },
 140         { .index = X2APIC_MSR(APIC_TMCCT),              .always = false },
 141         { .index = X2APIC_MSR(APIC_TDCR),               .always = false },
 142         { .index = MSR_INVALID,                         .always = false },
 143 };
 144
 145 /*
 146  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 147  * pause_filter_count: On processors that support Pause filtering(indicated
 148  *      by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 149  *      count value. On VMRUN this value is loaded into an internal counter.
 150  *      Each time a pause instruction is executed, this counter is decremented
 151  *      until it reaches zero at which time a #VMEXIT is generated if pause
 152  *      intercept is enabled. Refer to  AMD APM Vol 2 Section 15.14.4 Pause
 153  *      Intercept Filtering for more details.
  154  *      This also indicates whether PLE logic is enabled.
 155  *
  156  * pause_filter_thresh: In addition, some processor families support advanced
  157  *      pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an
  158  *      upper bound on the amount of time a guest may execute in a pause loop.
 159  *      In this mode, a 16-bit pause filter threshold field is added in the
 160  *      VMCB. The threshold value is a cycle count that is used to reset the
 161  *      pause counter. As with simple pause filtering, VMRUN loads the pause
 162  *      count value from VMCB into an internal counter. Then, on each pause
 163  *      instruction the hardware checks the elapsed number of cycles since
 164  *      the most recent pause instruction against the pause filter threshold.
 165  *      If the elapsed cycle count is greater than the pause filter threshold,
 166  *      then the internal pause count is reloaded from the VMCB and execution
 167  *      continues. If the elapsed cycle count is less than the pause filter
 168  *      threshold, then the internal pause count is decremented. If the count
 169  *      value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 170  *      triggered. If advanced pause filtering is supported and pause filter
 171  *      threshold field is set to zero, the filter will operate in the simpler,
 172  *      count only mode.
 173  */
 174
 175 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
 176 module_param(pause_filter_thresh, ushort, 0444);
 177
 178 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
 179 module_param(pause_filter_count, ushort, 0444);
 180
 181 /* Default doubles per-vcpu window every exit. */
 182 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 183 module_param(pause_filter_count_grow, ushort, 0444);
 184
 185 /* Default resets per-vcpu window every exit to pause_filter_count. */
 186 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 187 module_param(pause_filter_count_shrink, ushort, 0444);
 188
 189 /* Default is to compute the maximum so we can never overflow. */
 190 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
 191 module_param(pause_filter_count_max, ushort, 0444);
 192
 193 /*
 194  * Use nested page tables by default.  Note, NPT may get forced off by
 195  * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 196  */
 197 bool npt_enabled = true;
 198 module_param_named(npt, npt_enabled, bool, 0444);
 199
 200 /* allow nested virtualization in KVM/SVM */
 201 static int nested = true;
 202 module_param(nested, int, S_IRUGO);
 203
 204 /* enable/disable Next RIP Save */
 205 int nrips = true;
 206 module_param(nrips, int, 0444);
 207
 208 /* enable/disable Virtual VMLOAD VMSAVE */
 209 static int vls = true;
 210 module_param(vls, int, 0444);
 211
 212 /* enable/disable Virtual GIF */
 213 int vgif = true;
 214 module_param(vgif, int, 0444);
 215
 216 /* enable/disable LBR virtualization */
 217 static int lbrv = true;
 218 module_param(lbrv, int, 0444);
 219
 220 static int tsc_scaling = true;
 221 module_param(tsc_scaling, int, 0444);
 222
 223 /*
 224  * enable / disable AVIC.  Because the defaults differ for APICv
 225  * support between VMX and SVM we cannot use module_param_named.
 226  */
 227 static bool avic;
 228 module_param(avic, bool, 0444);
 229
 230 bool __read_mostly dump_invalid_vmcb;
 231 module_param(dump_invalid_vmcb, bool, 0644);
 232
 233
 234 bool intercept_smi = true;
 235 module_param(intercept_smi, bool, 0444);
 236
 237 bool vnmi = true;
 238 module_param(vnmi, bool, 0444);
 239
 240 static bool svm_gp_erratum_intercept = true;
 241
 242 static u8 rsm_ins_bytes[] = "\x0f\xaa";
 243
 244 static unsigned long iopm_base;
 245
 246 DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
 247
 248 /*
 249  * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 250  * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 251  *
 252  * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 253  * defer the restoration of TSC_AUX until the CPU returns to userspace.
 254  */
 255 static int tsc_aux_uret_slot __read_mostly = -1;
 256
 257 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 258
 259 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 260 #define MSRS_RANGE_SIZE 2048
 261 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 262
 263 u32 svm_msrpm_offset(u32 msr)
 264 {
 265         u32 offset;
 266         int i;
 267
 268         for (i = 0; i < NUM_MSR_MAPS; i++) {
 269                 if (msr < msrpm_ranges[i] ||
 270                     msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 271                         continue;
 272
 273                 offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 274                 offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 275
 276                 /* Now we have the u8 offset - but need the u32 offset */
 277                 return offset / 4;
 278         }
 279
 280         /* MSR not in any range */
 281         return MSR_INVALID;
 282 }
 283
 284 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
 285
 286 static int get_npt_level(void)
 287 {
 288 #ifdef CONFIG_X86_64
 289         return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 290 #else
 291         return PT32E_ROOT_LEVEL;
 292 #endif
 293 }
 294
 295 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 296 {
 297         struct vcpu_svm *svm = to_svm(vcpu);
 298         u64 old_efer = vcpu->arch.efer;
 299         vcpu->arch.efer = efer;
 300
 301         if (!npt_enabled) {
 302                 /* Shadow paging assumes NX to be available.  */
 303                 efer |= EFER_NX;
 304
 305                 if (!(efer & EFER_LMA))
 306                         efer &= ~EFER_LME;
 307         }
 308
 309         if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
 310                 if (!(efer & EFER_SVME)) {
 311                         svm_leave_nested(vcpu);
 312                         svm_set_gif(svm, true);
 313                         /* #GP intercept is still needed for vmware backdoor */
 314                         if (!enable_vmware_backdoor)
 315                                 clr_exception_intercept(svm, GP_VECTOR);
 316
 317                         /*
 318                          * Free the nested guest state, unless we are in SMM.
 319                          * In this case we will return to the nested guest
 320                          * as soon as we leave SMM.
 321                          */
 322                         if (!is_smm(vcpu))
 323                                 svm_free_nested(svm);
 324
 325                 } else {
 326                         int ret = svm_allocate_nested(svm);
 327
 328                         if (ret) {
 329                                 vcpu->arch.efer = old_efer;
 330                                 return ret;
 331                         }
 332
 333                         /*
 334                          * Never intercept #GP for SEV guests, KVM can't
 335                          * decrypt guest memory to workaround the erratum.
 336                          */
 337                         if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
 338                                 set_exception_intercept(svm, GP_VECTOR);
 339                 }
 340         }
 341
 342         svm->vmcb->save.efer = efer | EFER_SVME;
 343         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
 344         return 0;
 345 }
 346
 347 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 348 {
 349         struct vcpu_svm *svm = to_svm(vcpu);
 350         u32 ret = 0;
 351
 352         if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 353                 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 354         return ret;
 355 }
 356
 357 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 358 {
 359         struct vcpu_svm *svm = to_svm(vcpu);
 360
 361         if (mask == 0)
 362                 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 363         else
 364                 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 365
 366 }
 367 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
 368                                         void *insn, int insn_len);
 369
 370 static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
 371                                            bool commit_side_effects)
 372 {
 373         struct vcpu_svm *svm = to_svm(vcpu);
 374         unsigned long old_rflags;
 375
 376         /*
 377          * SEV-ES does not expose the next RIP. The RIP update is controlled by
 378          * the type of exit and the #VC handler in the guest.
 379          */
 380         if (sev_es_guest(vcpu->kvm))
 381                 goto done;
 382
 383         if (nrips && svm->vmcb->control.next_rip != 0) {
 384                 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
 385                 svm->next_rip = svm->vmcb->control.next_rip;
 386         }
 387
 388         if (!svm->next_rip) {
 389                 /*
 390                  * FIXME: Drop this when kvm_emulate_instruction() does the
 391                  * right thing and treats "can't emulate" as outright failure
 392                  * for EMULTYPE_SKIP.
 393                  */
 394                 if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0))
 395                         return 0;
 396
 397                 if (unlikely(!commit_side_effects))
 398                         old_rflags = svm->vmcb->save.rflags;
 399
 400                 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
 401                         return 0;
 402
 403                 if (unlikely(!commit_side_effects))
 404                         svm->vmcb->save.rflags = old_rflags;
 405         } else {
 406                 kvm_rip_write(vcpu, svm->next_rip);
 407         }
 408
 409 done:
 410         if (likely(commit_side_effects))
 411                 svm_set_interrupt_shadow(vcpu, 0);
 412
 413         return 1;
 414 }
 415
 416 static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 417 {
 418         return __svm_skip_emulated_instruction(vcpu, true);
 419 }
 420
 421 static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
 422 {
 423         unsigned long rip, old_rip = kvm_rip_read(vcpu);
 424         struct vcpu_svm *svm = to_svm(vcpu);
 425
 426         /*
 427          * Due to architectural shortcomings, the CPU doesn't always provide
 428          * NextRIP, e.g. if KVM intercepted an exception that occurred while
 429          * the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
 430          * the instruction even if NextRIP is supported to acquire the next
 431          * RIP so that it can be shoved into the NextRIP field, otherwise
 432          * hardware will fail to advance guest RIP during event injection.
 433          * Drop the exception/interrupt if emulation fails and effectively
 434          * retry the instruction, it's the least awful option.  If NRIPS is
 435          * in use, the skip must not commit any side effects such as clearing
 436          * the interrupt shadow or RFLAGS.RF.
 437          */
 438         if (!__svm_skip_emulated_instruction(vcpu, !nrips))
 439                 return -EIO;
 440
 441         rip = kvm_rip_read(vcpu);
 442
 443         /*
 444          * Save the injection information, even when using next_rip, as the
 445          * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
 446          * doesn't complete due to a VM-Exit occurring while the CPU is
 447          * vectoring the event.   Decoding the instruction isn't guaranteed to
 448          * work as there may be no backing instruction, e.g. if the event is
 449          * being injected by L1 for L2, or if the guest is patching INT3 into
 450          * a different instruction.
 451          */
 452         svm->soft_int_injected = true;
 453         svm->soft_int_csbase = svm->vmcb->save.cs.base;
 454         svm->soft_int_old_rip = old_rip;
 455         svm->soft_int_next_rip = rip;
 456
 457         if (nrips)
 458                 kvm_rip_write(vcpu, old_rip);
 459
 460         if (static_cpu_has(X86_FEATURE_NRIPS))
 461                 svm->vmcb->control.next_rip = rip;
 462
 463         return 0;
 464 }
 465
 466 static void svm_inject_exception(struct kvm_vcpu *vcpu)
 467 {
 468         struct kvm_queued_exception *ex = &vcpu->arch.exception;
 469         struct vcpu_svm *svm = to_svm(vcpu);
 470
 471         kvm_deliver_exception_payload(vcpu, ex);
 472
 473         if (kvm_exception_is_soft(ex->vector) &&
 474             svm_update_soft_interrupt_rip(vcpu))
 475                 return;
 476
 477         svm->vmcb->control.event_inj = ex->vector
 478                 | SVM_EVTINJ_VALID
 479                 | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 480                 | SVM_EVTINJ_TYPE_EXEPT;
 481         svm->vmcb->control.event_inj_err = ex->error_code;
 482 }
 483
 484 static void svm_init_erratum_383(void)
 485 {
 486         u32 low, high;
 487         int err;
 488         u64 val;
 489
 490         if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 491                 return;
 492
 493         /* Use _safe variants to not break nested virtualization */
 494         val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 495         if (err)
 496                 return;
 497
 498         val |= (1ULL << 47);
 499
 500         low  = lower_32_bits(val);
 501         high = upper_32_bits(val);
 502
 503         native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 504
 505         erratum_383_found = true;
 506 }
 507
 508 static void svm_init_osvw(struct kvm_vcpu *vcpu)
 509 {
 510         /*
 511          * Guests should see errata 400 and 415 as fixed (assuming that
 512          * HLT and IO instructions are intercepted).
 513          */
 514         vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 515         vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 516
 517         /*
 518          * By increasing VCPU's osvw.length to 3 we are telling the guest that
 519          * all osvw.status bits inside that length, including bit 0 (which is
 520          * reserved for erratum 298), are valid. However, if host processor's
 521          * osvw_len is 0 then osvw_status[0] carries no information. We need to
 522          * be conservative here and therefore we tell the guest that erratum 298
 523          * is present (because we really don't know).
 524          */
 525         if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 526                 vcpu->arch.osvw.status |= 1;
 527 }
 528
 529 static bool __kvm_is_svm_supported(void)
 530 {
 531         int cpu = smp_processor_id();
 532         struct cpuinfo_x86 *c = &cpu_data(cpu);
 533
 534         u64 vm_cr;
 535
 536         if (c->x86_vendor != X86_VENDOR_AMD &&
 537             c->x86_vendor != X86_VENDOR_HYGON) {
 538                 pr_err("CPU %d isn't AMD or Hygon\n", cpu);
 539                 return false;
 540         }
 541
 542         if (!cpu_has(c, X86_FEATURE_SVM)) {
 543                 pr_err("SVM not supported by CPU %d\n", cpu);
 544                 return false;
 545         }
 546
 547         if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
 548                 pr_info("KVM is unsupported when running as an SEV guest\n");
 549                 return false;
 550         }
 551
 552         rdmsrl(MSR_VM_CR, vm_cr);
 553         if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
 554                 pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
 555                 return false;
 556         }
 557
 558         return true;
 559 }
 560
 561 static bool kvm_is_svm_supported(void)
 562 {
 563         bool supported;
 564
 565         migrate_disable();
 566         supported = __kvm_is_svm_supported();
 567         migrate_enable();
 568
 569         return supported;
 570 }
 571
 572 static int svm_check_processor_compat(void)
 573 {
 574         if (!__kvm_is_svm_supported())
 575                 return -EIO;
 576
 577         return 0;
 578 }
 579
 580 static void __svm_write_tsc_multiplier(u64 multiplier)
 581 {
 582         if (multiplier == __this_cpu_read(current_tsc_ratio))
 583                 return;
 584
 585         wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
 586         __this_cpu_write(current_tsc_ratio, multiplier);
 587 }
 588
 589 static inline void kvm_cpu_svm_disable(void)
 590 {
 591         uint64_t efer;
 592
 593         wrmsrl(MSR_VM_HSAVE_PA, 0);
 594         rdmsrl(MSR_EFER, efer);
 595         if (efer & EFER_SVME) {
 596                 /*
 597                  * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
 598                  * NMI aren't blocked.
 599                  */
 600                 stgi();
 601                 wrmsrl(MSR_EFER, efer & ~EFER_SVME);
 602         }
 603 }
 604
 605 static void svm_emergency_disable(void)
 606 {
 607         kvm_rebooting = true;
 608
 609         kvm_cpu_svm_disable();
 610 }
 611
 612 static void svm_hardware_disable(void)
 613 {
 614         /* Make sure we clean up behind us */
 615         if (tsc_scaling)
 616                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 617
 618         kvm_cpu_svm_disable();
 619
 620         amd_pmu_disable_virt();
 621 }
 622
 623 static int svm_hardware_enable(void)
 624 {
 625
 626         struct svm_cpu_data *sd;
 627         uint64_t efer;
 628         int me = raw_smp_processor_id();
 629
 630         rdmsrl(MSR_EFER, efer);
 631         if (efer & EFER_SVME)
 632                 return -EBUSY;
 633
 634         sd = per_cpu_ptr(&svm_data, me);
 635         sd->asid_generation = 1;
 636         sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 637         sd->next_asid = sd->max_asid + 1;
 638         sd->min_asid = max_sev_asid + 1;
 639
 640         wrmsrl(MSR_EFER, efer | EFER_SVME);
 641
 642         wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
 643
 644         if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 645                 /*
 646                  * Set the default value, even if we don't use TSC scaling
 647                  * to avoid having stale value in the msr
 648                  */
 649                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 650         }
 651
 652
 653         /*
 654          * Get OSVW bits.
 655          *
 656          * Note that it is possible to have a system with mixed processor
 657          * revisions and therefore different OSVW bits. If bits are not the same
 658          * on different processors then choose the worst case (i.e. if erratum
 659          * is present on one processor and not on another then assume that the
 660          * erratum is present everywhere).
 661          */
 662         if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 663                 uint64_t len, status = 0;
 664                 int err;
 665
 666                 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 667                 if (!err)
 668                         status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 669                                                       &err);
 670
 671                 if (err)
 672                         osvw_status = osvw_len = 0;
 673                 else {
 674                         if (len < osvw_len)
 675                                 osvw_len = len;
 676                         osvw_status |= status;
 677                         osvw_status &= (1ULL << osvw_len) - 1;
 678                 }
 679         } else
 680                 osvw_status = osvw_len = 0;
 681
 682         svm_init_erratum_383();
 683
 684         amd_pmu_enable_virt();
 685
 686         return 0;
 687 }
 688
 689 static void svm_cpu_uninit(int cpu)
 690 {
 691         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 692
 693         if (!sd->save_area)
 694                 return;
 695
 696         kfree(sd->sev_vmcbs);
 697         __free_page(sd->save_area);
 698         sd->save_area_pa = 0;
 699         sd->save_area = NULL;
 700 }
 701
 702 static int svm_cpu_init(int cpu)
 703 {
 704         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 705         int ret = -ENOMEM;
 706
 707         memset(sd, 0, sizeof(struct svm_cpu_data));
 708         sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
 709         if (!sd->save_area)
 710                 return ret;
 711
 712         ret = sev_cpu_init(sd);
 713         if (ret)
 714                 goto free_save_area;
 715
 716         sd->save_area_pa = __sme_page_pa(sd->save_area);
 717         return 0;
 718
 719 free_save_area:
 720         __free_page(sd->save_area);
 721         sd->save_area = NULL;
 722         return ret;
 723
 724 }
 725
 726 static void set_dr_intercepts(struct vcpu_svm *svm)
 727 {
 728         struct vmcb *vmcb = svm->vmcb01.ptr;
 729
 730         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
 731         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
 732         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
 733         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
 734         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
 735         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
 736         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
 737         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
 738         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
 739         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
 740         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
 741         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
 742         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
 743         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
 744         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
 745         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
 746
 747         recalc_intercepts(svm);
 748 }
 749
 750 static void clr_dr_intercepts(struct vcpu_svm *svm)
 751 {
 752         struct vmcb *vmcb = svm->vmcb01.ptr;
 753
 754         vmcb->control.intercepts[INTERCEPT_DR] = 0;
 755
 756         recalc_intercepts(svm);
 757 }
 758
 759 static int direct_access_msr_slot(u32 msr)
 760 {
 761         u32 i;
 762
 763         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 764                 if (direct_access_msrs[i].index == msr)
 765                         return i;
 766
 767         return -ENOENT;
 768 }
 769
 770 static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
 771                                      int write)
 772 {
 773         struct vcpu_svm *svm = to_svm(vcpu);
 774         int slot = direct_access_msr_slot(msr);
 775
 776         if (slot == -ENOENT)
 777                 return;
 778
 779         /* Set the shadow bitmaps to the desired intercept states */
 780         if (read)
 781                 set_bit(slot, svm->shadow_msr_intercept.read);
 782         else
 783                 clear_bit(slot, svm->shadow_msr_intercept.read);
 784
 785         if (write)
 786                 set_bit(slot, svm->shadow_msr_intercept.write);
 787         else
 788                 clear_bit(slot, svm->shadow_msr_intercept.write);
 789 }
 790
 791 static bool valid_msr_intercept(u32 index)
 792 {
 793         return direct_access_msr_slot(index) != -ENOENT;
 794 }
 795
 796 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 797 {
 798         u8 bit_write;
 799         unsigned long tmp;
 800         u32 offset;
 801         u32 *msrpm;
 802
 803         /*
 804          * For non-nested case:
 805          * If the L01 MSR bitmap does not intercept the MSR, then we need to
 806          * save it.
 807          *
 808          * For nested case:
 809          * If the L02 MSR bitmap does not intercept the MSR, then we need to
 810          * save it.
 811          */
 812         msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
 813                                       to_svm(vcpu)->msrpm;
 814
 815         offset    = svm_msrpm_offset(msr);
 816         bit_write = 2 * (msr & 0x0f) + 1;
 817         tmp       = msrpm[offset];
 818
 819         BUG_ON(offset == MSR_INVALID);
 820
 821         return test_bit(bit_write, &tmp);
 822 }
 823
 824 static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
 825                                         u32 msr, int read, int write)
 826 {
 827         struct vcpu_svm *svm = to_svm(vcpu);
 828         u8 bit_read, bit_write;
 829         unsigned long tmp;
 830         u32 offset;
 831
 832         /*
 833          * If this warning triggers extend the direct_access_msrs list at the
 834          * beginning of the file
 835          */
 836         WARN_ON(!valid_msr_intercept(msr));
 837
 838         /* Enforce non allowed MSRs to trap */
 839         if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
 840                 read = 0;
 841
 842         if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
 843                 write = 0;
 844
 845         offset    = svm_msrpm_offset(msr);
 846         bit_read  = 2 * (msr & 0x0f);
 847         bit_write = 2 * (msr & 0x0f) + 1;
 848         tmp       = msrpm[offset];
 849
 850         BUG_ON(offset == MSR_INVALID);
 851
 852         read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 853         write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 854
 855         msrpm[offset] = tmp;
 856
 857         svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
 858         svm->nested.force_msr_bitmap_recalc = true;
 859 }
 860
 861 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
 862                           int read, int write)
 863 {
 864         set_shadow_msr_intercept(vcpu, msr, read, write);
 865         set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
 866 }
 867
 868 u32 *svm_vcpu_alloc_msrpm(void)
 869 {
 870         unsigned int order = get_order(MSRPM_SIZE);
 871         struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
 872         u32 *msrpm;
 873
 874         if (!pages)
 875                 return NULL;
 876
 877         msrpm = page_address(pages);
 878         memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
 879
 880         return msrpm;
 881 }
 882
 883 void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
 884 {
 885         int i;
 886
 887         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 888                 if (!direct_access_msrs[i].always)
 889                         continue;
 890                 set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
 891         }
 892 }
 893
 894 void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
 895 {
 896         int i;
 897
 898         if (intercept == svm->x2avic_msrs_intercepted)
 899                 return;
 900
 901         if (!x2avic_enabled ||
 902             !apic_x2apic_mode(svm->vcpu.arch.apic))
 903                 return;
 904
 905         for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
 906                 int index = direct_access_msrs[i].index;
 907
 908                 if ((index < APIC_BASE_MSR) ||
 909                     (index > APIC_BASE_MSR + 0xff))
 910                         continue;
 911                 set_msr_interception(&svm->vcpu, svm->msrpm, index,
 912                                      !intercept, !intercept);
 913         }
 914
 915         svm->x2avic_msrs_intercepted = intercept;
 916 }
 917
 918 void svm_vcpu_free_msrpm(u32 *msrpm)
 919 {
 920         __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
 921 }
 922
 923 static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
 924 {
 925         struct vcpu_svm *svm = to_svm(vcpu);
 926         u32 i;
 927
 928         /*
 929          * Set intercept permissions for all direct access MSRs again. They
 930          * will automatically get filtered through the MSR filter, so we are
 931          * back in sync after this.
 932          */
 933         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 934                 u32 msr = direct_access_msrs[i].index;
 935                 u32 read = test_bit(i, svm->shadow_msr_intercept.read);
 936                 u32 write = test_bit(i, svm->shadow_msr_intercept.write);
 937
 938                 set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
 939         }
 940 }
 941
 942 static void add_msr_offset(u32 offset)
 943 {
 944         int i;
 945
 946         for (i = 0; i < MSRPM_OFFSETS; ++i) {
 947
 948                 /* Offset already in list? */
 949                 if (msrpm_offsets[i] == offset)
 950                         return;
 951
 952                 /* Slot used by another offset? */
 953                 if (msrpm_offsets[i] != MSR_INVALID)
 954                         continue;
 955
 956                 /* Add offset to list */
 957                 msrpm_offsets[i] = offset;
 958
 959                 return;
 960         }
 961
 962         /*
 963          * If this BUG triggers the msrpm_offsets table has an overflow. Just
 964          * increase MSRPM_OFFSETS in this case.
 965          */
 966         BUG();
 967 }
 968
 969 static void init_msrpm_offsets(void)
 970 {
 971         int i;
 972
 973         memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 974
 975         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 976                 u32 offset;
 977
 978                 offset = svm_msrpm_offset(direct_access_msrs[i].index);
 979                 BUG_ON(offset == MSR_INVALID);
 980
 981                 add_msr_offset(offset);
 982         }
 983 }
 984
 985 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
 986 {
 987         to_vmcb->save.dbgctl            = from_vmcb->save.dbgctl;
 988         to_vmcb->save.br_from           = from_vmcb->save.br_from;
 989         to_vmcb->save.br_to             = from_vmcb->save.br_to;
 990         to_vmcb->save.last_excp_from    = from_vmcb->save.last_excp_from;
 991         to_vmcb->save.last_excp_to      = from_vmcb->save.last_excp_to;
 992
 993         vmcb_mark_dirty(to_vmcb, VMCB_LBR);
 994 }
 995
 996 static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
 997 {
 998         struct vcpu_svm *svm = to_svm(vcpu);
 999
1000         svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
1001         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
1002         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
1003         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
1004         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
1005
1006         /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
1007         if (is_guest_mode(vcpu))
1008                 svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
1009 }
1010
1011 static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
1012 {
1013         struct vcpu_svm *svm = to_svm(vcpu);
1014
1015         svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
1016         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
1017         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
1018         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
1019         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
1020
1021         /*
1022          * Move the LBR msrs back to the vmcb01 to avoid copying them
1023          * on nested guest entries.
1024          */
1025         if (is_guest_mode(vcpu))
1026                 svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
1027 }
1028
1029 static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
1030 {
1031         /*
1032          * If LBR virtualization is disabled, the LBR MSRs are always kept in
1033          * vmcb01.  If LBR virtualization is enabled and L1 is running VMs of
1034          * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
1035          */
1036         return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
1037                                                                    svm->vmcb01.ptr;
1038 }
1039
1040 void svm_update_lbrv(struct kvm_vcpu *vcpu)
1041 {
1042         struct vcpu_svm *svm = to_svm(vcpu);
1043         bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
1044         bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
1045                             (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
1046                             (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
1047
1048         if (enable_lbrv == current_enable_lbrv)
1049                 return;
1050
1051         if (enable_lbrv)
1052                 svm_enable_lbrv(vcpu);
1053         else
1054                 svm_disable_lbrv(vcpu);
1055 }
1056
1057 void disable_nmi_singlestep(struct vcpu_svm *svm)
1058 {
1059         svm->nmi_singlestep = false;
1060
1061         if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
1062                 /* Clear our flags if they were not set by the guest */
1063                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1064                         svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
1065                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1066                         svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
1067         }
1068 }
1069
1070 static void grow_ple_window(struct kvm_vcpu *vcpu)
1071 {
1072         struct vcpu_svm *svm = to_svm(vcpu);
1073         struct vmcb_control_area *control = &svm->vmcb->control;
1074         int old = control->pause_filter_count;
1075
1076         if (kvm_pause_in_guest(vcpu->kvm))
1077                 return;
1078
1079         control->pause_filter_count = __grow_ple_window(old,
1080                                                         pause_filter_count,
1081                                                         pause_filter_count_grow,
1082                                                         pause_filter_count_max);
1083
1084         if (control->pause_filter_count != old) {
1085                 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1086                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1087                                             control->pause_filter_count, old);
1088         }
1089 }
1090
1091 static void shrink_ple_window(struct kvm_vcpu *vcpu)
1092 {
1093         struct vcpu_svm *svm = to_svm(vcpu);
1094         struct vmcb_control_area *control = &svm->vmcb->control;
1095         int old = control->pause_filter_count;
1096
1097         if (kvm_pause_in_guest(vcpu->kvm))
1098                 return;
1099
1100         control->pause_filter_count =
1101                                 __shrink_ple_window(old,
1102                                                     pause_filter_count,
1103                                                     pause_filter_count_shrink,
1104                                                     pause_filter_count);
1105         if (control->pause_filter_count != old) {
1106                 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1107                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1108                                             control->pause_filter_count, old);
1109         }
1110 }
1111
1112 static void svm_hardware_unsetup(void)
1113 {
1114         int cpu;
1115
1116         sev_hardware_unsetup();
1117
1118         for_each_possible_cpu(cpu)
1119                 svm_cpu_uninit(cpu);
1120
1121         __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
1122         get_order(IOPM_SIZE));
1123         iopm_base = 0;
1124 }
1125
1126 static void init_seg(struct vmcb_seg *seg)
1127 {
1128         seg->selector = 0;
1129         seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1130                       SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1131         seg->limit = 0xffff;
1132         seg->base = 0;
1133 }
1134
1135 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1136 {
1137         seg->selector = 0;
1138         seg->attrib = SVM_SELECTOR_P_MASK | type;
1139         seg->limit = 0xffff;
1140         seg->base = 0;
1141 }
1142
1143 static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1144 {
1145         struct vcpu_svm *svm = to_svm(vcpu);
1146
1147         return svm->nested.ctl.tsc_offset;
1148 }
1149
1150 static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1151 {
1152         struct vcpu_svm *svm = to_svm(vcpu);
1153
1154         return svm->tsc_ratio_msr;
1155 }
1156
1157 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
1158 {
1159         struct vcpu_svm *svm = to_svm(vcpu);
1160
1161         svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1162         svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
1163         vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1164 }
1165
1166 void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
1167 {
1168         preempt_disable();
1169         if (to_svm(vcpu)->guest_state_loaded)
1170                 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1171         preempt_enable();
1172 }
1173
1174 /* Evaluate instruction intercepts that depend on guest CPUID features. */
1175 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1176                                               struct vcpu_svm *svm)
1177 {
1178         /*
1179          * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1180          * roots, or if INVPCID is disabled in the guest to inject #UD.
1181          */
1182         if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1183                 if (!npt_enabled ||
1184                     !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1185                         svm_set_intercept(svm, INTERCEPT_INVPCID);
1186                 else
1187                         svm_clr_intercept(svm, INTERCEPT_INVPCID);
1188         }
1189
1190         if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1191                 if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1192                         svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1193                 else
1194                         svm_set_intercept(svm, INTERCEPT_RDTSCP);
1195         }
1196 }
1197
1198 static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1199 {
1200         struct vcpu_svm *svm = to_svm(vcpu);
1201
1202         if (guest_cpuid_is_intel(vcpu)) {
1203                 /*
1204                  * We must intercept SYSENTER_EIP and SYSENTER_ESP
1205                  * accesses because the processor only stores 32 bits.
1206                  * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1207                  */
1208                 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1209                 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1210                 svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1211
1212                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1213                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
1214         } else {
1215                 /*
1216                  * If hardware supports Virtual VMLOAD VMSAVE then enable it
1217                  * in VMCB and clear intercepts to avoid #VMEXIT.
1218                  */
1219                 if (vls) {
1220                         svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1221                         svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1222                         svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1223                 }
1224                 /* No need to intercept these MSRs */
1225                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1226                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1227         }
1228 }
1229
1230 static void init_vmcb(struct kvm_vcpu *vcpu)
1231 {
1232         struct vcpu_svm *svm = to_svm(vcpu);
1233         struct vmcb *vmcb = svm->vmcb01.ptr;
1234         struct vmcb_control_area *control = &vmcb->control;
1235         struct vmcb_save_area *save = &vmcb->save;
1236
1237         svm_set_intercept(svm, INTERCEPT_CR0_READ);
1238         svm_set_intercept(svm, INTERCEPT_CR3_READ);
1239         svm_set_intercept(svm, INTERCEPT_CR4_READ);
1240         svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1241         svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1242         svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
1243         if (!kvm_vcpu_apicv_active(vcpu))
1244                 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
1245
1246         set_dr_intercepts(svm);
1247
1248         set_exception_intercept(svm, PF_VECTOR);
1249         set_exception_intercept(svm, UD_VECTOR);
1250         set_exception_intercept(svm, MC_VECTOR);
1251         set_exception_intercept(svm, AC_VECTOR);
1252         set_exception_intercept(svm, DB_VECTOR);
1253         /*
1254          * Guest access to VMware backdoor ports could legitimately
1255          * trigger #GP because of TSS I/O permission bitmap.
1256          * We intercept those #GP and allow access to them anyway
1257          * as VMware does.
1258          */
1259         if (enable_vmware_backdoor)
1260                 set_exception_intercept(svm, GP_VECTOR);
1261
1262         svm_set_intercept(svm, INTERCEPT_INTR);
1263         svm_set_intercept(svm, INTERCEPT_NMI);
1264
1265         if (intercept_smi)
1266                 svm_set_intercept(svm, INTERCEPT_SMI);
1267
1268         svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1269         svm_set_intercept(svm, INTERCEPT_RDPMC);
1270         svm_set_intercept(svm, INTERCEPT_CPUID);
1271         svm_set_intercept(svm, INTERCEPT_INVD);
1272         svm_set_intercept(svm, INTERCEPT_INVLPG);
1273         svm_set_intercept(svm, INTERCEPT_INVLPGA);
1274         svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1275         svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1276         svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1277         svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1278         svm_set_intercept(svm, INTERCEPT_VMRUN);
1279         svm_set_intercept(svm, INTERCEPT_VMMCALL);
1280         svm_set_intercept(svm, INTERCEPT_VMLOAD);
1281         svm_set_intercept(svm, INTERCEPT_VMSAVE);
1282         svm_set_intercept(svm, INTERCEPT_STGI);
1283         svm_set_intercept(svm, INTERCEPT_CLGI);
1284         svm_set_intercept(svm, INTERCEPT_SKINIT);
1285         svm_set_intercept(svm, INTERCEPT_WBINVD);
1286         svm_set_intercept(svm, INTERCEPT_XSETBV);
1287         svm_set_intercept(svm, INTERCEPT_RDPRU);
1288         svm_set_intercept(svm, INTERCEPT_RSM);
1289
1290         if (!kvm_mwait_in_guest(vcpu->kvm)) {
1291                 svm_set_intercept(svm, INTERCEPT_MONITOR);
1292                 svm_set_intercept(svm, INTERCEPT_MWAIT);
1293         }
1294
1295         if (!kvm_hlt_in_guest(vcpu->kvm))
1296                 svm_set_intercept(svm, INTERCEPT_HLT);
1297
1298         control->iopm_base_pa = __sme_set(iopm_base);
1299         control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1300         control->int_ctl = V_INTR_MASKING_MASK;
1301
1302         init_seg(&save->es);
1303         init_seg(&save->ss);
1304         init_seg(&save->ds);
1305         init_seg(&save->fs);
1306         init_seg(&save->gs);
1307
1308         save->cs.selector = 0xf000;
1309         save->cs.base = 0xffff0000;
1310         /* Executable/Readable Code Segment */
1311         save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1312                 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1313         save->cs.limit = 0xffff;
1314
1315         save->gdtr.base = 0;
1316         save->gdtr.limit = 0xffff;
1317         save->idtr.base = 0;
1318         save->idtr.limit = 0xffff;
1319
1320         init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1321         init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1322
1323         if (npt_enabled) {
1324                 /* Setup VMCB for Nested Paging */
1325                 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1326                 svm_clr_intercept(svm, INTERCEPT_INVLPG);
1327                 clr_exception_intercept(svm, PF_VECTOR);
1328                 svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1329                 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
1330                 save->g_pat = vcpu->arch.pat;
1331                 save->cr3 = 0;
1332         }
1333         svm->current_vmcb->asid_generation = 0;
1334         svm->asid = 0;
1335
1336         svm->nested.vmcb12_gpa = INVALID_GPA;
1337         svm->nested.last_vmcb12_gpa = INVALID_GPA;
1338
1339         if (!kvm_pause_in_guest(vcpu->kvm)) {
1340                 control->pause_filter_count = pause_filter_count;
1341                 if (pause_filter_thresh)
1342                         control->pause_filter_thresh = pause_filter_thresh;
1343                 svm_set_intercept(svm, INTERCEPT_PAUSE);
1344         } else {
1345                 svm_clr_intercept(svm, INTERCEPT_PAUSE);
1346         }
1347
1348         svm_recalc_instruction_intercepts(vcpu, svm);
1349
1350         /*
1351          * If the host supports V_SPEC_CTRL then disable the interception
1352          * of MSR_IA32_SPEC_CTRL.
1353          */
1354         if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1355                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1356
1357         if (kvm_vcpu_apicv_active(vcpu))
1358                 avic_init_vmcb(svm, vmcb);
1359
1360         if (vnmi)
1361                 svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
1362
1363         if (vgif) {
1364                 svm_clr_intercept(svm, INTERCEPT_STGI);
1365                 svm_clr_intercept(svm, INTERCEPT_CLGI);
1366                 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1367         }
1368
1369         if (sev_guest(vcpu->kvm))
1370                 sev_init_vmcb(svm);
1371
1372         svm_hv_init_vmcb(vmcb);
1373         init_vmcb_after_set_cpuid(vcpu);
1374
1375         vmcb_mark_all_dirty(vmcb);
1376
1377         enable_gif(svm);
1378 }
1379
1380 static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1381 {
1382         struct vcpu_svm *svm = to_svm(vcpu);
1383
1384         svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1385
1386         svm_init_osvw(vcpu);
1387         vcpu->arch.microcode_version = 0x01000065;
1388         svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
1389
1390         svm->nmi_masked = false;
1391         svm->awaiting_iret_completion = false;
1392
1393         if (sev_es_guest(vcpu->kvm))
1394                 sev_es_vcpu_reset(svm);
1395 }
1396
1397 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1398 {
1399         struct vcpu_svm *svm = to_svm(vcpu);
1400
1401         svm->spec_ctrl = 0;
1402         svm->virt_spec_ctrl = 0;
1403
1404         init_vmcb(vcpu);
1405
1406         if (!init_event)
1407                 __svm_vcpu_reset(vcpu);
1408 }
1409
1410 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1411 {
1412         svm->current_vmcb = target_vmcb;
1413         svm->vmcb = target_vmcb->ptr;
1414 }
1415
1416 static int svm_vcpu_create(struct kvm_vcpu *vcpu)
1417 {
1418         struct vcpu_svm *svm;
1419         struct page *vmcb01_page;
1420         struct page *vmsa_page = NULL;
1421         int err;
1422
1423         BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1424         svm = to_svm(vcpu);
1425
1426         err = -ENOMEM;
1427         vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1428         if (!vmcb01_page)
1429                 goto out;
1430
1431         if (sev_es_guest(vcpu->kvm)) {
1432                 /*
1433                  * SEV-ES guests require a separate VMSA page used to contain
1434                  * the encrypted register state of the guest.
1435                  */
1436                 vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1437                 if (!vmsa_page)
1438                         goto error_free_vmcb_page;
1439
1440                 /*
1441                  * SEV-ES guests maintain an encrypted version of their FPU
1442                  * state which is restored and saved on VMRUN and VMEXIT.
1443                  * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1444                  * do xsave/xrstor on it.
1445                  */
1446                 fpstate_set_confidential(&vcpu->arch.guest_fpu);
1447         }
1448
1449         err = avic_init_vcpu(svm);
1450         if (err)
1451                 goto error_free_vmsa_page;
1452
1453         svm->msrpm = svm_vcpu_alloc_msrpm();
1454         if (!svm->msrpm) {
1455                 err = -ENOMEM;
1456                 goto error_free_vmsa_page;
1457         }
1458
1459         svm->x2avic_msrs_intercepted = true;
1460
1461         svm->vmcb01.ptr = page_address(vmcb01_page);
1462         svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
1463         svm_switch_vmcb(svm, &svm->vmcb01);
1464
1465         if (vmsa_page)
1466                 svm->sev_es.vmsa = page_address(vmsa_page);
1467
1468         svm->guest_state_loaded = false;
1469
1470         return 0;
1471
1472 error_free_vmsa_page:
1473         if (vmsa_page)
1474                 __free_page(vmsa_page);
1475 error_free_vmcb_page:
1476         __free_page(vmcb01_page);
1477 out:
1478         return err;
1479 }
1480
1481 static void svm_clear_current_vmcb(struct vmcb *vmcb)
1482 {
1483         int i;
1484
1485         for_each_online_cpu(i)
1486                 cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
1487 }
1488
1489 static void svm_vcpu_free(struct kvm_vcpu *vcpu)
1490 {
1491         struct vcpu_svm *svm = to_svm(vcpu);
1492
1493         /*
1494          * The vmcb page can be recycled, causing a false negative in
1495          * svm_vcpu_load(). So, ensure that no logical CPU has this
1496          * vmcb page recorded as its current vmcb.
1497          */
1498         svm_clear_current_vmcb(svm->vmcb);
1499
1500         svm_leave_nested(vcpu);
1501         svm_free_nested(svm);
1502
1503         sev_free_vcpu(vcpu);
1504
1505         __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
1506         __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
1507 }
1508
1509 static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1510 {
1511         struct vcpu_svm *svm = to_svm(vcpu);
1512         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
1513
1514         if (sev_es_guest(vcpu->kvm))
1515                 sev_es_unmap_ghcb(svm);
1516
1517         if (svm->guest_state_loaded)
1518                 return;
1519
1520         /*
1521          * Save additional host state that will be restored on VMEXIT (sev-es)
1522          * or subsequent vmload of host save area.
1523          */
1524         vmsave(sd->save_area_pa);
1525         if (sev_es_guest(vcpu->kvm)) {
1526                 struct sev_es_save_area *hostsa;
1527                 hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
1528
1529                 sev_es_prepare_switch_to_guest(hostsa);
1530         }
1531
1532         if (tsc_scaling)
1533                 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1534
1535         if (likely(tsc_aux_uret_slot >= 0))
1536                 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
1537
1538         svm->guest_state_loaded = true;
1539 }
1540
1541 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1542 {
1543         to_svm(vcpu)->guest_state_loaded = false;
1544 }
1545
1546 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1547 {
1548         struct vcpu_svm *svm = to_svm(vcpu);
1549         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
1550
1551         if (sd->current_vmcb != svm->vmcb) {
1552                 sd->current_vmcb = svm->vmcb;
1553
1554                 if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
1555                         indirect_branch_prediction_barrier();
1556         }
1557         if (kvm_vcpu_apicv_active(vcpu))
1558                 avic_vcpu_load(vcpu, cpu);
1559 }
1560
1561 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1562 {
1563         if (kvm_vcpu_apicv_active(vcpu))
1564                 avic_vcpu_put(vcpu);
1565
1566         svm_prepare_host_switch(vcpu);
1567
1568         ++vcpu->stat.host_state_reload;
1569 }
1570
1571 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1572 {
1573         struct vcpu_svm *svm = to_svm(vcpu);
1574         unsigned long rflags = svm->vmcb->save.rflags;
1575
1576         if (svm->nmi_singlestep) {
1577                 /* Hide our flags if they were not set by the guest */
1578                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1579                         rflags &= ~X86_EFLAGS_TF;
1580                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1581                         rflags &= ~X86_EFLAGS_RF;
1582         }
1583         return rflags;
1584 }
1585
1586 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1587 {
1588         if (to_svm(vcpu)->nmi_singlestep)
1589                 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1590
1591        /*
1592         * Any change of EFLAGS.VM is accompanied by a reload of SS
1593         * (caused by either a task switch or an inter-privilege IRET),
1594         * so we do not need to update the CPL here.
1595         */
1596         to_svm(vcpu)->vmcb->save.rflags = rflags;
1597 }
1598
1599 static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1600 {
1601         struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1602
1603         return sev_es_guest(vcpu->kvm)
1604                 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1605                 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1606 }
1607
1608 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1609 {
1610         kvm_register_mark_available(vcpu, reg);
1611
1612         switch (reg) {
1613         case VCPU_EXREG_PDPTR:
1614                 /*
1615                  * When !npt_enabled, mmu->pdptrs[] is already available since
1616                  * it is always updated per SDM when moving to CRs.
1617                  */
1618                 if (npt_enabled)
1619                         load_pdptrs(vcpu, kvm_read_cr3(vcpu));
1620                 break;
1621         default:
1622                 KVM_BUG_ON(1, vcpu->kvm);
1623         }
1624 }
1625
1626 static void svm_set_vintr(struct vcpu_svm *svm)
1627 {
1628         struct vmcb_control_area *control;
1629
1630         /*
1631          * The following fields are ignored when AVIC is enabled
1632          */
1633         WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
1634
1635         svm_set_intercept(svm, INTERCEPT_VINTR);
1636
1637         /*
1638          * Recalculating intercepts may have cleared the VINTR intercept.  If
1639          * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
1640          * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
1641          * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
1642          * interrupts will never be unblocked while L2 is running.
1643          */
1644         if (!svm_is_intercept(svm, INTERCEPT_VINTR))
1645                 return;
1646
1647         /*
1648          * This is just a dummy VINTR to actually cause a vmexit to happen.
1649          * Actual injection of virtual interrupts happens through EVENTINJ.
1650          */
1651         control = &svm->vmcb->control;
1652         control->int_vector = 0x0;
1653         control->int_ctl &= ~V_INTR_PRIO_MASK;
1654         control->int_ctl |= V_IRQ_MASK |
1655                 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1656         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1657 }
1658
1659 static void svm_clear_vintr(struct vcpu_svm *svm)
1660 {
1661         svm_clr_intercept(svm, INTERCEPT_VINTR);
1662
1663         /* Drop int_ctl fields related to VINTR injection.  */
1664         svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1665         if (is_guest_mode(&svm->vcpu)) {
1666                 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1667
1668                 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1669                         (svm->nested.ctl.int_ctl & V_TPR_MASK));
1670
1671                 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1672                         V_IRQ_INJECTION_BITS_MASK;
1673
1674                 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1675         }
1676
1677         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1678 }
1679
1680 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1681 {
1682         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1683         struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1684
1685         switch (seg) {
1686         case VCPU_SREG_CS: return &save->cs;
1687         case VCPU_SREG_DS: return &save->ds;
1688         case VCPU_SREG_ES: return &save->es;
1689         case VCPU_SREG_FS: return &save01->fs;
1690         case VCPU_SREG_GS: return &save01->gs;
1691         case VCPU_SREG_SS: return &save->ss;
1692         case VCPU_SREG_TR: return &save01->tr;
1693         case VCPU_SREG_LDTR: return &save01->ldtr;
1694         }
1695         BUG();
1696         return NULL;
1697 }
1698
1699 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1700 {
1701         struct vmcb_seg *s = svm_seg(vcpu, seg);
1702
1703         return s->base;
1704 }
1705
1706 static void svm_get_segment(struct kvm_vcpu *vcpu,
1707                             struct kvm_segment *var, int seg)
1708 {
1709         struct vmcb_seg *s = svm_seg(vcpu, seg);
1710
1711         var->base = s->base;
1712         var->limit = s->limit;
1713         var->selector = s->selector;
1714         var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1715         var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1716         var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1717         var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1718         var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1719         var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1720         var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1721
1722         /*
1723          * AMD CPUs circa 2014 track the G bit for all segments except CS.
1724          * However, the SVM spec states that the G bit is not observed by the
1725          * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1726          * So let's synthesize a legal G bit for all segments, this helps
1727          * running KVM nested. It also helps cross-vendor migration, because
1728          * Intel's vmentry has a check on the 'G' bit.
1729          */
1730         var->g = s->limit > 0xfffff;
1731
1732         /*
1733          * AMD's VMCB does not have an explicit unusable field, so emulate it
1734          * for cross vendor migration purposes by "not present"
1735          */
1736         var->unusable = !var->present;
1737
1738         switch (seg) {
1739         case VCPU_SREG_TR:
1740                 /*
1741                  * Work around a bug where the busy flag in the tr selector
1742                  * isn't exposed
1743                  */
1744                 var->type |= 0x2;
1745                 break;
1746         case VCPU_SREG_DS:
1747         case VCPU_SREG_ES:
1748         case VCPU_SREG_FS:
1749         case VCPU_SREG_GS:
1750                 /*
1751                  * The accessed bit must always be set in the segment
1752                  * descriptor cache, although it can be cleared in the
1753                  * descriptor, the cached bit always remains at 1. Since
1754                  * Intel has a check on this, set it here to support
1755                  * cross-vendor migration.
1756                  */
1757                 if (!var->unusable)
1758                         var->type |= 0x1;
1759                 break;
1760         case VCPU_SREG_SS:
1761                 /*
1762                  * On AMD CPUs sometimes the DB bit in the segment
1763                  * descriptor is left as 1, although the whole segment has
1764                  * been made unusable. Clear it here to pass an Intel VMX
1765                  * entry check when cross vendor migrating.
1766                  */
1767                 if (var->unusable)
1768                         var->db = 0;
1769                 /* This is symmetric with svm_set_segment() */
1770                 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1771                 break;
1772         }
1773 }
1774
1775 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1776 {
1777         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1778
1779         return save->cpl;
1780 }
1781
1782 static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1783 {
1784         struct kvm_segment cs;
1785
1786         svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1787         *db = cs.db;
1788         *l = cs.l;
1789 }
1790
1791 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1792 {
1793         struct vcpu_svm *svm = to_svm(vcpu);
1794
1795         dt->size = svm->vmcb->save.idtr.limit;
1796         dt->address = svm->vmcb->save.idtr.base;
1797 }
1798
1799 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1800 {
1801         struct vcpu_svm *svm = to_svm(vcpu);
1802
1803         svm->vmcb->save.idtr.limit = dt->size;
1804         svm->vmcb->save.idtr.base = dt->address ;
1805         vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1806 }
1807
1808 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1809 {
1810         struct vcpu_svm *svm = to_svm(vcpu);
1811
1812         dt->size = svm->vmcb->save.gdtr.limit;
1813         dt->address = svm->vmcb->save.gdtr.base;
1814 }
1815
1816 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1817 {
1818         struct vcpu_svm *svm = to_svm(vcpu);
1819
1820         svm->vmcb->save.gdtr.limit = dt->size;
1821         svm->vmcb->save.gdtr.base = dt->address ;
1822         vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1823 }
1824
1825 static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1826 {
1827         struct vcpu_svm *svm = to_svm(vcpu);
1828
1829         /*
1830          * For guests that don't set guest_state_protected, the cr3 update is
1831          * handled via kvm_mmu_load() while entering the guest. For guests
1832          * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1833          * VMCB save area now, since the save area will become the initial
1834          * contents of the VMSA, and future VMCB save area updates won't be
1835          * seen.
1836          */
1837         if (sev_es_guest(vcpu->kvm)) {
1838                 svm->vmcb->save.cr3 = cr3;
1839                 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1840         }
1841 }
1842
1843 static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1844 {
1845         return true;
1846 }
1847
1848 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1849 {
1850         struct vcpu_svm *svm = to_svm(vcpu);
1851         u64 hcr0 = cr0;
1852         bool old_paging = is_paging(vcpu);
1853
1854 #ifdef CONFIG_X86_64
1855         if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
1856                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1857                         vcpu->arch.efer |= EFER_LMA;
1858                         svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1859                 }
1860
1861                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1862                         vcpu->arch.efer &= ~EFER_LMA;
1863                         svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1864                 }
1865         }
1866 #endif
1867         vcpu->arch.cr0 = cr0;
1868
1869         if (!npt_enabled) {
1870                 hcr0 |= X86_CR0_PG | X86_CR0_WP;
1871                 if (old_paging != is_paging(vcpu))
1872                         svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1873         }
1874
1875         /*
1876          * re-enable caching here because the QEMU bios
1877          * does not do it - this results in some delay at
1878          * reboot
1879          */
1880         if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1881                 hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1882
1883         svm->vmcb->save.cr0 = hcr0;
1884         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1885
1886         /*
1887          * SEV-ES guests must always keep the CR intercepts cleared. CR
1888          * tracking is done using the CR write traps.
1889          */
1890         if (sev_es_guest(vcpu->kvm))
1891                 return;
1892
1893         if (hcr0 == cr0) {
1894                 /* Selective CR0 write remains on.  */
1895                 svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1896                 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1897         } else {
1898                 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1899                 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1900         }
1901 }
1902
1903 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1904 {
1905         return true;
1906 }
1907
1908 void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1909 {
1910         unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1911         unsigned long old_cr4 = vcpu->arch.cr4;
1912
1913         if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1914                 svm_flush_tlb_current(vcpu);
1915
1916         vcpu->arch.cr4 = cr4;
1917         if (!npt_enabled) {
1918                 cr4 |= X86_CR4_PAE;
1919
1920                 if (!is_paging(vcpu))
1921                         cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1922         }
1923         cr4 |= host_cr4_mce;
1924         to_svm(vcpu)->vmcb->save.cr4 = cr4;
1925         vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1926
1927         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1928                 kvm_update_cpuid_runtime(vcpu);
1929 }
1930
1931 static void svm_set_segment(struct kvm_vcpu *vcpu,
1932                             struct kvm_segment *var, int seg)
1933 {
1934         struct vcpu_svm *svm = to_svm(vcpu);
1935         struct vmcb_seg *s = svm_seg(vcpu, seg);
1936
1937         s->base = var->base;
1938         s->limit = var->limit;
1939         s->selector = var->selector;
1940         s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1941         s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1942         s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1943         s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1944         s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1945         s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1946         s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1947         s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1948
1949         /*
1950          * This is always accurate, except if SYSRET returned to a segment
1951          * with SS.DPL != 3.  Intel does not have this quirk, and always
1952          * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1953          * would entail passing the CPL to userspace and back.
1954          */
1955         if (seg == VCPU_SREG_SS)
1956                 /* This is symmetric with svm_get_segment() */
1957                 svm->vmcb->save.cpl = (var->dpl & 3);
1958
1959         vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1960 }
1961
1962 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1963 {
1964         struct vcpu_svm *svm = to_svm(vcpu);
1965
1966         clr_exception_intercept(svm, BP_VECTOR);
1967
1968         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1969                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1970                         set_exception_intercept(svm, BP_VECTOR);
1971         }
1972 }
1973
1974 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1975 {
1976         if (sd->next_asid > sd->max_asid) {
1977                 ++sd->asid_generation;
1978                 sd->next_asid = sd->min_asid;
1979                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1980                 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1981         }
1982
1983         svm->current_vmcb->asid_generation = sd->asid_generation;
1984         svm->asid = sd->next_asid++;
1985 }
1986
1987 static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
1988 {
1989         struct vmcb *vmcb = svm->vmcb;
1990
1991         if (svm->vcpu.arch.guest_state_protected)
1992                 return;
1993
1994         if (unlikely(value != vmcb->save.dr6)) {
1995                 vmcb->save.dr6 = value;
1996                 vmcb_mark_dirty(vmcb, VMCB_DR);
1997         }
1998 }
1999
2000 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
2001 {
2002         struct vcpu_svm *svm = to_svm(vcpu);
2003
2004         if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
2005                 return;
2006
2007         get_debugreg(vcpu->arch.db[0], 0);
2008         get_debugreg(vcpu->arch.db[1], 1);
2009         get_debugreg(vcpu->arch.db[2], 2);
2010         get_debugreg(vcpu->arch.db[3], 3);
2011         /*
2012          * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
2013          * because db_interception might need it.  We can do it before vmentry.
2014          */
2015         vcpu->arch.dr6 = svm->vmcb->save.dr6;
2016         vcpu->arch.dr7 = svm->vmcb->save.dr7;
2017         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
2018         set_dr_intercepts(svm);
2019 }
2020
2021 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
2022 {
2023         struct vcpu_svm *svm = to_svm(vcpu);
2024
2025         if (vcpu->arch.guest_state_protected)
2026                 return;
2027
2028         svm->vmcb->save.dr7 = value;
2029         vmcb_mark_dirty(svm->vmcb, VMCB_DR);
2030 }
2031
2032 static int pf_interception(struct kvm_vcpu *vcpu)
2033 {
2034         struct vcpu_svm *svm = to_svm(vcpu);
2035
2036         u64 fault_address = svm->vmcb->control.exit_info_2;
2037         u64 error_code = svm->vmcb->control.exit_info_1;
2038
2039         return kvm_handle_page_fault(vcpu, error_code, fault_address,
2040                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2041                         svm->vmcb->control.insn_bytes : NULL,
2042                         svm->vmcb->control.insn_len);
2043 }
2044
2045 static int npf_interception(struct kvm_vcpu *vcpu)
2046 {
2047         struct vcpu_svm *svm = to_svm(vcpu);
2048
2049         u64 fault_address = svm->vmcb->control.exit_info_2;
2050         u64 error_code = svm->vmcb->control.exit_info_1;
2051
2052         trace_kvm_page_fault(vcpu, fault_address, error_code);
2053         return kvm_mmu_page_fault(vcpu, fault_address, error_code,
2054                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2055                         svm->vmcb->control.insn_bytes : NULL,
2056                         svm->vmcb->control.insn_len);
2057 }
2058
2059 static int db_interception(struct kvm_vcpu *vcpu)
2060 {
2061         struct kvm_run *kvm_run = vcpu->run;
2062         struct vcpu_svm *svm = to_svm(vcpu);
2063
2064         if (!(vcpu->guest_debug &
2065               (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
2066                 !svm->nmi_singlestep) {
2067                 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
2068                 kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
2069                 return 1;
2070         }
2071
2072         if (svm->nmi_singlestep) {
2073                 disable_nmi_singlestep(svm);
2074                 /* Make sure we check for pending NMIs upon entry */
2075                 kvm_make_request(KVM_REQ_EVENT, vcpu);
2076         }
2077
2078         if (vcpu->guest_debug &
2079             (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
2080                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2081                 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
2082                 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
2083                 kvm_run->debug.arch.pc =
2084                         svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2085                 kvm_run->debug.arch.exception = DB_VECTOR;
2086                 return 0;
2087         }
2088
2089         return 1;
2090 }
2091
2092 static int bp_interception(struct kvm_vcpu *vcpu)
2093 {
2094         struct vcpu_svm *svm = to_svm(vcpu);
2095         struct kvm_run *kvm_run = vcpu->run;
2096
2097         kvm_run->exit_reason = KVM_EXIT_DEBUG;
2098         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2099         kvm_run->debug.arch.exception = BP_VECTOR;
2100         return 0;
2101 }
2102
/* #UD intercept: delegate entirely to handle_ud(). */
static int ud_interception(struct kvm_vcpu *vcpu)
{
	return handle_ud(vcpu);
}
2107
2108 static int ac_interception(struct kvm_vcpu *vcpu)
2109 {
2110         kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
2111         return 1;
2112 }
2113
2114 static bool is_erratum_383(void)
2115 {
2116         int err, i;
2117         u64 value;
2118
2119         if (!erratum_383_found)
2120                 return false;
2121
2122         value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2123         if (err)
2124                 return false;
2125
2126         /* Bit 62 may or may not be set for this mce */
2127         value &= ~(1ULL << 62);
2128
2129         if (value != 0xb600000000010015ULL)
2130                 return false;
2131
2132         /* Clear MCi_STATUS registers */
2133         for (i = 0; i < 6; ++i)
2134                 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2135
2136         value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2137         if (!err) {
2138                 u32 low, high;
2139
2140                 value &= ~(1ULL << 2);
2141                 low    = lower_32_bits(value);
2142                 high   = upper_32_bits(value);
2143
2144                 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2145         }
2146
2147         /* Flush tlb to evict multi-match entries */
2148         __flush_tlb_all();
2149
2150         return true;
2151 }
2152
2153 static void svm_handle_mce(struct kvm_vcpu *vcpu)
2154 {
2155         if (is_erratum_383()) {
2156                 /*
2157                  * Erratum 383 triggered. Guest state is corrupt so kill the
2158                  * guest.
2159                  */
2160                 pr_err("Guest triggered AMD Erratum 383\n");
2161
2162                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2163
2164                 return;
2165         }
2166
2167         /*
2168          * On an #MC intercept the MCE handler is not called automatically in
2169          * the host. So do it by hand here.
2170          */
2171         kvm_machine_check();
2172 }
2173
/* #MC exit handler: does no work here and always returns 1. */
static int mc_interception(struct kvm_vcpu *vcpu)
{
	return 1;
}
2178
2179 static int shutdown_interception(struct kvm_vcpu *vcpu)
2180 {
2181         struct kvm_run *kvm_run = vcpu->run;
2182         struct vcpu_svm *svm = to_svm(vcpu);
2183
2184         /*
2185          * The VM save area has already been encrypted so it
2186          * cannot be reinitialized - just terminate.
2187          */
2188         if (sev_es_guest(vcpu->kvm))
2189                 return -EINVAL;
2190
2191         /*
2192          * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
2193          * the VMCB in a known good state.  Unfortuately, KVM doesn't have
2194          * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2195          * userspace.  At a platform view, INIT is acceptable behavior as
2196          * there exist bare metal platforms that automatically INIT the CPU
2197          * in response to shutdown.
2198          */
2199         clear_page(svm->vmcb);
2200         kvm_vcpu_reset(vcpu, true);
2201
2202         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2203         return 0;
2204 }
2205
2206 static int io_interception(struct kvm_vcpu *vcpu)
2207 {
2208         struct vcpu_svm *svm = to_svm(vcpu);
2209         u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2210         int size, in, string;
2211         unsigned port;
2212
2213         ++vcpu->stat.io_exits;
2214         string = (io_info & SVM_IOIO_STR_MASK) != 0;
2215         in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2216         port = io_info >> 16;
2217         size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2218
2219         if (string) {
2220                 if (sev_es_guest(vcpu->kvm))
2221                         return sev_es_string_io(svm, size, port, in);
2222                 else
2223                         return kvm_emulate_instruction(vcpu, 0);
2224         }
2225
2226         svm->next_rip = svm->vmcb->control.exit_info_2;
2227
2228         return kvm_fast_pio(vcpu, size, port, in);
2229 }
2230
/* NMI exit handler: does no work here and always returns 1. */
static int nmi_interception(struct kvm_vcpu *vcpu)
{
	return 1;
}
2235
/* SMI exit handler: does no work here and always returns 1. */
static int smi_interception(struct kvm_vcpu *vcpu)
{
	return 1;
}
2240
2241 static int intr_interception(struct kvm_vcpu *vcpu)
2242 {
2243         ++vcpu->stat.irq_exits;
2244         return 1;
2245 }
2246
2247 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
2248 {
2249         struct vcpu_svm *svm = to_svm(vcpu);
2250         struct vmcb *vmcb12;
2251         struct kvm_host_map map;
2252         int ret;
2253
2254         if (nested_svm_check_permissions(vcpu))
2255                 return 1;
2256
2257         ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2258         if (ret) {
2259                 if (ret == -EINVAL)
2260                         kvm_inject_gp(vcpu, 0);
2261                 return 1;
2262         }
2263
2264         vmcb12 = map.hva;
2265
2266         ret = kvm_skip_emulated_instruction(vcpu);
2267
2268         if (vmload) {
2269                 svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
2270                 svm->sysenter_eip_hi = 0;
2271                 svm->sysenter_esp_hi = 0;
2272         } else {
2273                 svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
2274         }
2275
2276         kvm_vcpu_unmap(vcpu, &map, true);
2277
2278         return ret;
2279 }
2280
2281 static int vmload_interception(struct kvm_vcpu *vcpu)
2282 {
2283         return vmload_vmsave_interception(vcpu, true);
2284 }
2285
2286 static int vmsave_interception(struct kvm_vcpu *vcpu)
2287 {
2288         return vmload_vmsave_interception(vcpu, false);
2289 }
2290
/*
 * VMRUN intercept: return 1 if the nested SVM permission check fails,
 * otherwise hand over to nested_svm_vmrun().
 */
static int vmrun_interception(struct kvm_vcpu *vcpu)
{
	if (!nested_svm_check_permissions(vcpu))
		return nested_svm_vmrun(vcpu);

	return 1;
}
2298
/*
 * Decode results for SVM instructions.  The non-zero values double as
 * indices into the lookup tables in emulate_svm_instr().
 */
enum {
	NONE_SVM_INSTR   = 0,
	SVM_INSTR_VMRUN  = 1,
	SVM_INSTR_VMLOAD = 2,
	SVM_INSTR_VMSAVE = 3,
};
2305
2306 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2307 static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2308 {
2309         struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2310
2311         if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2312                 return NONE_SVM_INSTR;
2313
2314         switch (ctxt->modrm) {
2315         case 0xd8: /* VMRUN */
2316                 return SVM_INSTR_VMRUN;
2317         case 0xda: /* VMLOAD */
2318                 return SVM_INSTR_VMLOAD;
2319         case 0xdb: /* VMSAVE */
2320                 return SVM_INSTR_VMSAVE;
2321         default:
2322                 break;
2323         }
2324
2325         return NONE_SVM_INSTR;
2326 }
2327
2328 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2329 {
2330         const int guest_mode_exit_codes[] = {
2331                 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2332                 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2333                 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2334         };
2335         int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
2336                 [SVM_INSTR_VMRUN] = vmrun_interception,
2337                 [SVM_INSTR_VMLOAD] = vmload_interception,
2338                 [SVM_INSTR_VMSAVE] = vmsave_interception,
2339         };
2340         struct vcpu_svm *svm = to_svm(vcpu);
2341         int ret;
2342
2343         if (is_guest_mode(vcpu)) {
2344                 /* Returns '1' or -errno on failure, '0' on success. */
2345                 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2346                 if (ret)
2347                         return ret;
2348                 return 1;
2349         }
2350         return svm_instr_handlers[opcode](vcpu);
2351 }
2352
2353 /*
2354  * #GP handling code. Note that #GP can be triggered under the following two
2355  * cases:
2356  *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2357  *      some AMD CPUs when EAX of these instructions are in the reserved memory
2358  *      regions (e.g. SMM memory on host).
2359  *   2) VMware backdoor
2360  */
2361 static int gp_interception(struct kvm_vcpu *vcpu)
2362 {
2363         struct vcpu_svm *svm = to_svm(vcpu);
2364         u32 error_code = svm->vmcb->control.exit_info_1;
2365         int opcode;
2366
2367         /* Both #GP cases have zero error_code */
2368         if (error_code)
2369                 goto reinject;
2370
2371         /* Decode the instruction for usage later */
2372         if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2373                 goto reinject;
2374
2375         opcode = svm_instr_opcode(vcpu);
2376
2377         if (opcode == NONE_SVM_INSTR) {
2378                 if (!enable_vmware_backdoor)
2379                         goto reinject;
2380
2381                 /*
2382                  * VMware backdoor emulation on #GP interception only handles
2383                  * IN{S}, OUT{S}, and RDPMC.
2384                  */
2385                 if (!is_guest_mode(vcpu))
2386                         return kvm_emulate_instruction(vcpu,
2387                                 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2388         } else {
2389                 /* All SVM instructions expect page aligned RAX */
2390                 if (svm->vmcb->save.rax & ~PAGE_MASK)
2391                         goto reinject;
2392
2393                 return emulate_svm_instr(vcpu, opcode);
2394         }
2395
2396 reinject:
2397         kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2398         return 1;
2399 }
2400
2401 void svm_set_gif(struct vcpu_svm *svm, bool value)
2402 {
2403         if (value) {
2404                 /*
2405                  * If VGIF is enabled, the STGI intercept is only added to
2406                  * detect the opening of the SMI/NMI window; remove it now.
2407                  * Likewise, clear the VINTR intercept, we will set it
2408                  * again while processing KVM_REQ_EVENT if needed.
2409                  */
2410                 if (vgif)
2411                         svm_clr_intercept(svm, INTERCEPT_STGI);
2412                 if (svm_is_intercept(svm, INTERCEPT_VINTR))
2413                         svm_clear_vintr(svm);
2414
2415                 enable_gif(svm);
2416                 if (svm->vcpu.arch.smi_pending ||
2417                     svm->vcpu.arch.nmi_pending ||
2418                     kvm_cpu_has_injectable_intr(&svm->vcpu) ||
2419                     kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
2420                         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2421         } else {
2422                 disable_gif(svm);
2423
2424                 /*
2425                  * After a CLGI no interrupts should come.  But if vGIF is
2426                  * in use, we still rely on the VINTR intercept (rather than
2427                  * STGI) to detect an open interrupt window.
2428                 */
2429                 if (!vgif)
2430                         svm_clear_vintr(svm);
2431         }
2432 }
2433
2434 static int stgi_interception(struct kvm_vcpu *vcpu)
2435 {
2436         int ret;
2437
2438         if (nested_svm_check_permissions(vcpu))
2439                 return 1;
2440
2441         ret = kvm_skip_emulated_instruction(vcpu);
2442         svm_set_gif(to_svm(vcpu), true);
2443         return ret;
2444 }
2445
2446 static int clgi_interception(struct kvm_vcpu *vcpu)
2447 {
2448         int ret;
2449
2450         if (nested_svm_check_permissions(vcpu))
2451                 return 1;
2452
2453         ret = kvm_skip_emulated_instruction(vcpu);
2454         svm_set_gif(to_svm(vcpu), false);
2455         return ret;
2456 }
2457
2458 static int invlpga_interception(struct kvm_vcpu *vcpu)
2459 {
2460         gva_t gva = kvm_rax_read(vcpu);
2461         u32 asid = kvm_rcx_read(vcpu);
2462
2463         /* FIXME: Handle an address size prefix. */
2464         if (!is_long_mode(vcpu))
2465                 gva = (u32)gva;
2466
2467         trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
2468
2469         /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2470         kvm_mmu_invlpg(vcpu, gva);
2471
2472         return kvm_skip_emulated_instruction(vcpu);
2473 }
2474
2475 static int skinit_interception(struct kvm_vcpu *vcpu)
2476 {
2477         trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
2478
2479         kvm_queue_exception(vcpu, UD_VECTOR);
2480         return 1;
2481 }
2482
2483 static int task_switch_interception(struct kvm_vcpu *vcpu)
2484 {
2485         struct vcpu_svm *svm = to_svm(vcpu);
2486         u16 tss_selector;
2487         int reason;
2488         int int_type = svm->vmcb->control.exit_int_info &
2489                 SVM_EXITINTINFO_TYPE_MASK;
2490         int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2491         uint32_t type =
2492                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2493         uint32_t idt_v =
2494                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2495         bool has_error_code = false;
2496         u32 error_code = 0;
2497
2498         tss_selector = (u16)svm->vmcb->control.exit_info_1;
2499
2500         if (svm->vmcb->control.exit_info_2 &
2501             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2502                 reason = TASK_SWITCH_IRET;
2503         else if (svm->vmcb->control.exit_info_2 &
2504                  (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2505                 reason = TASK_SWITCH_JMP;
2506         else if (idt_v)
2507                 reason = TASK_SWITCH_GATE;
2508         else
2509                 reason = TASK_SWITCH_CALL;
2510
2511         if (reason == TASK_SWITCH_GATE) {
2512                 switch (type) {
2513                 case SVM_EXITINTINFO_TYPE_NMI:
2514                         vcpu->arch.nmi_injected = false;
2515                         break;
2516                 case SVM_EXITINTINFO_TYPE_EXEPT:
2517                         if (svm->vmcb->control.exit_info_2 &
2518                             (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2519                                 has_error_code = true;
2520                                 error_code =
2521                                         (u32)svm->vmcb->control.exit_info_2;
2522                         }
2523                         kvm_clear_exception_queue(vcpu);
2524                         break;
2525                 case SVM_EXITINTINFO_TYPE_INTR:
2526                 case SVM_EXITINTINFO_TYPE_SOFT:
2527                         kvm_clear_interrupt_queue(vcpu);
2528                         break;
2529                 default:
2530                         break;
2531                 }
2532         }
2533
2534         if (reason != TASK_SWITCH_GATE ||
2535             int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2536             (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2537              (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2538                 if (!svm_skip_emulated_instruction(vcpu))
2539                         return 0;
2540         }
2541
2542         if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2543                 int_vec = -1;
2544
2545         return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
2546                                has_error_code, error_code);
2547 }
2548
2549 static void svm_clr_iret_intercept(struct vcpu_svm *svm)
2550 {
2551         if (!sev_es_guest(svm->vcpu.kvm))
2552                 svm_clr_intercept(svm, INTERCEPT_IRET);
2553 }
2554
2555 static void svm_set_iret_intercept(struct vcpu_svm *svm)
2556 {
2557         if (!sev_es_guest(svm->vcpu.kvm))
2558                 svm_set_intercept(svm, INTERCEPT_IRET);
2559 }
2560
2561 static int iret_interception(struct kvm_vcpu *vcpu)
2562 {
2563         struct vcpu_svm *svm = to_svm(vcpu);
2564
2565         WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
2566
2567         ++vcpu->stat.nmi_window_exits;
2568         svm->awaiting_iret_completion = true;
2569
2570         svm_clr_iret_intercept(svm);
2571         svm->nmi_iret_rip = kvm_rip_read(vcpu);
2572
2573         kvm_make_request(KVM_REQ_EVENT, vcpu);
2574         return 1;
2575 }
2576
2577 static int invlpg_interception(struct kvm_vcpu *vcpu)
2578 {
2579         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2580                 return kvm_emulate_instruction(vcpu, 0);
2581
2582         kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2583         return kvm_skip_emulated_instruction(vcpu);
2584 }
2585
/*
 * Generic intercept handler: run the intercepted instruction through
 * kvm_emulate_instruction() with an emulation type of 0 and return its result.
 */
static int emulate_on_interception(struct kvm_vcpu *vcpu)
{
        const int emulation_type = 0;

        return kvm_emulate_instruction(vcpu, emulation_type);
}
2590
2591 static int rsm_interception(struct kvm_vcpu *vcpu)
2592 {
2593         return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2594 }
2595
2596 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2597                                             unsigned long val)
2598 {
2599         struct vcpu_svm *svm = to_svm(vcpu);
2600         unsigned long cr0 = vcpu->arch.cr0;
2601         bool ret = false;
2602
2603         if (!is_guest_mode(vcpu) ||
2604             (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2605                 return false;
2606
2607         cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2608         val &= ~SVM_CR0_SELECTIVE_MASK;
2609
2610         if (cr0 ^ val) {
2611                 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2612                 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2613         }
2614
2615         return ret;
2616 }
2617
2618 #define CR_VALID (1ULL << 63)
2619
2620 static int cr_interception(struct kvm_vcpu *vcpu)
2621 {
2622         struct vcpu_svm *svm = to_svm(vcpu);
2623         int reg, cr;
2624         unsigned long val;
2625         int err;
2626
2627         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2628                 return emulate_on_interception(vcpu);
2629
2630         if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2631                 return emulate_on_interception(vcpu);
2632
2633         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2634         if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2635                 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2636         else
2637                 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2638
2639         err = 0;
2640         if (cr >= 16) { /* mov to cr */
2641                 cr -= 16;
2642                 val = kvm_register_read(vcpu, reg);
2643                 trace_kvm_cr_write(cr, val);
2644                 switch (cr) {
2645                 case 0:
2646                         if (!check_selective_cr0_intercepted(vcpu, val))
2647                                 err = kvm_set_cr0(vcpu, val);
2648                         else
2649                                 return 1;
2650
2651                         break;
2652                 case 3:
2653                         err = kvm_set_cr3(vcpu, val);
2654                         break;
2655                 case 4:
2656                         err = kvm_set_cr4(vcpu, val);
2657                         break;
2658                 case 8:
2659                         err = kvm_set_cr8(vcpu, val);
2660                         break;
2661                 default:
2662                         WARN(1, "unhandled write to CR%d", cr);
2663                         kvm_queue_exception(vcpu, UD_VECTOR);
2664                         return 1;
2665                 }
2666         } else { /* mov from cr */
2667                 switch (cr) {
2668                 case 0:
2669                         val = kvm_read_cr0(vcpu);
2670                         break;
2671                 case 2:
2672                         val = vcpu->arch.cr2;
2673                         break;
2674                 case 3:
2675                         val = kvm_read_cr3(vcpu);
2676                         break;
2677                 case 4:
2678                         val = kvm_read_cr4(vcpu);
2679                         break;
2680                 case 8:
2681                         val = kvm_get_cr8(vcpu);
2682                         break;
2683                 default:
2684                         WARN(1, "unhandled read from CR%d", cr);
2685                         kvm_queue_exception(vcpu, UD_VECTOR);
2686                         return 1;
2687                 }
2688                 kvm_register_write(vcpu, reg, val);
2689                 trace_kvm_cr_read(cr, val);
2690         }
2691         return kvm_complete_insn_gp(vcpu, err);
2692 }
2693
2694 static int cr_trap(struct kvm_vcpu *vcpu)
2695 {
2696         struct vcpu_svm *svm = to_svm(vcpu);
2697         unsigned long old_value, new_value;
2698         unsigned int cr;
2699         int ret = 0;
2700
2701         new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2702
2703         cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2704         switch (cr) {
2705         case 0:
2706                 old_value = kvm_read_cr0(vcpu);
2707                 svm_set_cr0(vcpu, new_value);
2708
2709                 kvm_post_set_cr0(vcpu, old_value, new_value);
2710                 break;
2711         case 4:
2712                 old_value = kvm_read_cr4(vcpu);
2713                 svm_set_cr4(vcpu, new_value);
2714
2715                 kvm_post_set_cr4(vcpu, old_value, new_value);
2716                 break;
2717         case 8:
2718                 ret = kvm_set_cr8(vcpu, new_value);
2719                 break;
2720         default:
2721                 WARN(1, "unhandled CR%d write trap", cr);
2722                 kvm_queue_exception(vcpu, UD_VECTOR);
2723                 return 1;
2724         }
2725
2726         return kvm_complete_insn_gp(vcpu, ret);
2727 }
2728
2729 static int dr_interception(struct kvm_vcpu *vcpu)
2730 {
2731         struct vcpu_svm *svm = to_svm(vcpu);
2732         int reg, dr;
2733         unsigned long val;
2734         int err = 0;
2735
2736         /*
2737          * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT
2738          * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early.
2739          */
2740         if (sev_es_guest(vcpu->kvm))
2741                 return 1;
2742
2743         if (vcpu->guest_debug == 0) {
2744                 /*
2745                  * No more DR vmexits; force a reload of the debug registers
2746                  * and reenter on this instruction.  The next vmexit will
2747                  * retrieve the full state of the debug registers.
2748                  */
2749                 clr_dr_intercepts(svm);
2750                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2751                 return 1;
2752         }
2753
2754         if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2755                 return emulate_on_interception(vcpu);
2756
2757         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2758         dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2759         if (dr >= 16) { /* mov to DRn  */
2760                 dr -= 16;
2761                 val = kvm_register_read(vcpu, reg);
2762                 err = kvm_set_dr(vcpu, dr, val);
2763         } else {
2764                 kvm_get_dr(vcpu, dr, &val);
2765                 kvm_register_write(vcpu, reg, val);
2766         }
2767
2768         return kvm_complete_insn_gp(vcpu, err);
2769 }
2770
2771 static int cr8_write_interception(struct kvm_vcpu *vcpu)
2772 {
2773         int r;
2774
2775         u8 cr8_prev = kvm_get_cr8(vcpu);
2776         /* instruction emulation calls kvm_set_cr8() */
2777         r = cr_interception(vcpu);
2778         if (lapic_in_kernel(vcpu))
2779                 return r;
2780         if (cr8_prev <= kvm_get_cr8(vcpu))
2781                 return r;
2782         vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2783         return 0;
2784 }
2785
2786 static int efer_trap(struct kvm_vcpu *vcpu)
2787 {
2788         struct msr_data msr_info;
2789         int ret;
2790
2791         /*
2792          * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2793          * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2794          * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2795          * the guest doesn't have X86_FEATURE_SVM.
2796          */
2797         msr_info.host_initiated = false;
2798         msr_info.index = MSR_EFER;
2799         msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2800         ret = kvm_set_msr_common(vcpu, &msr_info);
2801
2802         return kvm_complete_insn_gp(vcpu, ret);
2803 }
2804
2805 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2806 {
2807         msr->data = 0;
2808
2809         switch (msr->index) {
2810         case MSR_AMD64_DE_CFG:
2811                 if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
2812                         msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
2813                 break;
2814         default:
2815                 return KVM_MSR_RET_INVALID;
2816         }
2817
2818         return 0;
2819 }
2820
2821 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2822 {
2823         struct vcpu_svm *svm = to_svm(vcpu);
2824
2825         switch (msr_info->index) {
2826         case MSR_AMD64_TSC_RATIO:
2827                 if (!msr_info->host_initiated &&
2828                     !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR))
2829                         return 1;
2830                 msr_info->data = svm->tsc_ratio_msr;
2831                 break;
2832         case MSR_STAR:
2833                 msr_info->data = svm->vmcb01.ptr->save.star;
2834                 break;
2835 #ifdef CONFIG_X86_64
2836         case MSR_LSTAR:
2837                 msr_info->data = svm->vmcb01.ptr->save.lstar;
2838                 break;
2839         case MSR_CSTAR:
2840                 msr_info->data = svm->vmcb01.ptr->save.cstar;
2841                 break;
2842         case MSR_KERNEL_GS_BASE:
2843                 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
2844                 break;
2845         case MSR_SYSCALL_MASK:
2846                 msr_info->data = svm->vmcb01.ptr->save.sfmask;
2847                 break;
2848 #endif
2849         case MSR_IA32_SYSENTER_CS:
2850                 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
2851                 break;
2852         case MSR_IA32_SYSENTER_EIP:
2853                 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2854                 if (guest_cpuid_is_intel(vcpu))
2855                         msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
2856                 break;
2857         case MSR_IA32_SYSENTER_ESP:
2858                 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2859                 if (guest_cpuid_is_intel(vcpu))
2860                         msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
2861                 break;
2862         case MSR_TSC_AUX:
2863                 msr_info->data = svm->tsc_aux;
2864                 break;
2865         case MSR_IA32_DEBUGCTLMSR:
2866                 msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
2867                 break;
2868         case MSR_IA32_LASTBRANCHFROMIP:
2869                 msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
2870                 break;
2871         case MSR_IA32_LASTBRANCHTOIP:
2872                 msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
2873                 break;
2874         case MSR_IA32_LASTINTFROMIP:
2875                 msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
2876                 break;
2877         case MSR_IA32_LASTINTTOIP:
2878                 msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
2879                 break;
2880         case MSR_VM_HSAVE_PA:
2881                 msr_info->data = svm->nested.hsave_msr;
2882                 break;
2883         case MSR_VM_CR:
2884                 msr_info->data = svm->nested.vm_cr_msr;
2885                 break;
2886         case MSR_IA32_SPEC_CTRL:
2887                 if (!msr_info->host_initiated &&
2888                     !guest_has_spec_ctrl_msr(vcpu))
2889                         return 1;
2890
2891                 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2892                         msr_info->data = svm->vmcb->save.spec_ctrl;
2893                 else
2894                         msr_info->data = svm->spec_ctrl;
2895                 break;
2896         case MSR_AMD64_VIRT_SPEC_CTRL:
2897                 if (!msr_info->host_initiated &&
2898                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2899                         return 1;
2900
2901                 msr_info->data = svm->virt_spec_ctrl;
2902                 break;
2903         case MSR_F15H_IC_CFG: {
2904
2905                 int family, model;
2906
2907                 family = guest_cpuid_family(vcpu);
2908                 model  = guest_cpuid_model(vcpu);
2909
2910                 if (family < 0 || model < 0)
2911                         return kvm_get_msr_common(vcpu, msr_info);
2912
2913                 msr_info->data = 0;
2914
2915                 if (family == 0x15 &&
2916                     (model >= 0x2 && model < 0x20))
2917                         msr_info->data = 0x1E;
2918                 }
2919                 break;
2920         case MSR_AMD64_DE_CFG:
2921                 msr_info->data = svm->msr_decfg;
2922                 break;
2923         default:
2924                 return kvm_get_msr_common(vcpu, msr_info);
2925         }
2926         return 0;
2927 }
2928
2929 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2930 {
2931         struct vcpu_svm *svm = to_svm(vcpu);
2932         if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2933                 return kvm_complete_insn_gp(vcpu, err);
2934
2935         ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2936         ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
2937                                 X86_TRAP_GP |
2938                                 SVM_EVTINJ_TYPE_EXEPT |
2939                                 SVM_EVTINJ_VALID);
2940         return 1;
2941 }
2942
2943 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2944 {
2945         struct vcpu_svm *svm = to_svm(vcpu);
2946         int svm_dis, chg_mask;
2947
2948         if (data & ~SVM_VM_CR_VALID_MASK)
2949                 return 1;
2950
2951         chg_mask = SVM_VM_CR_VALID_MASK;
2952
2953         if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2954                 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2955
2956         svm->nested.vm_cr_msr &= ~chg_mask;
2957         svm->nested.vm_cr_msr |= (data & chg_mask);
2958
2959         svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2960
2961         /* check for svm_disable while efer.svme is set */
2962         if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2963                 return 1;
2964
2965         return 0;
2966 }
2967
/*
 * Write msr->data to the MSR selected by msr->index.
 *
 * The MSRs listed in the switch are applied to this vCPU's SVM state; every
 * other index is handed to kvm_set_msr_common().  Several checks apply only
 * when the write is not host-initiated (msr->host_initiated is false).
 *
 * Returns 0 on success and 1 when the write is rejected.  Results of
 * kvm_set_msr_common(), kvm_set_user_return_msr() and svm_set_vm_cr() are
 * returned as they are.
 */
2968 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2969 {
2970         struct vcpu_svm *svm = to_svm(vcpu);
2971         int ret = 0;
2972
2973         u32 ecx = msr->index;
2974         u64 data = msr->data;
2975         switch (ecx) {
2976         case MSR_AMD64_TSC_RATIO:
2977
2978                 if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) {
2979
2980                         if (!msr->host_initiated)
2981                                 return 1;
2982                         /*
2983                          * In case TSC scaling is not enabled, always
2984                          * leave this MSR at the default value.
2985                          *
2986                          * Due to bug in qemu 6.2.0, it would try to set
2987                          * this msr to 0 if tsc scaling is not enabled.
2988                          * Ignore this value as well.
2989                          */
2990                         if (data != 0 && data != svm->tsc_ratio_msr)
2991                                 return 1;
2992                         break;
2993                 }
2994
2995                 if (data & SVM_TSC_RATIO_RSVD)
2996                         return 1;
2997
2998                 svm->tsc_ratio_msr = data;
2999
                     /*
                      * NOTE(review): guest_can_use() is necessarily true here,
                      * since every path through the !guest_can_use() branch
                      * above returns or breaks.
                      */
3000                 if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
3001                     is_guest_mode(vcpu))
3002                         nested_svm_update_tsc_ratio_msr(vcpu);
3003
3004                 break;
3005         case MSR_IA32_CR_PAT:
                     /*
                      * Common code handles the write first; the value is
                      * mirrored into vmcb01's g_pat only if that succeeded.
                      */
3006                 ret = kvm_set_msr_common(vcpu, msr);
3007                 if (ret)
3008                         break;
3009
3010                 svm->vmcb01.ptr->save.g_pat = data;
3011                 if (is_guest_mode(vcpu))
3012                         nested_vmcb02_compute_g_pat(svm);
3013                 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
3014                 break;
3015         case MSR_IA32_SPEC_CTRL:
3016                 if (!msr->host_initiated &&
3017                     !guest_has_spec_ctrl_msr(vcpu))
3018                         return 1;
3019
3020                 if (kvm_spec_ctrl_test_value(data))
3021                         return 1;
3022
3023                 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3024                         svm->vmcb->save.spec_ctrl = data;
3025                 else
3026                         svm->spec_ctrl = data;
                     /* A zero write is stored but does not change interception. */
3027                 if (!data)
3028                         break;
3029
3030                 /*
3031                  * For non-nested:
3032                  * When it's written (to non-zero) for the first time, pass
3033                  * it through.
3034                  *
3035                  * For nested:
3036                  * The handling of the MSR bitmap for L2 guests is done in
3037                  * nested_svm_vmrun_msrpm.
3038                  * We update the L1 MSR bit as well since it will end up
3039                  * touching the MSR anyway now.
3040                  */
3041                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
3042                 break;
3043         case MSR_AMD64_VIRT_SPEC_CTRL:
3044                 if (!msr->host_initiated &&
3045                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
3046                         return 1;
3047
                     /* SPEC_CTRL_SSBD is the only bit accepted. */
3048                 if (data & ~SPEC_CTRL_SSBD)
3049                         return 1;
3050
3051                 svm->virt_spec_ctrl = data;
3052                 break;
3053         case MSR_STAR:
3054                 svm->vmcb01.ptr->save.star = data;
3055                 break;
3056 #ifdef CONFIG_X86_64
3057         case MSR_LSTAR:
3058                 svm->vmcb01.ptr->save.lstar = data;
3059                 break;
3060         case MSR_CSTAR:
3061                 svm->vmcb01.ptr->save.cstar = data;
3062                 break;
3063         case MSR_KERNEL_GS_BASE:
3064                 svm->vmcb01.ptr->save.kernel_gs_base = data;
3065                 break;
3066         case MSR_SYSCALL_MASK:
3067                 svm->vmcb01.ptr->save.sfmask = data;
3068                 break;
3069 #endif
3070         case MSR_IA32_SYSENTER_CS:
3071                 svm->vmcb01.ptr->save.sysenter_cs = data;
3072                 break;
3073         case MSR_IA32_SYSENTER_EIP:
3074                 svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
3075                 /*
3076                  * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
3077                  * when we spoof an Intel vendor ID (for cross vendor migration).
3078                  * In this case we use this intercept to track the high
3079                  * 32 bit part of these msrs to support Intel's
3080                  * implementation of SYSENTER/SYSEXIT.
3081                  */
3082                 svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
3083                 break;
3084         case MSR_IA32_SYSENTER_ESP:
                     /* Same low/high split as MSR_IA32_SYSENTER_EIP above. */
3085                 svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
3086                 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
3087                 break;
3088         case MSR_TSC_AUX:
3089                 /*
3090                  * TSC_AUX is usually changed only during boot and never read
3091                  * directly.  Intercept TSC_AUX instead of exposing it to the
3092                  * guest via direct_access_msrs, and switch it via user return.
3093                  */
3094                 preempt_disable();
3095                 ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
3096                 preempt_enable();
3097                 if (ret)
3098                         break;
3099
3100                 svm->tsc_aux = data;
3101                 break;
3102         case MSR_IA32_DEBUGCTLMSR:
                     /*
                      * With lbrv off the write is only passed to
                      * kvm_pr_unimpl_wrmsr() and success (ret == 0) is returned.
                      */
3103                 if (!lbrv) {
3104                         kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3105                         break;
3106                 }
3107                 if (data & DEBUGCTL_RESERVED_BITS)
3108                         return 1;
3109
3110                 svm_get_lbr_vmcb(svm)->save.dbgctl = data;
3111                 svm_update_lbrv(vcpu);
3112                 break;
3113         case MSR_VM_HSAVE_PA:
3114                 /*
3115                  * Old kernels did not validate the value written to
3116                  * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
3117                  * value to allow live migrating buggy or malicious guests
3118                  * originating from those kernels.
3119                  */
3120                 if (!msr->host_initiated && !page_address_valid(vcpu, data))
3121                         return 1;
3122
3123                 svm->nested.hsave_msr = data & PAGE_MASK;
3124                 break;
3125         case MSR_VM_CR:
3126                 return svm_set_vm_cr(vcpu, data);
3127         case MSR_VM_IGNNE:
3128                 kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3129                 break;
3130         case MSR_AMD64_DE_CFG: {
3131                 struct kvm_msr_entry msr_entry;
3132
                     /* Look up the supported bits via svm_get_msr_feature(). */
3133                 msr_entry.index = msr->index;
3134                 if (svm_get_msr_feature(&msr_entry))
3135                         return 1;
3136
3137                 /* Check the supported bits */
3138                 if (data & ~msr_entry.data)
3139                         return 1;
3140
3141                 /* Don't allow the guest to change a bit, #GP */
3142                 if (!msr->host_initiated && (data ^ msr_entry.data))
3143                         return 1;
3144
3145                 svm->msr_decfg = data;
3146                 break;
3147         }
3148         default:
3149                 return kvm_set_msr_common(vcpu, msr);
3150         }
3151         return ret;
3152 }
3153
3154 static int msr_interception(struct kvm_vcpu *vcpu)
3155 {
3156         if (to_svm(vcpu)->vmcb->control.exit_info_1)
3157                 return kvm_emulate_wrmsr(vcpu);
3158         else
3159                 return kvm_emulate_rdmsr(vcpu);
3160 }
3161
3162 static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3163 {
3164         kvm_make_request(KVM_REQ_EVENT, vcpu);
3165         svm_clear_vintr(to_svm(vcpu));
3166
3167         /*
3168          * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
3169          * In this case AVIC was temporarily disabled for
3170          * requesting the IRQ window and we have to re-enable it.
3171          *
3172          * If running nested, still remove the VM wide AVIC inhibit to
3173          * support case in which the interrupt window was requested when the
3174          * vCPU was not running nested.
3175
3176          * All vCPUs which run still run nested, will remain to have their
3177          * AVIC still inhibited due to per-cpu AVIC inhibition.
3178          */
3179         kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3180
3181         ++vcpu->stat.irq_window_exits;
3182         return 1;
3183 }
3184
3185 static int pause_interception(struct kvm_vcpu *vcpu)
3186 {
3187         bool in_kernel;
3188         /*
3189          * CPL is not made available for an SEV-ES guest, therefore
3190          * vcpu->arch.preempted_in_kernel can never be true.  Just
3191          * set in_kernel to false as well.
3192          */
3193         in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3194
3195         grow_ple_window(vcpu);
3196
3197         kvm_vcpu_on_spin(vcpu, in_kernel);
3198         return kvm_skip_emulated_instruction(vcpu);
3199 }
3200
3201 static int invpcid_interception(struct kvm_vcpu *vcpu)
3202 {
3203         struct vcpu_svm *svm = to_svm(vcpu);
3204         unsigned long type;
3205         gva_t gva;
3206
3207         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3208                 kvm_queue_exception(vcpu, UD_VECTOR);
3209                 return 1;
3210         }
3211
3212         /*
3213          * For an INVPCID intercept:
3214          * EXITINFO1 provides the linear address of the memory operand.
3215          * EXITINFO2 provides the contents of the register operand.
3216          */
3217         type = svm->vmcb->control.exit_info_2;
3218         gva = svm->vmcb->control.exit_info_1;
3219
3220         return kvm_handle_invpcid(vcpu, type, gva);
3221 }
3222
/*
 * Table of exit handlers, indexed by SVM exit code.  Each entry is a function
 * taking the vCPU and returning int.  Exit codes without an entry are left
 * NULL by the designated initializers.
 *
 * NOTE(review): the code that looks handlers up in this table is outside this
 * view; presumably it must cope with NULL and out-of-range entries - confirm.
 */
3223 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
             /* Control register accesses (CR8 writes have their own wrapper) */
3224         [SVM_EXIT_READ_CR0]                     = cr_interception,
3225         [SVM_EXIT_READ_CR3]                     = cr_interception,
3226         [SVM_EXIT_READ_CR4]                     = cr_interception,
3227         [SVM_EXIT_READ_CR8]                     = cr_interception,
3228         [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
3229         [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3230         [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3231         [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3232         [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
             /* Debug register accesses */
3233         [SVM_EXIT_READ_DR0]                     = dr_interception,
3234         [SVM_EXIT_READ_DR1]                     = dr_interception,
3235         [SVM_EXIT_READ_DR2]                     = dr_interception,
3236         [SVM_EXIT_READ_DR3]                     = dr_interception,
3237         [SVM_EXIT_READ_DR4]                     = dr_interception,
3238         [SVM_EXIT_READ_DR5]                     = dr_interception,
3239         [SVM_EXIT_READ_DR6]                     = dr_interception,
3240         [SVM_EXIT_READ_DR7]                     = dr_interception,
3241         [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3242         [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3243         [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3244         [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3245         [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3246         [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3247         [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3248         [SVM_EXIT_WRITE_DR7]                    = dr_interception,
             /* Exceptions, indexed by SVM_EXIT_EXCP_BASE + vector */
3249         [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3250         [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3251         [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3252         [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3253         [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3254         [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
3255         [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
             /* Interrupts and intercepted instructions */
3256         [SVM_EXIT_INTR]                         = intr_interception,
3257         [SVM_EXIT_NMI]                          = nmi_interception,
3258         [SVM_EXIT_SMI]                          = smi_interception,
3259         [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3260         [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
3261         [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
3262         [SVM_EXIT_IRET]                         = iret_interception,
3263         [SVM_EXIT_INVD]                         = kvm_emulate_invd,
3264         [SVM_EXIT_PAUSE]                        = pause_interception,
3265         [SVM_EXIT_HLT]                          = kvm_emulate_halt,
3266         [SVM_EXIT_INVLPG]                       = invlpg_interception,
3267         [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3268         [SVM_EXIT_IOIO]                         = io_interception,
3269         [SVM_EXIT_MSR]                          = msr_interception,
3270         [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3271         [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3272         [SVM_EXIT_VMRUN]                        = vmrun_interception,
3273         [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
3274         [SVM_EXIT_VMLOAD]                       = vmload_interception,
3275         [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3276         [SVM_EXIT_STGI]                         = stgi_interception,
3277         [SVM_EXIT_CLGI]                         = clgi_interception,
3278         [SVM_EXIT_SKINIT]                       = skinit_interception,
3279         [SVM_EXIT_RDTSCP]                       = kvm_handle_invalid_op,
3280         [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
3281         [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
3282         [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
3283         [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
3284         [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
             /* Write traps (handlers take the written value from EXITINFO1) */
3285         [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
3286         [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
3287         [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
3288         [SVM_EXIT_CR8_WRITE_TRAP]               = cr_trap,
3289         [SVM_EXIT_INVPCID]                      = invpcid_interception,
3290         [SVM_EXIT_NPF]                          = npf_interception,
3291         [SVM_EXIT_RSM]                          = rsm_interception,
3292         [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
3293         [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
3294         [SVM_EXIT_VMGEXIT]                      = sev_handle_vmgexit,
3295 };
3296
3297 static void dump_vmcb(struct kvm_vcpu *vcpu)
3298 {
3299         struct vcpu_svm *svm = to_svm(vcpu);
3300         struct vmcb_control_area *control = &svm->vmcb->control;
3301         struct vmcb_save_area *save = &svm->vmcb->save;
3302         struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3303
3304         if (!dump_invalid_vmcb) {
3305                 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3306                 return;
3307         }
3308
3309         pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3310                svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3311         pr_err("VMCB Control Area:\n");
3312         pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3313         pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
3314         pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3315         pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
3316         pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
3317         pr_err("%-20s%08x %08x\n", "intercepts:",
3318               control->intercepts[INTERCEPT_WORD3],
3319                control->intercepts[INTERCEPT_WORD4]);
3320         pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3321         pr_err("%-20s%d\n", "pause filter threshold:",
3322                control->pause_filter_thresh);
3323         pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3324         pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3325         pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3326         pr_err("%-20s%d\n", "asid:", control->asid);
3327         pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3328         pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3329         pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3330         pr_err("%-20s%08x\n", "int_state:", control->int_state);
3331         pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3332         pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3333         pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3334         pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3335         pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3336         pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3337         pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3338         pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3339         pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
3340         pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3341         pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3342         pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3343         pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3344         pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3345         pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3346         pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3347         pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3348         pr_err("VMCB State Save Area:\n");
3349         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3350                "es:",
3351                save->es.selector, save->es.attrib,
3352                save->es.limit, save->es.base);
3353         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3354                "cs:",
3355                save->cs.selector, save->cs.attrib,
3356                save->cs.limit, save->cs.base);
3357         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3358                "ss:",
3359                save->ss.selector, save->ss.attrib,
3360                save->ss.limit, save->ss.base);
3361         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3362                "ds:",
3363                save->ds.selector, save->ds.attrib,
3364                save->ds.limit, save->ds.base);
3365         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3366                "fs:",
3367                save01->fs.selector, save01->fs.attrib,
3368                save01->fs.limit, save01->fs.base);
3369         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3370                "gs:",
3371                save01->gs.selector, save01->gs.attrib,
3372                save01->gs.limit, save01->gs.base);
3373         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3374                "gdtr:",
3375                save->gdtr.selector, save->gdtr.attrib,
3376                save->gdtr.limit, save->gdtr.base);
3377         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3378                "ldtr:",
3379                save01->ldtr.selector, save01->ldtr.attrib,
3380                save01->ldtr.limit, save01->ldtr.base);
3381         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3382                "idtr:",
3383                save->idtr.selector, save->idtr.attrib,
3384                save->idtr.limit, save->idtr.base);
3385         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3386                "tr:",
3387                save01->tr.selector, save01->tr.attrib,
3388                save01->tr.limit, save01->tr.base);
3389         pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
3390                save->vmpl, save->cpl, save->efer);
3391         pr_err("%-15s %016llx %-13s %016llx\n",
3392                "cr0:", save->cr0, "cr2:", save->cr2);
3393         pr_err("%-15s %016llx %-13s %016llx\n",
3394                "cr3:", save->cr3, "cr4:", save->cr4);
3395         pr_err("%-15s %016llx %-13s %016llx\n",
3396                "dr6:", save->dr6, "dr7:", save->dr7);
3397         pr_err("%-15s %016llx %-13s %016llx\n",
3398                "rip:", save->rip, "rflags:", save->rflags);
3399         pr_err("%-15s %016llx %-13s %016llx\n",
3400                "rsp:", save->rsp, "rax:", save->rax);
3401         pr_err("%-15s %016llx %-13s %016llx\n",
3402                "star:", save01->star, "lstar:", save01->lstar);
3403         pr_err("%-15s %016llx %-13s %016llx\n",
3404                "cstar:", save01->cstar, "sfmask:", save01->sfmask);
3405         pr_err("%-15s %016llx %-13s %016llx\n",
3406                "kernel_gs_base:", save01->kernel_gs_base,
3407                "sysenter_cs:", save01->sysenter_cs);
3408         pr_err("%-15s %016llx %-13s %016llx\n",
3409                "sysenter_esp:", save01->sysenter_esp,
3410                "sysenter_eip:", save01->sysenter_eip);
3411         pr_err("%-15s %016llx %-13s %016llx\n",
3412                "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3413         pr_err("%-15s %016llx %-13s %016llx\n",
3414                "br_from:", save->br_from, "br_to:", save->br_to);
3415         pr_err("%-15s %016llx %-13s %016llx\n",
3416                "excp_from:", save->last_excp_from,
3417                "excp_to:", save->last_excp_to);
3418 }
3419
3420 static bool svm_check_exit_valid(u64 exit_code)
3421 {
3422         return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3423                 svm_exit_handlers[exit_code]);
3424 }
3425
3426 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3427 {
3428         vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3429         dump_vmcb(vcpu);
3430         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3431         vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3432         vcpu->run->internal.ndata = 2;
3433         vcpu->run->internal.data[0] = exit_code;
3434         vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3435         return 0;
3436 }
3437
3438 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
3439 {
3440         if (!svm_check_exit_valid(exit_code))
3441                 return svm_handle_invalid_exit(vcpu, exit_code);
3442
3443 #ifdef CONFIG_RETPOLINE
3444         if (exit_code == SVM_EXIT_MSR)
3445                 return msr_interception(vcpu);
3446         else if (exit_code == SVM_EXIT_VINTR)
3447                 return interrupt_window_interception(vcpu);
3448         else if (exit_code == SVM_EXIT_INTR)
3449                 return intr_interception(vcpu);
3450         else if (exit_code == SVM_EXIT_HLT)
3451                 return kvm_emulate_halt(vcpu);
3452         else if (exit_code == SVM_EXIT_NPF)
3453                 return npf_interception(vcpu);
3454 #endif
3455         return svm_exit_handlers[exit_code](vcpu);
3456 }
3457
3458 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3459                               u64 *info1, u64 *info2,
3460                               u32 *intr_info, u32 *error_code)
3461 {
3462         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3463
3464         *reason = control->exit_code;
3465         *info1 = control->exit_info_1;
3466         *info2 = control->exit_info_2;
3467         *intr_info = control->exit_int_info;
3468         if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3469             (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3470                 *error_code = control->exit_int_info_err;
3471         else
3472                 *error_code = 0;
3473 }
3474
3475 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
3476 {
3477         struct vcpu_svm *svm = to_svm(vcpu);
3478         struct kvm_run *kvm_run = vcpu->run;
3479         u32 exit_code = svm->vmcb->control.exit_code;
3480
3481         /* SEV-ES guests must use the CR write traps to track CR registers. */
3482         if (!sev_es_guest(vcpu->kvm)) {
3483                 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3484                         vcpu->arch.cr0 = svm->vmcb->save.cr0;
3485                 if (npt_enabled)
3486                         vcpu->arch.cr3 = svm->vmcb->save.cr3;
3487         }
3488
3489         if (is_guest_mode(vcpu)) {
3490                 int vmexit;
3491
3492                 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
3493
3494                 vmexit = nested_svm_exit_special(svm);
3495
3496                 if (vmexit == NESTED_EXIT_CONTINUE)
3497                         vmexit = nested_svm_exit_handled(svm);
3498
3499                 if (vmexit == NESTED_EXIT_DONE)
3500                         return 1;
3501         }
3502
3503         if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3504                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3505                 kvm_run->fail_entry.hardware_entry_failure_reason
3506                         = svm->vmcb->control.exit_code;
3507                 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3508                 dump_vmcb(vcpu);
3509                 return 0;
3510         }
3511
3512         if (exit_fastpath != EXIT_FASTPATH_NONE)
3513                 return 1;
3514
3515         return svm_invoke_exit_handler(vcpu, exit_code);
3516 }
3517
3518 static void pre_svm_run(struct kvm_vcpu *vcpu)
3519 {
3520         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
3521         struct vcpu_svm *svm = to_svm(vcpu);
3522
3523         /*
3524          * If the previous vmrun of the vmcb occurred on a different physical
3525          * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
3526          * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3527          */
3528         if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
3529                 svm->current_vmcb->asid_generation = 0;
3530                 vmcb_mark_all_dirty(svm->vmcb);
3531                 svm->current_vmcb->cpu = vcpu->cpu;
3532         }
3533
3534         if (sev_guest(vcpu->kvm))
3535                 return pre_sev_run(svm, vcpu->cpu);
3536
3537         /* FIXME: handle wraparound of asid_generation */
3538         if (svm->current_vmcb->asid_generation != sd->asid_generation)
3539                 new_asid(svm, sd);
3540 }
3541
3542 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3543 {
3544         struct vcpu_svm *svm = to_svm(vcpu);
3545
3546         svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3547
3548         if (svm->nmi_l1_to_l2)
3549                 return;
3550
3551         svm->nmi_masked = true;
3552         svm_set_iret_intercept(svm);
3553         ++vcpu->stat.nmi_injections;
3554 }
3555
3556 static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
3557 {
3558         struct vcpu_svm *svm = to_svm(vcpu);
3559
3560         if (!is_vnmi_enabled(svm))
3561                 return false;
3562
3563         return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
3564 }
3565
3566 static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
3567 {
3568         struct vcpu_svm *svm = to_svm(vcpu);
3569
3570         if (!is_vnmi_enabled(svm))
3571                 return false;
3572
3573         if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
3574                 return false;
3575
3576         svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
3577         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
3578
3579         /*
3580          * Because the pending NMI is serviced by hardware, KVM can't know when
3581          * the NMI is "injected", but for all intents and purposes, passing the
3582          * NMI off to hardware counts as injection.
3583          */
3584         ++vcpu->stat.nmi_injections;
3585
3586         return true;
3587 }
3588
3589 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
3590 {
3591         struct vcpu_svm *svm = to_svm(vcpu);
3592         u32 type;
3593
3594         if (vcpu->arch.interrupt.soft) {
3595                 if (svm_update_soft_interrupt_rip(vcpu))
3596                         return;
3597
3598                 type = SVM_EVTINJ_TYPE_SOFT;
3599         } else {
3600                 type = SVM_EVTINJ_TYPE_INTR;
3601         }
3602
3603         trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
3604                            vcpu->arch.interrupt.soft, reinjected);
3605         ++vcpu->stat.irq_injections;
3606
3607         svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3608                                        SVM_EVTINJ_VALID | type;
3609 }
3610
3611 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3612                                      int trig_mode, int vector)
3613 {
3614         /*
3615          * apic->apicv_active must be read after vcpu->mode.
3616          * Pairs with smp_store_release in vcpu_enter_guest.
3617          */
3618         bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
3619
3620         /* Note, this is called iff the local APIC is in-kernel. */
3621         if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
3622                 /* Process the interrupt via kvm_check_and_inject_events(). */
3623                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3624                 kvm_vcpu_kick(vcpu);
3625                 return;
3626         }
3627
3628         trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3629         if (in_guest_mode) {
3630                 /*
3631                  * Signal the doorbell to tell hardware to inject the IRQ.  If
3632                  * the vCPU exits the guest before the doorbell chimes, hardware
3633                  * will automatically process AVIC interrupts at the next VMRUN.
3634                  */
3635                 avic_ring_doorbell(vcpu);
3636         } else {
3637                 /*
3638                  * Wake the vCPU if it was blocking.  KVM will then detect the
3639                  * pending IRQ when checking if the vCPU has a wake event.
3640                  */
3641                 kvm_vcpu_wake_up(vcpu);
3642         }
3643 }
3644
3645 static void svm_deliver_interrupt(struct kvm_lapic *apic,  int delivery_mode,
3646                                   int trig_mode, int vector)
3647 {
3648         kvm_lapic_set_irr(vector, apic);
3649
3650         /*
3651          * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
3652          * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3653          * the read of guest_mode.  This guarantees that either VMRUN will see
3654          * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3655          * will signal the doorbell if the CPU has already entered the guest.
3656          */
3657         smp_mb__after_atomic();
3658         svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3659 }
3660
3661 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3662 {
3663         struct vcpu_svm *svm = to_svm(vcpu);
3664
3665         /*
3666          * SEV-ES guests must always keep the CR intercepts cleared. CR
3667          * tracking is done using the CR write traps.
3668          */
3669         if (sev_es_guest(vcpu->kvm))
3670                 return;
3671
3672         if (nested_svm_virtualize_tpr(vcpu))
3673                 return;
3674
3675         svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3676
3677         if (irr == -1)
3678                 return;
3679
3680         if (tpr >= irr)
3681                 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3682 }
3683
3684 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3685 {
3686         struct vcpu_svm *svm = to_svm(vcpu);
3687
3688         if (is_vnmi_enabled(svm))
3689                 return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
3690         else
3691                 return svm->nmi_masked;
3692 }
3693
3694 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3695 {
3696         struct vcpu_svm *svm = to_svm(vcpu);
3697
3698         if (is_vnmi_enabled(svm)) {
3699                 if (masked)
3700                         svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
3701                 else
3702                         svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
3703
3704         } else {
3705                 svm->nmi_masked = masked;
3706                 if (masked)
3707                         svm_set_iret_intercept(svm);
3708                 else
3709                         svm_clr_iret_intercept(svm);
3710         }
3711 }
3712
3713 bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3714 {
3715         struct vcpu_svm *svm = to_svm(vcpu);
3716         struct vmcb *vmcb = svm->vmcb;
3717
3718         if (!gif_set(svm))
3719                 return true;
3720
3721         if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3722                 return false;
3723
3724         if (svm_get_nmi_mask(vcpu))
3725                 return true;
3726
3727         return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
3728 }
3729
3730 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3731 {
3732         struct vcpu_svm *svm = to_svm(vcpu);
3733         if (svm->nested.nested_run_pending)
3734                 return -EBUSY;
3735
3736         if (svm_nmi_blocked(vcpu))
3737                 return 0;
3738
3739         /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
3740         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3741                 return -EBUSY;
3742         return 1;
3743 }
3744
3745 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3746 {
3747         struct vcpu_svm *svm = to_svm(vcpu);
3748         struct vmcb *vmcb = svm->vmcb;
3749
3750         if (!gif_set(svm))
3751                 return true;
3752
3753         if (is_guest_mode(vcpu)) {
3754                 /* As long as interrupts are being delivered...  */
3755                 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3756                     ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
3757                     : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3758                         return true;
3759
3760                 /* ... vmexits aren't blocked by the interrupt shadow  */
3761                 if (nested_exit_on_intr(svm))
3762                         return false;
3763         } else {
3764                 if (!svm_get_if_flag(vcpu))
3765                         return true;
3766         }
3767
3768         return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3769 }
3770
3771 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3772 {
3773         struct vcpu_svm *svm = to_svm(vcpu);
3774
3775         if (svm->nested.nested_run_pending)
3776                 return -EBUSY;
3777
3778         if (svm_interrupt_blocked(vcpu))
3779                 return 0;
3780
3781         /*
3782          * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3783          * e.g. if the IRQ arrived asynchronously after checking nested events.
3784          */
3785         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3786                 return -EBUSY;
3787
3788         return 1;
3789 }
3790
3791 static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3792 {
3793         struct vcpu_svm *svm = to_svm(vcpu);
3794
3795         /*
3796          * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3797          * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3798          * get that intercept, this function will be called again though and
3799          * we'll get the vintr intercept. However, if the vGIF feature is
3800          * enabled, the STGI interception will not occur. Enable the irq
3801          * window under the assumption that the hardware will set the GIF.
3802          */
3803         if (vgif || gif_set(svm)) {
3804                 /*
3805                  * IRQ window is not needed when AVIC is enabled,
3806                  * unless we have pending ExtINT since it cannot be injected
3807                  * via AVIC. In such case, KVM needs to temporarily disable AVIC,
3808                  * and fallback to injecting IRQ via V_IRQ.
3809                  *
3810                  * If running nested, AVIC is already locally inhibited
3811                  * on this vCPU, therefore there is no need to request
3812                  * the VM wide AVIC inhibition.
3813                  */
3814                 if (!is_guest_mode(vcpu))
3815                         kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3816
3817                 svm_set_vintr(svm);
3818         }
3819 }
3820
3821 static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
3822 {
3823         struct vcpu_svm *svm = to_svm(vcpu);
3824
3825         /*
3826          * KVM should never request an NMI window when vNMI is enabled, as KVM
3827          * allows at most one to-be-injected NMI and one pending NMI, i.e. if
3828          * two NMIs arrive simultaneously, KVM will inject one and set
3829          * V_NMI_PENDING for the other.  WARN, but continue with the standard
3830          * single-step approach to try and salvage the pending NMI.
3831          */
3832         WARN_ON_ONCE(is_vnmi_enabled(svm));
3833
3834         if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
3835                 return; /* IRET will cause a vm exit */
3836
3837         /*
3838          * SEV-ES guests are responsible for signaling when a vCPU is ready to
3839          * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
3840          * KVM can't intercept and single-step IRET to detect when NMIs are
3841          * unblocked (architecturally speaking).  See SVM_VMGEXIT_NMI_COMPLETE.
3842          *
3843          * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
3844          * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
3845          * supported NAEs in the GHCB protocol.
3846          */
3847         if (sev_es_guest(vcpu->kvm))
3848                 return;
3849
3850         if (!gif_set(svm)) {
3851                 if (vgif)
3852                         svm_set_intercept(svm, INTERCEPT_STGI);
3853                 return; /* STGI will cause a vm exit */
3854         }
3855
3856         /*
3857          * Something prevents NMI from been injected. Single step over possible
3858          * problem (IRET or exception injection or interrupt shadow)
3859          */
3860         svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
3861         svm->nmi_singlestep = true;
3862         svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3863 }
3864
3865 static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
3866 {
3867         struct vcpu_svm *svm = to_svm(vcpu);
3868
3869         /*
3870          * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
3871          * A TLB flush for the current ASID flushes both "host" and "guest" TLB
3872          * entries, and thus is a superset of Hyper-V's fine grained flushing.
3873          */
3874         kvm_hv_vcpu_purge_flush_tlb(vcpu);
3875
3876         /*
3877          * Flush only the current ASID even if the TLB flush was invoked via
3878          * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
3879          * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3880          * unconditionally does a TLB flush on both nested VM-Enter and nested
3881          * VM-Exit (via kvm_mmu_reset_context()).
3882          */
3883         if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3884                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3885         else
3886                 svm->current_vmcb->asid_generation--;
3887 }
3888
3889 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
3890 {
3891         hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
3892
3893         /*
3894          * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
3895          * flush the NPT mappings via hypercall as flushing the ASID only
3896          * affects virtual to physical mappings, it does not invalidate guest
3897          * physical to host physical mappings.
3898          */
3899         if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
3900                 hyperv_flush_guest_mapping(root_tdp);
3901
3902         svm_flush_tlb_asid(vcpu);
3903 }
3904
/*
 * Flush all TLB entries for the vCPU.  Warns once and falls back to
 * hv_flush_remote_tlbs() if the Hyper-V enlightened TLB is enabled, then
 * flushes the current ASID.
 */
static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
{
        /*
         * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
         * flushes should be routed to hv_flush_remote_tlbs() without requesting
         * a "regular" remote flush.  Reaching this point means either there's
         * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
         * which might be fatal to the guest.  Yell, but try to recover.
         */
        if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
                hv_flush_remote_tlbs(vcpu->kvm);

        svm_flush_tlb_asid(vcpu);
}
3919
3920 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3921 {
3922         struct vcpu_svm *svm = to_svm(vcpu);
3923
3924         invlpga(gva, svm->vmcb->control.asid);
3925 }
3926
3927 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3928 {
3929         struct vcpu_svm *svm = to_svm(vcpu);
3930
3931         if (nested_svm_virtualize_tpr(vcpu))
3932                 return;
3933
3934         if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3935                 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3936                 kvm_set_cr8(vcpu, cr8);
3937         }
3938 }
3939
3940 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3941 {
3942         struct vcpu_svm *svm = to_svm(vcpu);
3943         u64 cr8;
3944
3945         if (nested_svm_virtualize_tpr(vcpu) ||
3946             kvm_vcpu_apicv_active(vcpu))
3947                 return;
3948
3949         cr8 = kvm_get_cr8(vcpu);
3950         svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3951         svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3952 }
3953
3954 static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
3955                                         int type)
3956 {
3957         bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
3958         bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
3959         struct vcpu_svm *svm = to_svm(vcpu);
3960
3961         /*
3962          * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
3963          * associated with the original soft exception/interrupt.  next_rip is
3964          * cleared on all exits that can occur while vectoring an event, so KVM
3965          * needs to manually set next_rip for re-injection.  Unlike the !nrips
3966          * case below, this needs to be done if and only if KVM is re-injecting
3967          * the same event, i.e. if the event is a soft exception/interrupt,
3968          * otherwise next_rip is unused on VMRUN.
3969          */
3970         if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
3971             kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
3972                 svm->vmcb->control.next_rip = svm->soft_int_next_rip;
3973         /*
3974          * If NRIPS isn't enabled, KVM must manually advance RIP prior to
3975          * injecting the soft exception/interrupt.  That advancement needs to
3976          * be unwound if vectoring didn't complete.  Note, the new event may
3977          * not be the injected event, e.g. if KVM injected an INTn, the INTn
3978          * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
3979          * be the reported vectored event, but RIP still needs to be unwound.
3980          */
3981         else if (!nrips && (is_soft || is_exception) &&
3982                  kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
3983                 kvm_rip_write(vcpu, svm->soft_int_old_rip);
3984 }
3985
3986 static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
3987 {
3988         struct vcpu_svm *svm = to_svm(vcpu);
3989         u8 vector;
3990         int type;
3991         u32 exitintinfo = svm->vmcb->control.exit_int_info;
3992         bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
3993         bool soft_int_injected = svm->soft_int_injected;
3994
3995         svm->nmi_l1_to_l2 = false;
3996         svm->soft_int_injected = false;
3997
3998         /*
3999          * If we've made progress since setting awaiting_iret_completion, we've
4000          * executed an IRET and can allow NMI injection.
4001          */
4002         if (svm->awaiting_iret_completion &&
4003             kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
4004                 svm->awaiting_iret_completion = false;
4005                 svm->nmi_masked = false;
4006                 kvm_make_request(KVM_REQ_EVENT, vcpu);
4007         }
4008
4009         vcpu->arch.nmi_injected = false;
4010         kvm_clear_exception_queue(vcpu);
4011         kvm_clear_interrupt_queue(vcpu);
4012
4013         if (!(exitintinfo & SVM_EXITINTINFO_VALID))
4014                 return;
4015
4016         kvm_make_request(KVM_REQ_EVENT, vcpu);
4017
4018         vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
4019         type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
4020
4021         if (soft_int_injected)
4022                 svm_complete_soft_interrupt(vcpu, vector, type);
4023
4024         switch (type) {
4025         case SVM_EXITINTINFO_TYPE_NMI:
4026                 vcpu->arch.nmi_injected = true;
4027                 svm->nmi_l1_to_l2 = nmi_l1_to_l2;
4028                 break;
4029         case SVM_EXITINTINFO_TYPE_EXEPT:
4030                 /*
4031                  * Never re-inject a #VC exception.
4032                  */
4033                 if (vector == X86_TRAP_VC)
4034                         break;
4035
4036                 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
4037                         u32 err = svm->vmcb->control.exit_int_info_err;
4038                         kvm_requeue_exception_e(vcpu, vector, err);
4039
4040                 } else
4041                         kvm_requeue_exception(vcpu, vector);
4042                 break;
4043         case SVM_EXITINTINFO_TYPE_INTR:
4044                 kvm_queue_interrupt(vcpu, vector, false);
4045                 break;
4046         case SVM_EXITINTINFO_TYPE_SOFT:
4047                 kvm_queue_interrupt(vcpu, vector, true);
4048                 break;
4049         default:
4050                 break;
4051         }
4052
4053 }
4054
4055 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
4056 {
4057         struct vcpu_svm *svm = to_svm(vcpu);
4058         struct vmcb_control_area *control = &svm->vmcb->control;
4059
4060         control->exit_int_info = control->event_inj;
4061         control->exit_int_info_err = control->event_inj_err;
4062         control->event_inj = 0;
4063         svm_complete_interrupts(vcpu);
4064 }
4065
/*
 * Pre-run hook; SVM has nothing to check here and unconditionally returns 1.
 * NOTE(review): presumably a non-zero return tells the caller to proceed with
 * the run - confirm against the kvm_x86_ops contract.
 */
static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
        return 1;
}
4070
4071 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
4072 {
4073         if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
4074             to_svm(vcpu)->vmcb->control.exit_info_1)
4075                 return handle_fastpath_set_msr_irqoff(vcpu);
4076
4077         return EXIT_FASTPATH_NONE;
4078 }
4079
4080 static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
4081 {
4082         struct vcpu_svm *svm = to_svm(vcpu);
4083
4084         guest_state_enter_irqoff();
4085
4086         amd_clear_divider();
4087
4088         if (sev_es_guest(vcpu->kvm))
4089                 __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
4090         else
4091                 __svm_vcpu_run(svm, spec_ctrl_intercepted);
4092
4093         guest_state_exit_irqoff();
4094 }
4095
/*
 * svm_vcpu_run() - perform one guest entry/exit cycle for @vcpu.
 *
 * Before entry, RAX, RSP and RIP are copied from vcpu->arch.regs into the
 * VMCB save area, and the ASID, CR2 and DR6 values in the VMCB are brought
 * up to date.  Between clgi() and stgi(), guest XSAVE state is loaded, the
 * guest's SPEC_CTRL is installed (only when X86_FEATURE_V_SPEC_CTRL is not
 * available), and the guest is run via svm_vcpu_enter_exit().
 *
 * After the exit the host state is restored, the three registers and CR2 are
 * copied back from the VMCB (skipped for SEV-ES guests), the VMCB's TLB
 * control and clean bits are reset, a machine-check intercept is handled
 * immediately, and svm_complete_interrupts() re-queues any event that was
 * being delivered when the exit occurred.
 *
 * Return: EXIT_FASTPATH_NONE if the vCPU is in guest mode after the exit,
 * otherwise whatever svm_exit_handlers_fastpath() returns.
 */
4096 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
4097 {
4098         struct vcpu_svm *svm = to_svm(vcpu);
4099         bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
4100
4101         trace_kvm_entry(vcpu);
4102
4103         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4104         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4105         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4106
4107         /*
4108          * Disable singlestep if we're injecting an interrupt/exception.
4109          * We don't want our modified rflags to be pushed on the stack where
4110          * we might not be able to easily reset them if we disabled NMI
4111          * singlestep later.
4112          */
4113         if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
4114                 /*
4115                  * Event injection happens before external interrupts cause a
4116                  * vmexit and interrupts are disabled here, so smp_send_reschedule
4117                  * is enough to force an immediate vmexit.
4118                  */
4119                 disable_nmi_singlestep(svm);
4120                 smp_send_reschedule(vcpu->cpu);
4121         }
4122
4123         pre_svm_run(vcpu);
4124
4125         sync_lapic_to_cr8(vcpu);
4126
4127         if (unlikely(svm->asid != svm->vmcb->control.asid)) {
4128                 svm->vmcb->control.asid = svm->asid;
4129                 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
4130         }
4131         svm->vmcb->save.cr2 = vcpu->arch.cr2;
4132
4133         svm_hv_update_vp_id(svm->vmcb, vcpu);
4134
4135         /*
4136          * Run with all-zero DR6 unless needed, so that we can get the exact cause
4137          * of a #DB.
4138          */
4139         if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
4140                 svm_set_dr6(svm, vcpu->arch.dr6);
4141         else
4142                 svm_set_dr6(svm, DR6_ACTIVE_LOW);
4143
4144         clgi();
4145         kvm_load_guest_xsave_state(vcpu);
4146
4147         kvm_wait_lapic_expire(vcpu);
4148
4149         /*
4150          * If this vCPU has touched SPEC_CTRL, restore the guest's value if
4151          * it's non-zero. Since vmentry is serialising on affected CPUs, there
4152          * is no need to worry about the conditional branch over the wrmsr
4153          * being speculatively taken.
4154          */
4155         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4156                 x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
4157
4158         svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
4159
4160         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4161                 x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
4162
4163         if (!sev_es_guest(vcpu->kvm)) {
4164                 vcpu->arch.cr2 = svm->vmcb->save.cr2;
4165                 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4166                 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4167                 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4168         }
4169         vcpu->arch.regs_dirty = 0;
4170
4171         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4172                 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
4173
4174         kvm_load_host_xsave_state(vcpu);
4175         stgi();
4176
4177         /* Any pending NMI will happen here */
4178
4179         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4180                 kvm_after_interrupt(vcpu);
4181
4182         sync_cr8_to_lapic(vcpu);
4183
4184         svm->next_rip = 0;
4185         if (is_guest_mode(vcpu)) {
4186                 nested_sync_control_from_vmcb02(svm);
4187
4188                 /* Track VMRUNs that have made it past consistency checking */
4189                 if (svm->nested.nested_run_pending &&
4190                     svm->vmcb->control.exit_code != SVM_EXIT_ERR)
4191                         ++vcpu->stat.nested_run;
4192
4193                 svm->nested.nested_run_pending = 0;
4194         }
4195
4196         svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4197         vmcb_mark_all_clean(svm->vmcb);
4198
4199         /* If the exit was caused by a #PF intercept, latch the async PF flags. */
4200         if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4201                 vcpu->arch.apf.host_apf_flags =
4202                         kvm_read_and_reset_apf_flags();
4203
4204         vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
4205
4206         /*
4207          * We need to handle MC intercepts here before the vcpu has a chance to
4208          * change the physical cpu
4209          */
4210         if (unlikely(svm->vmcb->control.exit_code ==
4211                      SVM_EXIT_EXCP_BASE + MC_VECTOR))
4212                 svm_handle_mce(vcpu);
4213
4214         trace_kvm_exit(vcpu, KVM_ISA_SVM);
4215
4216         svm_complete_interrupts(vcpu);
4217
4218         if (is_guest_mode(vcpu))
4219                 return EXIT_FASTPATH_NONE;
4220
4221         return svm_exit_handlers_fastpath(vcpu);
4222 }
4223
4224 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
4225                              int root_level)
4226 {
4227         struct vcpu_svm *svm = to_svm(vcpu);
4228         unsigned long cr3;
4229
4230         if (npt_enabled) {
4231                 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
4232                 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
4233
4234                 hv_track_root_tdp(vcpu, root_hpa);
4235
4236                 cr3 = vcpu->arch.cr3;
4237         } else if (root_level >= PT64_ROOT_4LEVEL) {
4238                 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
4239         } else {
4240                 /* PCID in the guest should be impossible with a 32-bit MMU. */
4241                 WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
4242                 cr3 = root_hpa;
4243         }
4244
4245         svm->vmcb->save.cr3 = cr3;
4246         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
4247 }
4248
/*
 * Write the three-byte VMMCALL opcode (0f 01 d9) into @hypercall.
 * @vcpu is not used.  @hypercall must have room for at least three bytes.
 */
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
        static const unsigned char vmmcall_insn[] = { 0x0f, 0x01, 0xd9 };
        unsigned int i;

        for (i = 0; i < sizeof(vmmcall_insn); i++)
                hypercall[i] = vmmcall_insn[i];
}
4259
4260 /*
4261  * The kvm parameter can be NULL (module initialization, or invocation before
4262  * VM creation). Be sure to check the kvm parameter before using it.
4263  */
4264 static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
4265 {
4266         switch (index) {
4267         case MSR_IA32_MCG_EXT_CTL:
4268         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
4269                 return false;
4270         case MSR_IA32_SMBASE:
4271                 if (!IS_ENABLED(CONFIG_KVM_SMM))
4272                         return false;
4273                 /* SEV-ES guests do not support SMM, so report false */
4274                 if (kvm && sev_es_guest(kvm))
4275                         return false;
4276                 break;
4277         default:
4278                 break;
4279         }
4280
4281         return true;
4282 }
4283
4284 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
4285 {
4286         struct vcpu_svm *svm = to_svm(vcpu);
4287         struct kvm_cpuid_entry2 *best;
4288
4289         /*
4290          * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
4291          * can only disable all variants of by disallowing CR4.OSXSAVE from
4292          * being set.  As a result, if the host has XSAVE and XSAVES, and the
4293          * guest has XSAVE enabled, the guest can execute XSAVES without
4294          * faulting.  Treat XSAVES as enabled in this case regardless of
4295          * whether it's advertised to the guest so that KVM context switches
4296          * XSS on VM-Enter/VM-Exit.  Failure to do so would effectively give
4297          * the guest read/write access to the host's XSS.
4298          */
4299         if (boot_cpu_has(X86_FEATURE_XSAVE) &&
4300             boot_cpu_has(X86_FEATURE_XSAVES) &&
4301             guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
4302                 kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
4303
4304         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
4305         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
4306         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
4307
4308         /*
4309          * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
4310          * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
4311          * SVM on Intel is bonkers and extremely unlikely to work).
4312          */
4313         if (!guest_cpuid_is_intel(vcpu))
4314                 kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4315
4316         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
4317         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
4318         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF);
4319         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI);
4320
4321         svm_recalc_instruction_intercepts(vcpu, svm);
4322
4323         if (boot_cpu_has(X86_FEATURE_IBPB))
4324                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
4325                                      !!guest_has_pred_cmd_msr(vcpu));
4326
4327         if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
4328                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
4329                                      !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
4330
4331         /* For sev guests, the memory encryption bit is not reserved in CR3.  */
4332         if (sev_guest(vcpu->kvm)) {
4333                 best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
4334                 if (best)
4335                         vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4336         }
4337
4338         init_vmcb_after_set_cpuid(vcpu);
4339 }
4340
/* SVM unconditionally reports that WBINVD exiting is available. */
static bool svm_has_wbinvd_exit(void)
{
        return true;
}
4345
/*
 * Table mapping each emulator intercept id (x86_intercept_*, used as the
 * array index) to the SVM exit code associated with it and to the emulation
 * stage at which the intercept is checked.  svm_check_intercept() looks
 * entries up by info->intercept and, for the CR/DR entries, adds the
 * register number to the base exit code stored here.
 *
 * PRE_EX(), POST_EX() and POST_MEM() are local helpers that build one entry
 * for the X86_ICPT_PRE_EXCEPT, X86_ICPT_POST_EXCEPT and
 * X86_ICPT_POST_MEMACCESS stages respectively; they are #undef'ed right
 * after the table.
 */
4346 #define PRE_EX(exit)  { .exit_code = (exit), \
4347                         .stage = X86_ICPT_PRE_EXCEPT, }
4348 #define POST_EX(exit) { .exit_code = (exit), \
4349                         .stage = X86_ICPT_POST_EXCEPT, }
4350 #define POST_MEM(exit) { .exit_code = (exit), \
4351                         .stage = X86_ICPT_POST_MEMACCESS, }
4352
4353 static const struct __x86_intercept {
4354         u32 exit_code;
4355         enum x86_intercept_stage stage;
4356 } x86_intercept_map[] = {
4357         [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
4358         [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
4359         [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
4360         [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
4361         [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
4362         [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
4363         [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
4364         [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
4365         [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
4366         [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
4367         [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
4368         [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
4369         [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
4370         [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
4371         [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
4372         [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
4373         [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
4374         [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
4375         [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
4376         [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
4377         [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
4378         [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
4379         [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
4380         [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
4381         [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
4382         [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
4383         [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
4384         [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
4385         [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
4386         [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
4387         [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
4388         [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
4389         [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
4390         [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
4391         [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
4392         [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
4393         [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
4394         [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
4395         [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
4396         [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
4397         [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
4398         [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
4399         [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
4400         [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
4401         [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
4402         [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
4403         [x86_intercept_xsetbv]          = PRE_EX(SVM_EXIT_XSETBV),
4404 };
4405
4406 #undef PRE_EX
4407 #undef POST_EX
4408 #undef POST_MEM
4409
/*
 * svm_check_intercept() - check an emulated instruction against the nested
 * intercepts.
 * @vcpu:      vCPU on whose behalf the instruction is being emulated
 * @info:      emulator's description of the instruction (intercept id,
 *             ModRM register, operand values/sizes, prefixes, next RIP)
 * @stage:     emulation stage at which this check is being made
 * @exception: not used by this function
 *
 * The intercept id is looked up in x86_intercept_map[].  Ids beyond the end
 * of the table, and entries whose stage differs from @stage, are not checked
 * at all.  Otherwise the exit code is refined where needed (CR/DR number,
 * selective CR0 write), exit_info_1/exit_info_2 are filled in for MSR and
 * I/O accesses, the result is stored in the current VMCB's control area, and
 * nested_svm_exit_handled() decides the outcome.
 *
 * Return: X86EMUL_INTERCEPTED if nested_svm_exit_handled() returns
 * NESTED_EXIT_DONE, X86EMUL_CONTINUE in every other case.
 */
4410 static int svm_check_intercept(struct kvm_vcpu *vcpu,
4411                                struct x86_instruction_info *info,
4412                                enum x86_intercept_stage stage,
4413                                struct x86_exception *exception)
4414 {
4415         struct vcpu_svm *svm = to_svm(vcpu);
4416         int vmexit, ret = X86EMUL_CONTINUE;
4417         struct __x86_intercept icpt_info;
4418         struct vmcb *vmcb = svm->vmcb;
4419
4420         if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4421                 goto out;
4422
4423         icpt_info = x86_intercept_map[info->intercept];
4424
4425         if (stage != icpt_info.stage)
4426                 goto out;
4427
4428         switch (icpt_info.exit_code) {
4429         case SVM_EXIT_READ_CR0:
4430                 if (info->intercept == x86_intercept_cr_read)
4431                         icpt_info.exit_code += info->modrm_reg;
4432                 break;
4433         case SVM_EXIT_WRITE_CR0: {
4434                 unsigned long cr0, val;
4435
4436                 if (info->intercept == x86_intercept_cr_write)
4437                         icpt_info.exit_code += info->modrm_reg;
4438
                     /* Only CR0 writes other than CLTS can be "selective". */
4439                 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4440                     info->intercept == x86_intercept_clts)
4441                         break;
4442
4443                 if (!(vmcb12_is_intercept(&svm->nested.ctl,
4444                                         INTERCEPT_SELECTIVE_CR0)))
4445                         break;
4446
                     /*
                      * Compare old and new CR0 with the SVM_CR0_SELECTIVE_MASK
                      * bits ignored; any remaining difference turns this into a
                      * selective CR0 write exit.
                      */
4447                 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4448                 val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4449
4450                 if (info->intercept == x86_intercept_lmsw) {
4451                         cr0 &= 0xfUL;
4452                         val &= 0xfUL;
4453                         /* lmsw can't clear PE - catch this here */
4454                         if (cr0 & X86_CR0_PE)
4455                                 val |= X86_CR0_PE;
4456                 }
4457
4458                 if (cr0 ^ val)
4459                         icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4460
4461                 break;
4462         }
4463         case SVM_EXIT_READ_DR0:
4464         case SVM_EXIT_WRITE_DR0:
4465                 icpt_info.exit_code += info->modrm_reg;
4466                 break;
4467         case SVM_EXIT_MSR:
                     /* exit_info_1 distinguishes WRMSR (1) from RDMSR (0). */
4468                 if (info->intercept == x86_intercept_wrmsr)
4469                         vmcb->control.exit_info_1 = 1;
4470                 else
4471                         vmcb->control.exit_info_1 = 0;
4472                 break;
4473         case SVM_EXIT_PAUSE:
4474                 /*
4475                  * The emulator flags this for the NOP opcode; PAUSE is
4476                  * "rep nop", so only intercept when the REP prefix is set.
4477                  */
4478                 if (info->rep_prefix != REPE_PREFIX)
4479                         goto out;
4480                 break;
4481         case SVM_EXIT_IOIO: {
4482                 u64 exit_info;
4483                 u32 bytes;
4484
                     /* Port number goes in bits 31:16; IN/INS also set the type bit. */
4485                 if (info->intercept == x86_intercept_in ||
4486                     info->intercept == x86_intercept_ins) {
4487                         exit_info = ((info->src_val & 0xffff) << 16) |
4488                                 SVM_IOIO_TYPE_MASK;
4489                         bytes = info->dst_bytes;
4490                 } else {
4491                         exit_info = (info->dst_val & 0xffff) << 16;
4492                         bytes = info->src_bytes;
4493                 }
4494
4495                 if (info->intercept == x86_intercept_outs ||
4496                     info->intercept == x86_intercept_ins)
4497                         exit_info |= SVM_IOIO_STR_MASK;
4498
4499                 if (info->rep_prefix)
4500                         exit_info |= SVM_IOIO_REP_MASK;
4501
4502                 bytes = min(bytes, 4u);
4503
4504                 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4505
                     /*
                      * NOTE(review): presumably ad_bytes is 2, 4 or 8, so that
                      * shifting by (SVM_IOIO_ASIZE_SHIFT - 1) sets the matching
                      * one-hot address-size bit - confirm against the APM.
                      */
4506                 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4507
4508                 vmcb->control.exit_info_1 = exit_info;
4509                 vmcb->control.exit_info_2 = info->next_rip;
4510
4511                 break;
4512         }
4513         default:
4514                 break;
4515         }
4516
4517         /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4518         if (static_cpu_has(X86_FEATURE_NRIPS))
4519                 vmcb->control.next_rip  = info->next_rip;
4520         vmcb->control.exit_code = icpt_info.exit_code;
4521         vmexit = nested_svm_exit_handled(svm);
4522
4523         ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4524                                            : X86EMUL_CONTINUE;
4525
4526 out:
4527         return ret;
4528 }
4529
4530 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4531 {
4532         if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
4533                 vcpu->arch.at_instruction_boundary = true;
4534 }
4535
4536 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4537 {
4538         if (!kvm_pause_in_guest(vcpu->kvm))
4539                 shrink_ple_window(vcpu);
4540 }
4541
4542 static void svm_setup_mce(struct kvm_vcpu *vcpu)
4543 {
4544         /* [63:9] are reserved. */
4545         vcpu->arch.mcg_cap &= 0x1ff;
4546 }
4547
4548 #ifdef CONFIG_KVM_SMM
/*
 * Return true if an SMI cannot be taken right now: either GIF is clear
 * (per APM Vol.2 15.22.2 "Response to SMI") or the vCPU is already in SMM.
 */
bool svm_smi_blocked(struct kvm_vcpu *vcpu)
{
        return !gif_set(to_svm(vcpu)) || is_smm(vcpu);
}
4559
4560 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4561 {
4562         struct vcpu_svm *svm = to_svm(vcpu);
4563         if (svm->nested.nested_run_pending)
4564                 return -EBUSY;
4565
4566         if (svm_smi_blocked(vcpu))
4567                 return 0;
4568
4569         /* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
4570         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4571                 return -EBUSY;
4572
4573         return 1;
4574 }
4575
4576 static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
4577 {
4578         struct vcpu_svm *svm = to_svm(vcpu);
4579         struct kvm_host_map map_save;
4580         int ret;
4581
4582         if (!is_guest_mode(vcpu))
4583                 return 0;
4584
4585         /*
4586          * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
4587          * responsible for ensuring nested SVM and SMIs are mutually exclusive.
4588          */
4589
4590         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4591                 return 1;
4592
4593         smram->smram64.svm_guest_flag = 1;
4594         smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
4595
4596         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4597         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4598         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4599
4600         ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
4601         if (ret)
4602                 return ret;
4603
4604         /*
4605          * KVM uses VMCB01 to store L1 host state while L2 runs but
4606          * VMCB01 is going to be used during SMM and thus the state will
4607          * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save
4608          * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
4609          * format of the area is identical to guest save area offsetted
4610          * by 0x400 (matches the offset of 'struct vmcb_save_area'
4611          * within 'struct vmcb'). Note: HSAVE area may also be used by
4612          * L1 hypervisor to save additional host context (e.g. KVM does
4613          * that, see svm_prepare_switch_to_guest()) which must be
4614          * preserved.
4615          */
4616         if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4617                 return 1;
4618
4619         BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4620
4621         svm_copy_vmrun_state(map_save.hva + 0x400,
4622                              &svm->vmcb01.ptr->save);
4623
4624         kvm_vcpu_unmap(vcpu, &map_save, true);
4625         return 0;
4626 }
4627
4628 static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
4629 {
4630         struct vcpu_svm *svm = to_svm(vcpu);
4631         struct kvm_host_map map, map_save;
4632         struct vmcb *vmcb12;
4633         int ret;
4634
4635         const struct kvm_smram_state_64 *smram64 = &smram->smram64;
4636
4637         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4638                 return 0;
4639
4640         /* Non-zero if SMI arrived while vCPU was in guest mode. */
4641         if (!smram64->svm_guest_flag)
4642                 return 0;
4643
4644         if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4645                 return 1;
4646
4647         if (!(smram64->efer & EFER_SVME))
4648                 return 1;
4649
4650         if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
4651                 return 1;
4652
4653         ret = 1;
4654         if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4655                 goto unmap_map;
4656
4657         if (svm_allocate_nested(svm))
4658                 goto unmap_save;
4659
4660         /*
4661          * Restore L1 host state from L1 HSAVE area as VMCB01 was
4662          * used during SMM (see svm_enter_smm())
4663          */
4664
4665         svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4666
4667         /*
4668          * Enter the nested guest now
4669          */
4670
4671         vmcb_mark_all_dirty(svm->vmcb01.ptr);
4672
4673         vmcb12 = map.hva;
4674         nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4675         nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4676         ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
4677
4678         if (ret)
4679                 goto unmap_save;
4680
4681         svm->nested.nested_run_pending = 1;
4682
4683 unmap_save:
4684         kvm_vcpu_unmap(vcpu, &map_save, true);
4685 unmap_map:
4686         kvm_vcpu_unmap(vcpu, &map, true);
4687         return ret;
4688 }
4689
4690 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4691 {
4692         struct vcpu_svm *svm = to_svm(vcpu);
4693
4694         if (!gif_set(svm)) {
4695                 if (vgif)
4696                         svm_set_intercept(svm, INTERCEPT_STGI);
4697                 /* STGI will cause a vm exit */
4698         } else {
4699                 /* We must be in SMM; RSM will cause a vmexit anyway.  */
4700         }
4701 }
4702 #endif
4703
4704 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4705                                         void *insn, int insn_len)
4706 {
4707         bool smep, smap, is_user;
4708         u64 error_code;
4709
4710         /* Emulation is always possible when KVM has access to all guest state. */
4711         if (!sev_guest(vcpu->kvm))
4712                 return true;
4713
4714         /* #UD and #GP should never be intercepted for SEV guests. */
4715         WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4716                                   EMULTYPE_TRAP_UD_FORCED |
4717                                   EMULTYPE_VMWARE_GP));
4718
4719         /*
4720          * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4721          * to guest register state.
4722          */
4723         if (sev_es_guest(vcpu->kvm))
4724                 return false;
4725
4726         /*
4727          * Emulation is possible if the instruction is already decoded, e.g.
4728          * when completing I/O after returning from userspace.
4729          */
4730         if (emul_type & EMULTYPE_NO_DECODE)
4731                 return true;
4732
4733         /*
4734          * Emulation is possible for SEV guests if and only if a prefilled
4735          * buffer containing the bytes of the intercepted instruction is
4736          * available. SEV guest memory is encrypted with a guest specific key
4737          * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
4738          * decode garbage.
4739          *
4740          * If KVM is NOT trying to simply skip an instruction, inject #UD if
4741          * KVM reached this point without an instruction buffer.  In practice,
4742          * this path should never be hit by a well-behaved guest, e.g. KVM
4743          * doesn't intercept #UD or #GP for SEV guests, but this path is still
4744          * theoretically reachable, e.g. via unaccelerated fault-like AVIC
4745          * access, and needs to be handled by KVM to avoid putting the guest
4746          * into an infinite loop.   Injecting #UD is somewhat arbitrary, but
4747          * it's the least awful option given lack of insight into the guest.
4748          *
4749          * If KVM is trying to skip an instruction, simply resume the guest.
4750          * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
4751          * will attempt to re-inject the INT3/INTO and skip the instruction.
4752          * In that scenario, retrying the INT3/INTO and hoping the guest will
4753          * make forward progress is the only option that has a chance of
4754          * success (and in practice it will work the vast majority of the time).
4755          */
4756         if (unlikely(!insn)) {
4757                 if (!(emul_type & EMULTYPE_SKIP))
4758                         kvm_queue_exception(vcpu, UD_VECTOR);
4759                 return false;
4760         }
4761
4762         /*
4763          * Emulate for SEV guests if the insn buffer is not empty.  The buffer
4764          * will be empty if the DecodeAssist microcode cannot fetch bytes for
4765          * the faulting instruction because the code fetch itself faulted, e.g.
4766          * the guest attempted to fetch from emulated MMIO or a guest page
4767          * table used to translate CS:RIP resides in emulated MMIO.
4768          */
4769         if (likely(insn_len))
4770                 return true;
4771
4772         /*
4773          * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4774          *
4775          * Errata:
4776          * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4777          * possible that CPU microcode implementing DecodeAssist will fail to
4778          * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4779          * be '0'.  This happens because microcode reads CS:RIP using a _data_
4780          * load uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
4781          * gives up and does not fill the instruction bytes buffer.
4782          *
4783          * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4784          * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4785          * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4786          * GuestIntrBytes field of the VMCB.
4787          *
4788          * This does _not_ mean that the erratum has been encountered, as the
4789          * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4790          * #PF, e.g. if the guest attempts to execute from emulated MMIO and
4791          * encountered a reserved/not-present #PF.
4792          *
4793          * To hit the erratum, the following conditions must be true:
4794          *    1. CR4.SMAP=1 (obviously).
4795          *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
4796          *       have been hit as the guest would have encountered a SMEP
4797          *       violation #PF, not a #NPF.
4798          *    3. The #NPF is not due to a code fetch, in which case failure to
4799          *       retrieve the instruction bytes is legitimate (see above).
4800          *
4801          * In addition, don't apply the erratum workaround if the #NPF occurred
4802          * while translating guest page tables (see below).
4803          */
4804         error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4805         if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4806                 goto resume_guest;
4807
4808         smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
4809         smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
4810         is_user = svm_get_cpl(vcpu) == 3;
4811         if (smap && (!smep || is_user)) {
4812                 pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
4813
4814                 /*
4815                  * If the fault occurred in userspace, arbitrarily inject #GP
4816                  * to avoid killing the guest and to hopefully avoid confusing
4817                  * the guest kernel too much, e.g. injecting #PF would not be
4818                  * coherent with respect to the guest's page tables.  Request
4819                  * triple fault if the fault occurred in the kernel as there's
4820                  * no fault that KVM can inject without confusing the guest.
4821                  * In practice, the triple fault is moot as no sane SEV kernel
4822                  * will execute from user memory while also running with SMAP=1.
4823                  */
4824                 if (is_user)
4825                         kvm_inject_gp(vcpu, 0);
4826                 else
4827                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4828         }
4829
4830 resume_guest:
4831         /*
4832          * If the erratum was not hit, simply resume the guest and let it fault
4833          * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
4834          * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
4835          * userspace will kill the guest, and letting the emulator read garbage
4836          * will yield random behavior and potentially corrupt the guest.
4837          *
4838          * Simply resuming the guest is technically not a violation of the SEV
4839          * architecture.  AMD's APM states that all code fetches and page table
4840          * accesses for SEV guest are encrypted, regardless of the C-Bit.  The
4841          * APM also states that encrypted accesses to MMIO are "ignored", but
4842          * doesn't explicitly define "ignored", i.e. doing nothing and letting
4843          * the guest spin is technically "ignoring" the access.
4844          */
4845         return false;
4846 }
4847
/*
 * Installed as svm_x86_ops.apic_init_signal_blocked.
 *
 * Returns true when gif_set() reports false for this vCPU's SVM state,
 * and false otherwise.
 */
4848 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4849 {
4850         bool gif_is_set = gif_set(to_svm(vcpu));
4851
4852         return !gif_is_set;
4853 }
4854
/*
 * Installed as svm_x86_ops.vcpu_deliver_sipi_vector.
 *
 * Hands the SIPI @vector to sev_vcpu_deliver_sipi_vector() when the VM is
 * an SEV-ES guest (per sev_es_guest()), and to the common
 * kvm_vcpu_deliver_sipi_vector() otherwise.  Exactly one of the two is
 * called.  The function is void, so neither call is used as a return
 * expression.
 */
4855 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4856 {
4857         if (sev_es_guest(vcpu->kvm))
4858                 sev_vcpu_deliver_sipi_vector(vcpu, vector);
4859         else
4860                 kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4861 }
4862
/*
 * Installed as svm_x86_ops.vm_destroy.
 *
 * Per-VM teardown: calls avic_vm_destroy() and then sev_vm_destroy() on
 * @kvm, in that order.
 */
4863 static void svm_vm_destroy(struct kvm *kvm)
4864 {
4865         avic_vm_destroy(kvm);
4866         sev_vm_destroy(kvm);
4867 }
4868
/*
 * Installed as svm_x86_ops.vm_init.
 *
 * Sets kvm->arch.pause_in_guest when either pause_filter_count or
 * pause_filter_thresh is zero, then runs avic_vm_init() if enable_apicv
 * is set.
 *
 * Returns the value of avic_vm_init() when it is called, 0 otherwise.
 */
4869 static int svm_vm_init(struct kvm *kvm)
4870 {
4871         int err = 0;
4872
4873         if (!(pause_filter_count && pause_filter_thresh))
4874                 kvm->arch.pause_in_guest = true;
4875
4876         if (enable_apicv)
4877                 err = avic_vm_init(kvm);
4878
4879         /* Zero when AVIC is disabled or its per-VM setup succeeded. */
4880         return err;
4881 }
4882
/*
 * Table of the SVM implementations of the kvm_x86_ops callbacks.  It is
 * referenced from svm_init_ops.runtime_ops, which svm_init() passes to
 * kvm_x86_vendor_init().
 *
 * The table is not const: svm_hardware_setup() rewrites a handful of
 * entries once it knows which features are usable (see the notes on the
 * individual members below).
 *
 * NOTE(review): the object is __initdata, so the common x86 code is
 * presumably expected to copy it during init rather than keep a pointer;
 * confirm against kvm_x86_vendor_init().
 */
4883 static struct kvm_x86_ops svm_x86_ops __initdata = {
4884         .name = KBUILD_MODNAME,
4885
4886         .check_processor_compatibility = svm_check_processor_compat,
4887
4888         .hardware_unsetup = svm_hardware_unsetup,
4889         .hardware_enable = svm_hardware_enable,
4890         .hardware_disable = svm_hardware_disable,
4891         .has_emulated_msr = svm_has_emulated_msr,
4892
4893         .vcpu_create = svm_vcpu_create,
4894         .vcpu_free = svm_vcpu_free,
4895         .vcpu_reset = svm_vcpu_reset,
4896
4897         .vm_size = sizeof(struct kvm_svm),
4898         .vm_init = svm_vm_init,
4899         .vm_destroy = svm_vm_destroy,
4900
4901         .prepare_switch_to_guest = svm_prepare_switch_to_guest,
4902         .vcpu_load = svm_vcpu_load,
4903         .vcpu_put = svm_vcpu_put,
             /* Both reset to NULL by svm_hardware_setup() if enable_apicv ends up false. */
4904         .vcpu_blocking = avic_vcpu_blocking,
4905         .vcpu_unblocking = avic_vcpu_unblocking,
4906
4907         .update_exception_bitmap = svm_update_exception_bitmap,
4908         .get_msr_feature = svm_get_msr_feature,
4909         .get_msr = svm_get_msr,
4910         .set_msr = svm_set_msr,
4911         .get_segment_base = svm_get_segment_base,
4912         .get_segment = svm_get_segment,
4913         .set_segment = svm_set_segment,
4914         .get_cpl = svm_get_cpl,
4915         .get_cs_db_l_bits = svm_get_cs_db_l_bits,
4916         .is_valid_cr0 = svm_is_valid_cr0,
4917         .set_cr0 = svm_set_cr0,
4918         .post_set_cr3 = sev_post_set_cr3,
4919         .is_valid_cr4 = svm_is_valid_cr4,
4920         .set_cr4 = svm_set_cr4,
4921         .set_efer = svm_set_efer,
4922         .get_idt = svm_get_idt,
4923         .set_idt = svm_set_idt,
4924         .get_gdt = svm_get_gdt,
4925         .set_gdt = svm_set_gdt,
4926         .set_dr7 = svm_set_dr7,
4927         .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4928         .cache_reg = svm_cache_reg,
4929         .get_rflags = svm_get_rflags,
4930         .set_rflags = svm_set_rflags,
4931         .get_if_flag = svm_get_if_flag,
4932
4933         .flush_tlb_all = svm_flush_tlb_all,
4934         .flush_tlb_current = svm_flush_tlb_current,
4935         .flush_tlb_gva = svm_flush_tlb_gva,
4936         .flush_tlb_guest = svm_flush_tlb_asid,
4937
4938         .vcpu_pre_run = svm_vcpu_pre_run,
4939         .vcpu_run = svm_vcpu_run,
4940         .handle_exit = svm_handle_exit,
4941         .skip_emulated_instruction = svm_skip_emulated_instruction,
             /* Explicitly left unimplemented for SVM. */
4942         .update_emulated_instruction = NULL,
4943         .set_interrupt_shadow = svm_set_interrupt_shadow,
4944         .get_interrupt_shadow = svm_get_interrupt_shadow,
4945         .patch_hypercall = svm_patch_hypercall,
4946         .inject_irq = svm_inject_irq,
4947         .inject_nmi = svm_inject_nmi,
             /* Both reset to NULL by svm_hardware_setup() if vnmi ends up false. */
4948         .is_vnmi_pending = svm_is_vnmi_pending,
4949         .set_vnmi_pending = svm_set_vnmi_pending,
4950         .inject_exception = svm_inject_exception,
4951         .cancel_injection = svm_cancel_injection,
4952         .interrupt_allowed = svm_interrupt_allowed,
4953         .nmi_allowed = svm_nmi_allowed,
4954         .get_nmi_mask = svm_get_nmi_mask,
4955         .set_nmi_mask = svm_set_nmi_mask,
4956         .enable_nmi_window = svm_enable_nmi_window,
4957         .enable_irq_window = svm_enable_irq_window,
4958         .update_cr8_intercept = svm_update_cr8_intercept,
4959         .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
4960         .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
4961         .apicv_post_state_restore = avic_apicv_post_state_restore,
4962         .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
4963
4964         .get_exit_info = svm_get_exit_info,
4965
4966         .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4967
4968         .has_wbinvd_exit = svm_has_wbinvd_exit,
4969
4970         .get_l2_tsc_offset = svm_get_l2_tsc_offset,
4971         .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
4972         .write_tsc_offset = svm_write_tsc_offset,
4973         .write_tsc_multiplier = svm_write_tsc_multiplier,
4974
4975         .load_mmu_pgd = svm_load_mmu_pgd,
4976
4977         .check_intercept = svm_check_intercept,
4978         .handle_exit_irqoff = svm_handle_exit_irqoff,
4979
4980         .request_immediate_exit = __kvm_request_immediate_exit,
4981
4982         .sched_in = svm_sched_in,
4983
4984         .nested_ops = &svm_nested_ops,
4985
4986         .deliver_interrupt = svm_deliver_interrupt,
4987         .pi_update_irte = avic_pi_update_irte,
4988         .setup_mce = svm_setup_mce,
4989
             /* The SMM callbacks are only wired up when CONFIG_KVM_SMM is defined. */
4990 #ifdef CONFIG_KVM_SMM
4991         .smi_allowed = svm_smi_allowed,
4992         .enter_smm = svm_enter_smm,
4993         .leave_smm = svm_leave_smm,
4994         .enable_smi_window = svm_enable_smi_window,
4995 #endif
4996
4997         .mem_enc_ioctl = sev_mem_enc_ioctl,
4998         .mem_enc_register_region = sev_mem_enc_register_region,
4999         .mem_enc_unregister_region = sev_mem_enc_unregister_region,
5000         .guest_memory_reclaimed = sev_guest_memory_reclaimed,
5001
5002         .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
5003         .vm_move_enc_context_from = sev_vm_move_enc_context_from,
5004
5005         .can_emulate_instruction = svm_can_emulate_instruction,
5006
5007         .apic_init_signal_blocked = svm_apic_init_signal_blocked,
5008
5009         .msr_filter_changed = svm_msr_filter_changed,
5010         .complete_emulated_msr = svm_complete_emulated_msr,
5011
5012         .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
             /* Reset to NULL by svm_hardware_setup() if enable_apicv ends up false. */
5013         .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
5014 };
5015
5016 /*
5017  * KVM's default MMIO SPTE mask uses a single bit (besides the present
5018  * bit), and that bit can collide with the memory encryption bit.  When
5019  * memory encryption is supported and enabled, install a mask that does
5020  * not overlap with the encryption bit.
5021  */
5022 static __init void svm_adjust_mmio_mask(void)
5023 {
5024         unsigned int c_bit, rsvd_lo;
5025         u64 syscfg, mask;
5026
5027         /* No memory encryption CPUID leaf: keep the existing mask */
5028         if (cpuid_eax(0x80000000) < 0x8000001f)
5029                 return;
5030
5031         /* Memory encryption not enabled in SYSCFG: keep the existing mask */
5032         rdmsrl(MSR_AMD64_SYSCFG, syscfg);
5033         if (!(syscfg & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
5034                 return;
5035
5036         c_bit = cpuid_ebx(0x8000001f) & 0x3f;
5037         rsvd_lo = boot_cpu_data.x86_phys_bits;
5038
5039         /* Step past the encryption bit if the two positions coincide */
5040         if (rsvd_lo == c_bit)
5041                 rsvd_lo++;
5042
5043         /*
5044          * Below bit 52 there are always reserved bits above the physical
5045          * addressing limit, so build the mask from rsvd_bits() plus the
5046          * present bit; such an SPTE produces a page fault with the reserved
5047          * bit set in its error code.  At bit 52 or above, clear the mask.
5048          */
5049         if (rsvd_lo < 52)
5050                 mask = rsvd_bits(rsvd_lo, 51) | PT_PRESENT_MASK;
5051         else
5052                 mask = 0;
5053
5054         kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
5055 }
5056
/*
 * Build the set of CPU capabilities KVM advertises on SVM hosts.
 *
 * Starts from the common kvm_set_cpu_caps() baseline, zeroes
 * kvm_caps.supported_perf_cap and kvm_caps.supported_xss, and then adds
 * or removes individual feature bits based on the module parameters and
 * on what the boot CPU reports.  Called from svm_hardware_setup() after
 * the module parameters have been reconciled with the hardware.
 */
5057 static __init void svm_set_cpu_caps(void)
5058 {
5059         kvm_set_cpu_caps();
5060
5061         kvm_caps.supported_perf_cap = 0;
5062         kvm_caps.supported_xss = 0;
5063
5064         /* CPUID 0x80000001 and 0x8000000A (SVM features) */
5065         if (nested) {
                     /* SVM and VMCBCLEAN are set unconditionally when nested is on. */
5066                 kvm_cpu_cap_set(X86_FEATURE_SVM);
5067                 kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
5068
                     /* The rest follow the corresponding module parameter or host bit. */
5069                 if (nrips)
5070                         kvm_cpu_cap_set(X86_FEATURE_NRIPS);
5071
5072                 if (npt_enabled)
5073                         kvm_cpu_cap_set(X86_FEATURE_NPT);
5074
5075                 if (tsc_scaling)
5076                         kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
5077
5078                 if (vls)
5079                         kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
5080                 if (lbrv)
5081                         kvm_cpu_cap_set(X86_FEATURE_LBRV);
5082
5083                 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
5084                         kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
5085
5086                 if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
5087                         kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
5088
5089                 if (vgif)
5090                         kvm_cpu_cap_set(X86_FEATURE_VGIF);
5091
5092                 if (vnmi)
5093                         kvm_cpu_cap_set(X86_FEATURE_VNMI);
5094
5095                 /* Nested VM can receive #VMEXIT instead of triggering #GP */
5096                 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
5097         }
5098
5099         /* CPUID 0x80000008 */
5100         if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
5101             boot_cpu_has(X86_FEATURE_AMD_SSBD))
5102                 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
5103
5104         if (enable_pmu) {
5105                 /*
5106                  * Enumerate support for PERFCTR_CORE if and only if KVM has
5107                  * access to enough counters to virtualize "core" support,
5108                  * otherwise limit vPMU support to the legacy number of counters.
5109                  */
5110                 if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
5111                         kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
5112                                                           kvm_pmu_cap.num_counters_gp);
5113                 else
5114                         kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);
5115
                     /*
                      * PERFMON_V2 is withdrawn unless the PMU version is exactly 2
                      * and PERFCTR_CORE ended up set above.
                      */
5116                 if (kvm_pmu_cap.version != 2 ||
5117                     !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
5118                         kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
5119         }
5120
5121         /* CPUID 0x8000001F (SME/SEV features) */
5122         sev_set_cpu_caps();
5123 }
5124
/*
 * One-time SVM hardware setup, installed as svm_init_ops.hardware_setup.
 *
 * Allocates the I/O permission map, reconciles the module parameters
 * (tsc_scaling, nested, npt_enabled, nrips, avic, vls, vgif, vnmi, lbrv)
 * with what the boot CPU supports, configures the MMU, runs the SEV and
 * Hyper-V setup hooks, initialises per-CPU state for every possible CPU,
 * trims entries of svm_x86_ops that depend on disabled features, and
 * finally publishes the CPU capabilities via svm_set_cpu_caps().
 *
 * Returns 0 on success, -EOPNOTSUPP if the CPU lacks NX, -ENOMEM if the
 * I/O permission map cannot be allocated, or the error returned by
 * svm_cpu_init() (after calling svm_hardware_unsetup()).
 */
5125 static __init int svm_hardware_setup(void)
5126 {
5127         int cpu;
5128         struct page *iopm_pages;
5129         void *iopm_va;
5130         int r;
5131         unsigned int order = get_order(IOPM_SIZE);
5132
5133         /*
5134          * NX is required for shadow paging and for NPT if the NX huge pages
5135          * mitigation is enabled.
5136          */
5137         if (!boot_cpu_has(X86_FEATURE_NX)) {
5138                 pr_err_ratelimited("NX (Execute Disable) not supported\n");
5139                 return -EOPNOTSUPP;
5140         }
5141         kvm_enable_efer_bits(EFER_NX);
5142
5143         iopm_pages = alloc_pages(GFP_KERNEL, order);
5144
5145         if (!iopm_pages)
5146                 return -ENOMEM;
5147
             /*
              * Fill the whole allocation with 0xff bytes and record its physical
              * address in iopm_base.
              * NOTE(review): all-ones presumably means "intercept every port";
              * confirm against the APM.
              */
5148         iopm_va = page_address(iopm_pages);
5149         memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
5150         iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
5151
5152         init_msrpm_offsets();
5153
             /* Drop BNDREGS/BNDCSR from the supported XCR0 feature set. */
5154         kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
5155                                      XFEATURE_MASK_BNDCSR);
5156
5157         if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
5158                 kvm_enable_efer_bits(EFER_FFXSR);
5159
             /* tsc_scaling is only honoured if the CPU has TSCRATEMSR. */
5160         if (tsc_scaling) {
5161                 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
5162                         tsc_scaling = false;
5163                 } else {
5164                         pr_info("TSC scaling supported\n");
5165                         kvm_caps.has_tsc_control = true;
5166                 }
5167         }
5168         kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
5169         kvm_caps.tsc_scaling_ratio_frac_bits = 32;
5170
5171         tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
5172
5173         if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
5174                 kvm_enable_efer_bits(EFER_AUTOIBRS);
5175
5176         /* Check for pause filtering support */
5177         if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
5178                 pause_filter_count = 0;
5179                 pause_filter_thresh = 0;
5180         } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
5181                 pause_filter_thresh = 0;
5182         }
5183
5184         if (nested) {
5185                 pr_info("Nested Virtualization enabled\n");
5186                 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
5187         }
5188
5189         /*
5190          * KVM's MMU doesn't support using 2-level paging for itself, and thus
5191          * NPT isn't supported if the host is using 2-level paging since host
5192          * CR4 is unchanged on VMRUN.
5193          */
5194         if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
5195                 npt_enabled = false;
5196
5197         if (!boot_cpu_has(X86_FEATURE_NPT))
5198                 npt_enabled = false;
5199
5200         /* Force VM NPT level equal to the host's paging level */
5201         kvm_configure_mmu(npt_enabled, get_npt_level(),
5202                           get_npt_level(), PG_LEVEL_1G);
5203         pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
5204
5205         /* Setup shadow_me_value and shadow_me_mask */
5206         kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
5207
5208         svm_adjust_mmio_mask();
5209
5210         nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
5211
5212         /*
5213          * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
5214          * may be modified by svm_adjust_mmio_mask()), as well as nrips.
5215          */
5216         sev_hardware_setup();
5217
5218         svm_hv_hardware_setup();
5219
             /* Any per-CPU init failure aborts the setup through the err label. */
5220         for_each_possible_cpu(cpu) {
5221                 r = svm_cpu_init(cpu);
5222                 if (r)
5223                         goto err;
5224         }
5225
5226         enable_apicv = avic = avic && avic_hardware_setup();
5227
             /*
              * Without APICv, drop the AVIC-only callbacks from the ops table.
              * With APICv but without x2AVIC, set the
              * allow_apicv_in_x2apic_without_x2apic_virtualization flag instead.
              */
5228         if (!enable_apicv) {
5229                 svm_x86_ops.vcpu_blocking = NULL;
5230                 svm_x86_ops.vcpu_unblocking = NULL;
5231                 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
5232         } else if (!x2avic_enabled) {
5233                 svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
5234         }
5235
             /* vls needs NPT, the V_VMSAVE_VMLOAD CPU feature and a 64-bit build. */
5236         if (vls) {
5237                 if (!npt_enabled ||
5238                     !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
5239                     !IS_ENABLED(CONFIG_X86_64)) {
5240                         vls = false;
5241                 } else {
5242                         pr_info("Virtual VMLOAD VMSAVE supported\n");
5243                 }
5244         }
5245
5246         if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
5247                 svm_gp_erratum_intercept = false;
5248
5249         if (vgif) {
5250                 if (!boot_cpu_has(X86_FEATURE_VGIF))
5251                         vgif = false;
5252                 else
5253                         pr_info("Virtual GIF supported\n");
5254         }
5255
             /* vnmi survives only if vgif is still set and the CPU has VNMI. */
5256         vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
5257         if (vnmi)
5258                 pr_info("Virtual NMI enabled\n");
5259
5260         if (!vnmi) {
5261                 svm_x86_ops.is_vnmi_pending = NULL;
5262                 svm_x86_ops.set_vnmi_pending = NULL;
5263         }
5264
5265
5266         if (lbrv) {
5267                 if (!boot_cpu_has(X86_FEATURE_LBRV))
5268                         lbrv = false;
5269                 else
5270                         pr_info("LBR virtualization supported\n");
5271         }
5272
5273         if (!enable_pmu)
5274                 pr_info("PMU virtualization is disabled\n");
5275
             /* Must run after the feature flags above have reached their final values. */
5276         svm_set_cpu_caps();
5277
5278         /*
5279          * It seems that on AMD processors PTE's accessed bit is
5280          * being set by the CPU hardware before the NPF vmexit.
5281          * This is not expected behaviour and our tests fail because
5282          * of it.
5283          * A workaround here is to disable support for
5284          * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
5285          * In this case userspace can know if there is support using
5286          * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
5287          * it
5288          * If future AMD CPU models change the behaviour described above,
5289          * this variable can be changed accordingly
5290          */
5291         allow_smaller_maxphyaddr = !npt_enabled;
5292
5293         return 0;
5294
             /* Reached only when svm_cpu_init() failed: tear down and report r. */
5295 err:
5296         svm_hardware_unsetup();
5297         return r;
5298 }
5299
5300
/*
 * Init-time descriptor that svm_init() passes to kvm_x86_vendor_init().
 * It names the hardware setup routine, the runtime callback table
 * (svm_x86_ops) and the PMU callback table (amd_pmu_ops).
 */
5301 static struct kvm_x86_init_ops svm_init_ops __initdata = {
5302         .hardware_setup = svm_hardware_setup,
5303
5304         .runtime_ops = &svm_x86_ops,
5305         .pmu_ops = &amd_pmu_ops,
5306 };
5307
/*
 * Undo the parts of svm_init() that precede kvm_init(): call
 * kvm_x86_vendor_exit() and then unregister svm_emergency_disable as the
 * emergency virtualization callback.
 *
 * Used both by svm_init()'s error path and by svm_exit().
 */
5308 static void __svm_exit(void)
5309 {
5310         kvm_x86_vendor_exit();
5311
5312         cpu_emergency_unregister_virt_callback(svm_emergency_disable);
5313 }
5314
/*
 * Module entry point (registered through module_init()).
 *
 * Returns -EOPNOTSUPP when kvm_is_svm_supported() is false, otherwise the
 * first non-zero result of kvm_x86_vendor_init() or kvm_init(), or 0 when
 * both succeed.
 */
5315 static int __init svm_init(void)
5316 {
5317         int err;
5318
5319         __unused_size_checks();
5320
5321         if (!kvm_is_svm_supported())
5322                 return -EOPNOTSUPP;
5323
5324         err = kvm_x86_vendor_init(&svm_init_ops);
5325         if (err)
5326                 return err;
5327
5328         cpu_emergency_register_virt_callback(svm_emergency_disable);
5329
5330         /*
5331          * kvm_init() has to be the very last step, because /dev/kvm is
5332          * exposed to userspace once common KVM initialization has run.
5333          *
5334          * If it fails, __svm_exit() unwinds the vendor init and the
5335          * emergency callback registration done above.  On success there
5336          * is nothing to unwind and err is zero.
5337          */
5338         err = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
5339                        THIS_MODULE);
5340         if (err)
5341                 __svm_exit();
5342
5343         return err;
5344 }
5345
/*
 * Module exit point (registered through module_exit()): calls kvm_exit()
 * first and then __svm_exit(), i.e. the reverse of the order used by
 * svm_init().
 */
5346 static void __exit svm_exit(void)
5347 {
5348         kvm_exit();
5349         __svm_exit();
5350 }
5351
/* Register svm_init() and svm_exit() as the module's init and exit handlers. */
5352 module_init(svm_init)
5353 module_exit(svm_exit)