// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_types.h>
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>

#include <asm/irq_remapping.h>

/*
 * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e. the vCPU ID,
 * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
 * if an interrupt can't be delivered, e.g. because the vCPU isn't running.
 *
 * For the vCPU ID, use however many bits are currently allowed for the max
 * guest physical APIC ID (limited by the size of the physical ID table), and
 * use whatever bits remain to assign arbitrary AVIC IDs to VMs.  Note, the
 * size of the GATag is defined by hardware (32 bits), but is an opaque value
 * as far as hardware is concerned.
 */
#define AVIC_VCPU_ID_MASK		AVIC_PHYSICAL_MAX_INDEX_MASK

#define AVIC_VM_ID_SHIFT		HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK			(GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)

#define AVIC_GATAG_TO_VMID(x)		((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x)		(x & AVIC_VCPU_ID_MASK)

#define __AVIC_GATAG(vm_id, vcpu_id)	((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
					 ((vcpu_id) & AVIC_VCPU_ID_MASK))
#define AVIC_GATAG(vm_id, vcpu_id)					\
({									\
	u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id);			\
									\
	WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id));	\
	WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id));		\
	ga_tag;								\
})

static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);

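/*
 * Illustrative encoding example: assuming AVIC_PHYSICAL_MAX_INDEX_MASK
 * covers bits 8:0 (as defined in svm.h), the low 9 bits of a GATag hold the
 * vCPU ID and the remaining 23 bits hold the VM ID, e.g. vm_id = 0x2 and
 * vcpu_id = 0x5 encode to (0x2 << 9) | 0x5 = 0x405.
 */
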
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);

/*
 * This hash table is used to map a VM_ID to a struct kvm_svm when handling
 * an AMD IOMMU GALOG notification to schedule in a particular vCPU.
 */
#define SVM_VM_DATA_HASH_BITS	8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
bool x2avic_enabled;

/*
 * This is a wrapper of struct amd_iommu_ir_data.
 */
struct amd_svm_iommu_ir {
	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
	void *data;		/* Storing pointer to struct amd_ir_data */
};

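/*
 * Entries on the per-vCPU ir_list are allocated in svm_ir_list_add() and
 * removed (and freed) in svm_ir_list_del(), e.g. when the IRTE is switched
 * back to legacy remapping or re-targeted to a different vCPU.
 */
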
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;

	/*
	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
	 * accesses, while interrupt injection to a running vCPU can be
	 * achieved using the AVIC doorbell.  KVM disables the APIC access page
	 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
	 * AVIC in hybrid mode activates only the doorbell mechanism.
	 */
	if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
		vmcb->control.int_ctl |= X2APIC_MODE_MASK;
		vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
		/* Disabling MSR intercept for x2APIC registers */
		svm_set_x2apic_msr_interception(svm, false);
	} else {
		/*
		 * Flush the TLB, the guest may have inserted a non-APIC
		 * mapping into the TLB while AVIC was disabled.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);

		/* For xAVIC and hybrid-xAVIC modes */
		vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
		/* Enabling MSR intercept for x2APIC registers */
		svm_set_x2apic_msr_interception(svm, true);
	}
}

static void avic_deactivate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	/*
	 * If running nested and the guest uses its own MSR bitmap, there
	 * is no need to update L0's msr bitmap.
	 */
	if (is_guest_mode(&svm->vcpu) &&
	    vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
		return;

	/* Enabling MSR intercept for x2APIC registers */
	svm_set_x2apic_msr_interception(svm, true);
}

/*
 * This function is called from the IOMMU driver to notify
 * SVM to schedule in a particular vCPU of a particular VM.
 */
int avic_ga_log_notifier(u32 ga_tag)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);

	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
	trace_kvm_avic_ga_log(vm_id, vcpu_id);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
		if (kvm_svm->avic_vm_id != vm_id)
			continue;
		vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
		break;
	}
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	/*
	 * At this point, the IOMMU should have already set the pending
	 * bit in the vAPIC backing page. So, we just need to schedule
	 * in the target vCPU.
	 */
	if (vcpu)
		kvm_vcpu_wake_up(vcpu);

	return 0;
}

void avic_vm_destroy(struct kvm *kvm)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (!enable_apicv)
		return;

	if (kvm_svm->avic_logical_id_table_page)
		__free_page(kvm_svm->avic_logical_id_table_page);
	if (kvm_svm->avic_physical_id_table_page)
		__free_page(kvm_svm->avic_physical_id_table_page);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_del(&kvm_svm->hnode);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}

int avic_vm_init(struct kvm *kvm)
{
	unsigned long flags;
	int err = -ENOMEM;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	struct kvm_svm *k2;
	struct page *p_page;
	struct page *l_page;
	u32 vm_id;

	if (!enable_apicv)
		return 0;

	/* Allocating physical APIC ID table (4KB) */
	p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!p_page)
		goto free_avic;
	kvm_svm->avic_physical_id_table_page = p_page;

	/* Allocating logical APIC ID table (4KB) */
	l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!l_page)
		goto free_avic;
	kvm_svm->avic_logical_id_table_page = l_page;

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
	if (vm_id == 0) { /* id is 1-based, zero is not okay */
		next_vm_id_wrapped = 1;
		goto again;
	}
	/* Is it still in use? Only possible if wrapped at least once */
	if (next_vm_id_wrapped) {
		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
			if (k2->avic_vm_id == vm_id)
				goto again;
		}
	}
	kvm_svm->avic_vm_id = vm_id;
	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	return 0;

free_avic:
	avic_vm_destroy(kvm);
	return err;
}

void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
	phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
	phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
	phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));

	vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
	vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
	vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;

	if (kvm_apicv_activated(svm->vcpu.kvm))
		avic_activate_vmcb(svm);
	else
		avic_deactivate_vmcb(svm);
}

static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
				       unsigned int index)
{
	u64 *avic_physical_id_table;
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);

	if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
	    (index > X2AVIC_MAX_PHYSICAL_ID))
		return NULL;

	avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);

	return &avic_physical_id_table[index];
}

static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
	u64 *entry, new_entry;
	int id = vcpu->vcpu_id;
	struct vcpu_svm *svm = to_svm(vcpu);

	if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
	    (id > X2AVIC_MAX_PHYSICAL_ID))
		return -EINVAL;

	if (!vcpu->arch.apic->regs)
		return -EINVAL;

	if (kvm_apicv_activated(vcpu->kvm)) {
		int ret;

		/*
		 * Note, AVIC hardware walks the nested page table to check
		 * permissions, but does not use the SPA address specified in
		 * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
		 * pointer field of the VMCB.
		 */
		ret = kvm_alloc_apic_access_page(vcpu->kvm);
		if (ret)
			return ret;
	}

	svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);

	/* Setting AVIC backing page address in the phy APIC ID table */
	entry = avic_get_physical_id_entry(vcpu, id);
	if (!entry)
		return -EINVAL;

	new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
			       AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
			      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
	WRITE_ONCE(*entry, new_entry);

	svm->avic_physical_id_cache = entry;

	return 0;
}

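/*
 * Note: avic_init_backing_page() only records the backing page address and
 * the valid bit; the host physical APIC ID and the IsRunning bit of the
 * entry are filled in later by avic_vcpu_load() when the vCPU is scheduled
 * in.
 */
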
void avic_ring_doorbell(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point,
	 * which could result in signalling the wrong/previous pCPU.  But if
	 * that happens the vCPU is guaranteed to do a VMRUN (after being
	 * migrated) and thus will process pending interrupts, i.e. a doorbell
	 * is not needed (and the spurious one is harmless).
	 */
	int cpu = READ_ONCE(vcpu->cpu);

	if (cpu != get_cpu()) {
		wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
		trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
	}
	put_cpu();
}

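/*
 * Writing the target pCPU's APIC ID to the doorbell MSR prompts that pCPU to
 * re-evaluate the vIRR of the vCPU it is currently running, so the interrupt
 * is delivered without forcing a VM-Exit on the target.
 */
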
static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
{
	vcpu->arch.apic->irr_pending = true;
	svm_complete_interrupt_delivery(vcpu,
					icrl & APIC_MODE_MASK,
					icrl & APIC_INT_LEVELTRIG,
					icrl & APIC_VECTOR_MASK);
}

static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
					  u32 icrl)
{
	/*
	 * KVM inhibits AVIC if any vCPU ID diverges from the vCPU's APIC ID,
	 * i.e. it is safe to assume APIC ID == vCPU ID here.
	 */
	struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);

	/* Once again, nothing to do if the target vCPU doesn't exist. */
	if (unlikely(!target_vcpu))
		return;

	avic_kick_vcpu(target_vcpu, icrl);
}

static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
					 u32 logid_index, u32 icrl)
{
	u32 physical_id;

	if (avic_logical_id_table) {
		u32 logid_entry = avic_logical_id_table[logid_index];

		/* Nothing to do if the logical destination is invalid. */
		if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
			return;

		physical_id = logid_entry &
			      AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	} else {
		/*
		 * For x2APIC, the logical APIC ID is a read-only value that is
		 * derived from the x2APIC ID, thus the x2APIC ID can be found
		 * by reversing the calculation (stored in logid_index).  Note,
		 * bits 31:20 of the x2APIC ID aren't propagated to the logical
		 * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
		 */
		physical_id = logid_index;
	}

	avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
}

/*
 * A fast-path version of avic_kick_target_vcpus(), which attempts to match
 * destination APIC ID to vCPU without looping through all vCPUs.
 */
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
				       u32 icrl, u32 icrh, u32 index)
{
	int dest_mode = icrl & APIC_DEST_MASK;
	int shorthand = icrl & APIC_SHORT_MASK;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	u32 dest;

	if (shorthand != APIC_DEST_NOSHORT)
		return -EINVAL;

	if (apic_x2apic_mode(source))
		dest = icrh;
	else
		dest = GET_XAPIC_DEST_FIELD(icrh);

	if (dest_mode == APIC_DEST_PHYSICAL) {
		/* broadcast destination, use slow path */
		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
			return -EINVAL;
		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
			return -EINVAL;

		if (WARN_ON_ONCE(dest != index))
			return -EINVAL;

		avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
	} else {
		u32 *avic_logical_id_table;
		unsigned long bitmap, i;
		u32 cluster;

		if (apic_x2apic_mode(source)) {
			/* 16 bit dest mask, 16 bit cluster id */
			bitmap = dest & 0xFFFF;
			cluster = (dest >> 16) << 4;
		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
			/* 8 bit dest mask */
			bitmap = dest;
			cluster = 0;
		} else {
			/* 4 bit dest mask, 4 bit cluster id */
			bitmap = dest & 0xF;
			cluster = (dest >> 4) << 2;
		}

		/* Nothing to do if there are no destinations in the cluster. */
		if (unlikely(!bitmap))
			return 0;

		if (apic_x2apic_mode(source))
			avic_logical_id_table = NULL;
		else
			avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);

		/*
		 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
		 * IDs, thus each bit in the destination is guaranteed to map
		 * to at most one vCPU.
		 */
		for_each_set_bit(i, &bitmap, 16)
			avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
						     cluster + i, icrl);
	}

	return 0;
}

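/*
 * Worked example (illustrative): an xAPIC IPI in cluster mode with
 * destination 0x21 selects cluster 2 and logical bit 0, so the index passed
 * to avic_kick_vcpu_by_logical_id() is (0x2 << 2) + 0 = 8, matching the
 * layout used by avic_get_logical_id_entry() when the entry was written.
 */
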
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
				   u32 icrl, u32 icrh, u32 index)
{
	u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
	unsigned long i;
	struct kvm_vcpu *vcpu;

	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
		return;

	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);

	/*
	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
	 * event.  There's no need to signal doorbells, as hardware has handled
	 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
	 * since entered the guest will have processed pending IRQs at VMRUN.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
					dest, icrl & APIC_DEST_MASK))
			avic_kick_vcpu(vcpu, icrl);
	}
}

int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
	u32 icrl = svm->vmcb->control.exit_info_1;
	u32 id = svm->vmcb->control.exit_info_2 >> 32;
	u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
	struct kvm_lapic *apic = vcpu->arch.apic;

	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);

	switch (id) {
	case AVIC_IPI_FAILURE_INVALID_TARGET:
	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
		/*
		 * Emulate IPIs that are not handled by AVIC hardware, which
		 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
		 * if _any_ targets are invalid, e.g. if the logical mode mask
		 * is a superset of running vCPUs.
		 *
		 * The exit is a trap, i.e. ICR holds the correct value and RIP
		 * has been advanced, KVM is responsible only for emulating the
		 * IPI.  Sadly, hardware may sometimes leave the BUSY flag set,
		 * in which case KVM needs to emulate the ICR write as well in
		 * order to clear the BUSY flag.
		 */
		if (icrl & APIC_ICR_BUSY)
			kvm_apic_write_nodecode(vcpu, APIC_ICR);
		else
			kvm_apic_send_ipi(apic, icrl, icrh);
		break;
	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
		/*
		 * At this point, we expect that the AVIC HW has already
		 * set the appropriate IRR bits on the valid target
		 * vcpus. So, we just need to kick the appropriate vcpu.
		 */
		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
		break;
	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
		WARN_ONCE(1, "Invalid backing page\n");
		break;
	default:
		pr_err("Unknown IPI interception\n");
	}

	return 1;
}

unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu))
		return APICV_INHIBIT_REASON_NESTED;

	return 0;
}

static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	u32 *logical_apic_id_table;
	u32 cluster, index;

	ldr = GET_APIC_LOGICAL_ID(ldr);

	if (flat) {
		cluster = 0;
	} else {
		cluster = (ldr >> 4);
		if (cluster >= 0xf)
			return NULL;
		ldr &= 0xf;
	}
	if (!ldr || !is_power_of_2(ldr))
		return NULL;

	index = __ffs(ldr);
	if (WARN_ON_ONCE(index > 7))
		return NULL;
	index += (cluster << 2);

	logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);

	return &logical_apic_id_table[index];
}

static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
	bool flat;
	u32 *entry, new_entry;

	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
	if (!entry)
		return;

	new_entry = READ_ONCE(*entry);
	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
	WRITE_ONCE(*entry, new_entry);
}

static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
	u32 *entry;

	/* Note: x2AVIC does not use the logical APIC ID table */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
	if (entry)
		clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}

static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
	u32 id = kvm_xapic_id(vcpu->arch.apic);

	/* AVIC does not support LDR update for x2APIC */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	if (ldr == svm->ldr_reg)
		return;

	avic_invalidate_logical_id_entry(vcpu);

	svm->ldr_reg = ldr;
	avic_ldr_write(vcpu, id, ldr);
}

static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);

	if (svm->dfr_reg == dfr)
		return;

	avic_invalidate_logical_id_entry(vcpu);
	svm->dfr_reg = dfr;
}

static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
{
	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
				AVIC_UNACCEL_ACCESS_OFFSET_MASK;

	switch (offset) {
	case APIC_LDR:
		avic_handle_ldr_update(vcpu);
		break;
	case APIC_DFR:
		avic_handle_dfr_update(vcpu);
		break;
	case APIC_RRR:
		/* Ignore writes to Read Remote Data, it's read-only. */
		return 1;
	default:
		break;
	}

	kvm_apic_write_nodecode(vcpu, offset);

	return 1;
}

static bool is_avic_unaccelerated_access_trap(u32 offset)

int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret = 0;
	u32 offset = svm->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
	u32 vector = svm->vmcb->control.exit_info_2 &
		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
	bool trap = is_avic_unaccelerated_access_trap(offset);

	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
					    trap, write, vector);
	if (trap) {
		/* Handling Trap */
		WARN_ONCE(!write, "svm: Handling trap read.\n");
		ret = avic_unaccel_trap_write(vcpu);
	} else {
		/* Handling Fault */
		ret = kvm_emulate_instruction(vcpu, 0);
	}

	return ret;
}

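/*
 * Trap-style exits arrive with RIP already advanced past the access, so only
 * the register side effect needs to be emulated; fault-style exits still
 * require full instruction emulation via kvm_emulate_instruction().
 */
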
int avic_init_vcpu(struct vcpu_svm *svm)
{
	int ret;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
		return 0;

	ret = avic_init_backing_page(vcpu);
	if (ret)
		return ret;

	INIT_LIST_HEAD(&svm->ir_list);
	spin_lock_init(&svm->ir_list_lock);
	svm->dfr_reg = APIC_DFR_FLAT;

	return ret;
}

void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
	avic_handle_dfr_update(vcpu);
	avic_handle_ldr_update(vcpu);
}

static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
	int ret = 0;
	unsigned long flags;
	struct amd_svm_iommu_ir *ir;
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!kvm_arch_has_assigned_device(vcpu->kvm))
		return 0;

	/*
	 * Here, we go through the per-vcpu ir_list to update all existing
	 * interrupt remapping table entries targeting this vcpu.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	if (list_empty(&svm->ir_list))
		goto out;

	list_for_each_entry(ir, &svm->ir_list, node) {
		if (activate)
			ret = amd_iommu_activate_guest_mode(ir->data);
		else
			ret = amd_iommu_deactivate_guest_mode(ir->data);
		if (ret)
			break;
	}
out:
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
	return ret;
}

static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
	unsigned long flags;
	struct amd_svm_iommu_ir *cur;

	spin_lock_irqsave(&svm->ir_list_lock, flags);
	list_for_each_entry(cur, &svm->ir_list, node) {
		if (cur->data != pi->ir_data)
			continue;
		list_del(&cur->node);
		kfree(cur);
		break;
	}
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
	int ret = 0;
	unsigned long flags;
	struct amd_svm_iommu_ir *ir;

	/*
	 * In some cases, the existing irte is updated and re-set,
	 * so we need to check here if it has already been added
	 * to the ir_list.
	 */
	if (pi->ir_data && (pi->prev_ga_tag != 0)) {
		struct kvm *kvm = svm->vcpu.kvm;
		u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
		struct vcpu_svm *prev_svm;

		if (!prev_vcpu) {
			ret = -EINVAL;
			goto out;
		}

		prev_svm = to_svm(prev_vcpu);
		svm_ir_list_del(prev_svm, pi);
	}

	/*
	 * Allocate a new amd_svm_iommu_ir, which will be added
	 * to the per-vcpu ir_list.
	 */
	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
	if (!ir) {
		ret = -ENOMEM;
		goto out;
	}
	ir->data = pi->ir_data;

	spin_lock_irqsave(&svm->ir_list_lock, flags);
	list_add(&ir->node, &svm->ir_list);
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
out:
	return ret;
}

/*
 * Note:
 * The HW cannot support posting multicast/broadcast
 * interrupts to a vCPU. So, we still use legacy interrupt
 * remapping for these kinds of interrupts.
 *
 * For lowest-priority interrupts, we only support
 * those with a single CPU as the destination, e.g. the user
 * configures the interrupts via /proc/irq or uses
 * irqbalance to make the interrupts single-CPU.
 */
static int
get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
		 struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
{
	struct kvm_lapic_irq irq;
	struct kvm_vcpu *vcpu = NULL;

	kvm_set_msi_irq(kvm, e, &irq);

	if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
	    !kvm_irq_is_postable(&irq)) {
		pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
			 __func__, irq.vector);
		return -1;
	}

	pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
		 irq.vector);
	*svm = to_svm(vcpu);
	vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
	vcpu_info->vector = irq.vector;

	return 0;
}

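/*
 * On success the caller feeds vcpu_info into the IOMMU IRTE for guest mode:
 * pi_desc_addr points at the target vCPU's AVIC backing page, so the device
 * interrupt is posted directly into that vCPU's vIRR.
 */
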
/*
 * avic_pi_update_irte - set IRTE for Posted-Interrupts
 *
 * @kvm: kvm
 * @host_irq: host irq of the interrupt
 * @guest_irq: gsi of the interrupt
 * @set: set or unset PI
 * returns 0 on success, < 0 on failure
 */
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
			uint32_t guest_irq, bool set)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_irq_routing_table *irq_rt;
	int idx, ret = 0;

	if (!kvm_arch_has_assigned_device(kvm) ||
	    !irq_remapping_cap(IRQ_POSTING_CAP))
		return 0;

	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
		 __func__, host_irq, guest_irq, set);

	idx = srcu_read_lock(&kvm->irq_srcu);
	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);

	if (guest_irq >= irq_rt->nr_rt_entries ||
	    hlist_empty(&irq_rt->map[guest_irq])) {
		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
			     guest_irq, irq_rt->nr_rt_entries);
		goto out;
	}

	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
		struct vcpu_data vcpu_info;
		struct vcpu_svm *svm = NULL;

		if (e->type != KVM_IRQ_ROUTING_MSI)
			continue;

		/*
		 * Here, we set up with legacy mode in the following cases:
		 * 1. When the interrupt cannot be targeted to a specific vcpu.
		 * 2. Unsetting posted interrupt.
		 * 3. APIC virtualization is disabled for the vcpu.
		 * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
		 */
		if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
		    kvm_vcpu_apicv_active(&svm->vcpu)) {
			struct amd_iommu_pi_data pi;

			/* Try to enable guest_mode in IRTE */
			pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
					    AVIC_HPA_MASK);
			pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
					       svm->vcpu.vcpu_id);
			pi.is_guest_mode = true;
			pi.vcpu_data = &vcpu_info;
			ret = irq_set_vcpu_affinity(host_irq, &pi);

			/*
			 * Here, we have successfully set up vcpu affinity in
			 * IOMMU guest mode. Now, we need to store the posted
			 * interrupt information in a per-vcpu ir_list so that
			 * we can reference it directly when we update the vcpu
			 * scheduling information in the IOMMU irte.
			 */
			if (!ret && pi.is_guest_mode)
				svm_ir_list_add(svm, &pi);
		} else {
			/* Use legacy mode in IRTE */
			struct amd_iommu_pi_data pi;

			/*
			 * Here, pi is used to:
			 * - Tell IOMMU to use legacy mode for this interrupt.
			 * - Retrieve ga_tag of prior interrupt remapping data.
			 */
			pi.prev_ga_tag = 0;
			pi.is_guest_mode = false;
			ret = irq_set_vcpu_affinity(host_irq, &pi);

			/*
			 * Check if the posted interrupt was previously
			 * set up in guest_mode by checking if the ga_tag
			 * was cached. If so, we need to clean up the per-vcpu
			 * ir_list.
			 */
			if (!ret && pi.prev_ga_tag) {
				int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
				struct kvm_vcpu *vcpu;

				vcpu = kvm_get_vcpu_by_id(kvm, id);
				if (vcpu)
					svm_ir_list_del(to_svm(vcpu), &pi);
			}
		}

		if (!ret && svm) {
			trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
						 e->gsi, vcpu_info.vector,
						 vcpu_info.pi_desc_addr, set);
		}

		if (ret < 0) {
			pr_err("%s: failed to update PI IRTE\n", __func__);
			goto out;
		}
	}

	ret = 0;
out:
	srcu_read_unlock(&kvm->irq_srcu, idx);
	return ret;
}

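/*
 * In short: a postable MSI that targets a single, APICv-active vCPU gets a
 * guest-mode IRTE (interrupts are posted straight into the vCPU's backing
 * page); every other case falls back to legacy remapping, and any stale
 * guest-mode bookkeeping is removed from the previous vCPU's ir_list.
 */
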
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
	int ret = 0;
	unsigned long flags;
	struct amd_svm_iommu_ir *ir;
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!kvm_arch_has_assigned_device(vcpu->kvm))
		return 0;

	/*
	 * Here, we go through the per-vcpu ir_list to update all existing
	 * interrupt remapping table entries targeting this vcpu.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	if (list_empty(&svm->ir_list))
		goto out;

	list_for_each_entry(ir, &svm->ir_list, node) {
		ret = amd_iommu_update_ga(cpu, r, ir->data);
		if (ret)
			break;
	}
out:
	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
	return ret;
}

void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	u64 entry;
	int h_physical_id = kvm_cpu_get_apicid(cpu);
	struct vcpu_svm *svm = to_svm(vcpu);

	lockdep_assert_preemption_disabled();

	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
		return;

	/*
	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
	 * is being scheduled in after being preempted.  The CPU entries in the
	 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
	 * If the vCPU was migrated, its new CPU value will be stuffed when the
	 * vCPU unblocks.
	 */
	if (kvm_vcpu_is_blocking(vcpu))
		return;

	entry = READ_ONCE(*(svm->avic_physical_id_cache));
	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);

	entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
}

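/*
 * Updating the IOMMU GA (guest APIC) entries here lets assigned devices post
 * interrupts directly to the pCPU this vCPU now runs on; avic_vcpu_put()
 * performs the reverse transition when the vCPU is scheduled out.
 */
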
void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
	u64 entry;
	struct vcpu_svm *svm = to_svm(vcpu);

	lockdep_assert_preemption_disabled();

	entry = READ_ONCE(*(svm->avic_physical_id_cache));

	/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
		return;

	avic_update_iommu_vcpu_affinity(vcpu, -1, 0);

	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}

void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;

	if (!lapic_in_kernel(vcpu) || !enable_apicv)
		return;

	if (kvm_vcpu_apicv_active(vcpu)) {
		/*
		 * During AVIC temporary deactivation, the guest could update
		 * the APIC ID, DFR and LDR registers, which would not be
		 * trapped by avic_unaccelerated_access_interception(). In this
		 * case, we need to check and update the AVIC logical APIC ID
		 * table accordingly before re-activating.
		 */
		avic_apicv_post_state_restore(vcpu);
		avic_activate_vmcb(svm);
	} else {
		avic_deactivate_vmcb(svm);
	}
	vmcb_mark_dirty(vmcb, VMCB_AVIC);
}

void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	bool activated = kvm_vcpu_apicv_active(vcpu);

	if (!enable_apicv)
		return;

	avic_refresh_virtual_apic_mode(vcpu);

	if (activated)
		avic_vcpu_load(vcpu, vcpu->cpu);
	else
		avic_vcpu_put(vcpu);

	avic_set_pi_irte_mode(vcpu, activated);
}

void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	/*
	 * Unload the AVIC when the vCPU is about to block, _before_
	 * the vCPU actually blocks.
	 *
	 * Any IRQs that arrive before IsRunning=0 will not cause an
	 * incomplete IPI vmexit on the source, therefore vIRR will also
	 * be checked by kvm_vcpu_check_block() before blocking.  The
	 * memory barrier implicit in set_current_state orders writing
	 * IsRunning=0 before reading the vIRR.  The processor needs a
	 * matching memory barrier on interrupt delivery between writing
	 * IRR and reading IsRunning; the lack of this barrier might be
	 * the cause of errata #1235.
	 */
	avic_vcpu_put(vcpu);
}

void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	avic_vcpu_load(vcpu, vcpu->cpu);
}

/*
 * Note:
 * - The module param avic enables both xAPIC and x2APIC modes.
 * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
 * - The mode can be switched at run-time.
 */
bool avic_hardware_setup(void)
{
	if (!npt_enabled)
		return false;

	/* AVIC is a prerequisite for x2AVIC. */
	if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
		if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
			pr_warn(FW_BUG "Cannot support x2AVIC because AVIC is disabled");
			pr_warn(FW_BUG "Try enabling AVIC using the force_avic option");
		}
		return false;
	}

	if (boot_cpu_has(X86_FEATURE_AVIC)) {
		pr_info("AVIC enabled\n");
	} else if (force_avic) {
		/*
		 * Some older systems do not advertise AVIC support.
		 * See the Revision Guide for the specific AMD processor
		 * for more detail.
		 */
		pr_warn("AVIC is not supported in CPUID but force enabled");
		pr_warn("Your system might crash and burn");
	}

	/* AVIC is a prerequisite for x2AVIC. */
	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
	if (x2avic_enabled)
		pr_info("x2AVIC enabled\n");

	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);

	return true;
}