From efd995dae5eba57c5d28d6886a85298b390a4f07 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 4 Mar 2022 11:43:13 -0500 Subject: [PATCH] KVM: x86/mmu: Zap defunct roots via asynchronous worker Zap defunct roots, a.k.a. roots that have been invalidated after their last reference was initially dropped, asynchronously via the existing work queue instead of forcing the work upon the unfortunate task that happened to drop the last reference. If a vCPU task drops the last reference, the vCPU is effectively blocked by the host for the entire duration of the zap. If the root being zapped happens be fully populated with 4kb leaf SPTEs, e.g. due to dirty logging being active, the zap can take several hundred seconds. Unsurprisingly, most guests are unhappy if a vCPU disappears for hundreds of seconds. E.g. running a synthetic selftest that triggers a vCPU root zap with ~64tb of guest memory and 4kb SPTEs blocks the vCPU for 900+ seconds. Offloading the zap to a worker drops the block time to <100ms. There is an important nuance to this change. If the same work item was queued twice before the work function has run, it would only execute once and one reference would be leaked. Therefore, now that queueing and flushing items is not anymore protected by kvm->slots_lock, kvm_tdp_mmu_invalidate_all_roots() has to check root->role.invalid and skip already invalid roots. On the other hand, kvm_mmu_zap_all_fast() must return only after those skipped roots have been zapped as well. These two requirements can be satisfied only if _all_ places that change invalid to true now schedule the worker before releasing the mmu_lock. There are just two, kvm_tdp_mmu_put_root() and kvm_tdp_mmu_invalidate_all_roots(). Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Ben Gardon Message-Id: <20220226001546.360188-23-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 8e31627bca59..3751a33ee258 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -162,23 +162,24 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, * So the root temporarily gets an extra reference, going to refcount=1 * while staying invalid. Readers still cannot acquire any reference; * but writers are now allowed to run if tdp_mmu_zap_root yields and - * they might take an extra reference if they themselves yield. Therefore, - * when the reference is given back after tdp_mmu_zap_root terminates, + * they might take an extra reference if they themselves yield. + * Therefore, when the reference is given back by the worker, * there is no guarantee that the refcount is still 1. If not, whoever * puts the last reference will free the page, but they will not have to * zap the root because a root cannot go from invalid to valid. */ if (!kvm_tdp_root_mark_invalid(root)) { refcount_set(&root->tdp_mmu_root_count, 1); - tdp_mmu_zap_root(kvm, root, shared); /* - * Give back the reference that was added back above. We now - * know that the root is invalid, so go ahead and free it if - * no one has taken a reference in the meanwhile. + * Zapping the root in a worker is not just "nice to have"; + * it is required because kvm_tdp_mmu_invalidate_all_roots() + * skips already-invalid roots. If kvm_tdp_mmu_put_root() did + * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast() + * might return with some roots not zapped yet. */ - if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) - return; + tdp_mmu_schedule_zap_root(kvm, root); + return; } spin_lock(&kvm->arch.tdp_mmu_pages_lock); @@ -1022,7 +1023,8 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) lockdep_assert_held_write(&kvm->mmu_lock); list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { - if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) { + if (!root->role.invalid && + !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) { root->role.invalid = true; tdp_mmu_schedule_zap_root(kvm, root); } -- 2.34.1