KVM: x86/mmu: Stop zapping invalidated TDP MMU roots asynchronously

author Sean Christopherson <seanjc@google.com>

Sat, 16 Sep 2023 00:39:15 +0000 (17:39 -0700)

committer Paolo Bonzini <pbonzini@redhat.com>

Sat, 23 Sep 2023 09:35:48 +0000 (05:35 -0400)
author Sean Christopherson <seanjc@google.com>
Sat, 16 Sep 2023 00:39:15 +0000 (17:39 -0700)
committer Paolo Bonzini <pbonzini@redhat.com>
Sat, 23 Sep 2023 09:35:48 +0000 (05:35 -0400)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index 1a4def3..17715cb 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1419,7 +1419,6 @@ struct kvm_arch {
          * the thread holds the MMU lock in write mode.
          */
         spinlock_t tdp_mmu_pages_lock;
-       struct workqueue_struct *tdp_mmu_zap_wq;
  #endif /* CONFIG_X86_64 */
  
         /*
@@ -1835,7 +1834,7 @@ void kvm_mmu_vendor_module_exit(void);
  
  void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
  int kvm_mmu_create(struct kvm_vcpu *vcpu);
-int kvm_mmu_init_vm(struct kvm *kvm);
+void kvm_mmu_init_vm(struct kvm *kvm);
  void kvm_mmu_uninit_vm(struct kvm *kvm);
  
  void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c

index 54f94f6..f7901cb 100644 (file)
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6167,20 +6167,15 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
         return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
  }
  
-int kvm_mmu_init_vm(struct kvm *kvm)
+void kvm_mmu_init_vm(struct kvm *kvm)
  {
-       int r;
-
         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
         INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
         INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
         spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
  
-       if (tdp_mmu_enabled) {
-               r = kvm_mmu_init_tdp_mmu(kvm);
-               if (r < 0)
-                       return r;
-       }
+       if (tdp_mmu_enabled)
+               kvm_mmu_init_tdp_mmu(kvm);
  
         kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
         kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
@@ -6189,8 +6184,6 @@ int kvm_mmu_init_vm(struct kvm *kvm)
  
         kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
         kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
-
-       return 0;
  }
  
  static void mmu_free_vm_memory_caches(struct kvm *kvm)
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h

index b102014..decc1f1 100644 (file)
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -58,7 +58,12 @@ struct kvm_mmu_page {
  
         bool tdp_mmu_page;
         bool unsync;
-       u8 mmu_valid_gen;
+       union {
+               u8 mmu_valid_gen;
+
+               /* Only accessed under slots_lock.  */
+               bool tdp_mmu_scheduled_root_to_zap;
+       };
  
          /*
           * The shadow page can't be replaced by an equivalent huge page
@@ -100,13 +105,7 @@ struct kvm_mmu_page {
                 struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
                 tdp_ptep_t ptep;
         };
-       union {
-               DECLARE_BITMAP(unsync_child_bitmap, 512);
-               struct {
-                       struct work_struct tdp_mmu_async_work;
-                       void *tdp_mmu_async_data;
-               };
-       };
+       DECLARE_BITMAP(unsync_child_bitmap, 512);
  
         /*
          * Tracks shadow pages that, if zapped, would allow KVM to create an NX
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c

index aa90901..6cd4dd6 100644 (file)
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -12,18 +12,10 @@
  #include <trace/events/kvm.h>
  
  /* Initializes the TDP MMU for the VM, if enabled. */
-int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
  {
-       struct workqueue_struct *wq;
-
-       wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
-       if (!wq)
-               return -ENOMEM;
-
         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
         spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
-       kvm->arch.tdp_mmu_zap_wq = wq;
-       return 1;
  }
  
  /* Arbitrarily returns true so that this may be used in if statements. */
@@ -46,20 +38,15 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
          * ultimately frees all roots.
          */
         kvm_tdp_mmu_invalidate_all_roots(kvm);
-
-       /*
-        * Destroying a workqueue also first flushes the workqueue, i.e. no
-        * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
-        */
-       destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
+       kvm_tdp_mmu_zap_invalidated_roots(kvm);
  
         WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
  
         /*
          * Ensure that all the outstanding RCU callbacks to free shadow pages
-        * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
-        * can call kvm_tdp_mmu_put_root and create new callbacks.
+        * can run before the VM is torn down.  Putting the last reference to
+        * zapped roots will create new callbacks.
          */
         rcu_barrier();
  }
@@ -86,46 +73,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
         tdp_mmu_free_sp(sp);
  }
  
-static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
-                            bool shared);
-
-static void tdp_mmu_zap_root_work(struct work_struct *work)
-{
-       struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
-                                                tdp_mmu_async_work);
-       struct kvm *kvm = root->tdp_mmu_async_data;
-
-       read_lock(&kvm->mmu_lock);
-
-       /*
-        * A TLB flush is not necessary as KVM performs a local TLB flush when
-        * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
-        * to a different pCPU.  Note, the local TLB flush on reuse also
-        * invalidates any paging-structure-cache entries, i.e. TLB entries for
-        * intermediate paging structures, that may be zapped, as such entries
-        * are associated with the ASID on both VMX and SVM.
-        */
-       tdp_mmu_zap_root(kvm, root, true);
-
-       /*
-        * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
-        * avoiding an infinite loop.  By design, the root is reachable while
-        * it's being asynchronously zapped, thus a different task can put its
-        * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
-        * asynchronously zapped root is unavoidable.
-        */
-       kvm_tdp_mmu_put_root(kvm, root, true);
-
-       read_unlock(&kvm->mmu_lock);
-}
-
-static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
-{
-       root->tdp_mmu_async_data = kvm;
-       INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
-       queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
-}
-
  void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
                           bool shared)
  {
@@ -211,11 +158,11 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
  #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)   \
         __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
  
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                  \
-       for (_root = tdp_mmu_next_root(_kvm, NULL, false, false);               \
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared)                 \
+       for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false);             \
              _root;                                                             \
-            _root = tdp_mmu_next_root(_kvm, _root, false, false))              \
-               if (!kvm_lockdep_assert_mmu_lock_held(_kvm, false)) {           \
+            _root = tdp_mmu_next_root(_kvm, _root, _shared, false))            \
+               if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) {         \
                 } else
  
  /*
@@ -296,7 +243,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
          * by a memslot update or by the destruction of the VM.  Initialize the
          * refcount to two; one reference for the vCPU, and one reference for
          * the TDP MMU itself, which is held until the root is invalidated and
-        * is ultimately put by tdp_mmu_zap_root_work().
+        * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
          */
         refcount_set(&root->tdp_mmu_root_count, 2);
  
@@ -885,7 +832,7 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
  {
         struct kvm_mmu_page *root;
  
-       for_each_tdp_mmu_root_yield_safe(kvm, root)
+       for_each_tdp_mmu_root_yield_safe(kvm, root, false)
                 flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
  
         return flush;
@@ -907,7 +854,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
          * is being destroyed or the userspace VMM has exited.  In both cases,
          * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
          */
-       for_each_tdp_mmu_root_yield_safe(kvm, root)
+       for_each_tdp_mmu_root_yield_safe(kvm, root, false)
                 tdp_mmu_zap_root(kvm, root, false);
  }
  
@@ -917,18 +864,47 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
   */
  void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
  {
-       flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
+       struct kvm_mmu_page *root;
+
+       read_lock(&kvm->mmu_lock);
+
+       for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
+               if (!root->tdp_mmu_scheduled_root_to_zap)
+                       continue;
+
+               root->tdp_mmu_scheduled_root_to_zap = false;
+               KVM_BUG_ON(!root->role.invalid, kvm);
+
+               /*
+                * A TLB flush is not necessary as KVM performs a local TLB
+                * flush when allocating a new root (see kvm_mmu_load()), and
+                * when migrating a vCPU to a different pCPU.  Note, the local
+                * TLB flush on reuse also invalidates paging-structure-cache
+                * entries, i.e. TLB entries for intermediate paging structures,
+                * that may be zapped, as such entries are associated with the
+                * ASID on both VMX and SVM.
+                */
+               tdp_mmu_zap_root(kvm, root, true);
+
+               /*
+                * The reference needs to be put *after* zapping the root, as
+                * the root must be reachable by mmu_notifiers while it's being
+                * zapped.
+                */
+               kvm_tdp_mmu_put_root(kvm, root, true);
+       }
+
+       read_unlock(&kvm->mmu_lock);
  }
  
  /*
   * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
   * is about to be zapped, e.g. in response to a memslots update.  The actual
- * zapping is performed asynchronously.  Using a separate workqueue makes it
- * easy to ensure that the destruction is performed before the "fast zap"
- * completes, without keeping a separate list of invalidated roots; the list is
- * effectively the list of work items in the workqueue.
+ * zapping is done separately so that it happens with mmu_lock held for read,
+ * whereas invalidating roots must be done with mmu_lock held for write (unless
+ * the VM is being destroyed).
   *
- * Note, the asynchronous worker is gifted the TDP MMU's reference.
+ * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
   * See kvm_tdp_mmu_get_vcpu_root_hpa().
   */
  void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
@@ -953,19 +929,20 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
         /*
          * As above, mmu_lock isn't held when destroying the VM!  There can't
          * be other references to @kvm, i.e. nothing else can invalidate roots
-        * or be consuming roots, but walking the list of roots does need to be
-        * guarded against roots being deleted by the asynchronous zap worker.
+        * or get/put references to roots.
          */
-       rcu_read_lock();
-
-       list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
+       list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+               /*
+                * Note, invalid roots can outlive a memslot update!  Invalid
+                * roots must be *zapped* before the memslot update completes,
+                * but a different task can acquire a reference and keep the
+                * root alive after it's been zapped.
+                */
                 if (!root->role.invalid) {
+                       root->tdp_mmu_scheduled_root_to_zap = true;
                         root->role.invalid = true;
-                       tdp_mmu_schedule_zap_root(kvm, root);
                 }
         }
-
-       rcu_read_unlock();
  }
  
  /*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h

index bc08895..733a3ae 100644 (file)
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -7,7 +7,7 @@
  
  #include "spte.h"
  
-int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
  
  hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 6c9c81e..9f18b06 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12308,9 +12308,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
         if (ret)
                 goto out;
  
-       ret = kvm_mmu_init_vm(kvm);
-       if (ret)
-               goto out_page_track;
+       kvm_mmu_init_vm(kvm);
  
         ret = static_call(kvm_x86_vm_init)(kvm);
         if (ret)
@@ -12355,7 +12353,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
  
  out_uninit_mmu:
         kvm_mmu_uninit_vm(kvm);
-out_page_track:
         kvm_page_track_cleanup(kvm);
  out:
         return ret;
author	Sean Christopherson <seanjc@google.com>
	Sat, 16 Sep 2023 00:39:15 +0000 (17:39 -0700)
committer	Paolo Bonzini <pbonzini@redhat.com>
	Sat, 23 Sep 2023 09:35:48 +0000 (05:35 -0400)
arch/x86/include/asm/kvm_host.h		patch \| blob \| history
arch/x86/kvm/mmu/mmu.c		patch \| blob \| history
arch/x86/kvm/mmu/mmu_internal.h		patch \| blob \| history
arch/x86/kvm/mmu/tdp_mmu.c		patch \| blob \| history
arch/x86/kvm/mmu/tdp_mmu.h		patch \| blob \| history
arch/x86/kvm/x86.c		patch \| blob \| history