KVM: Block memslot updates across range_start() and range_end()

author Paolo Bonzini <pbonzini@redhat.com>
Thu, 27 May 2021 12:09:15 +0000 (08:09 -0400)
committer Paolo Bonzini <pbonzini@redhat.com>
Tue, 3 Aug 2021 07:44:03 +0000 (03:44 -0400)
diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst

index 35eca377543dfd19e4f0b1ad8632399061094b22..8138201efb0919190ea4c8657fe014025810d9ea 100644 (file)
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -21,6 +21,12 @@ The acquisition orders for mutexes are as follows:
    can be taken inside a kvm->srcu read-side critical section,
    while kvm->slots_lock cannot.
  
+- kvm->mn_active_invalidate_count ensures that pairs of
+  invalidate_range_start() and invalidate_range_end() callbacks
+  use the same memslots array.  kvm->slots_lock and kvm->slots_arch_lock
+  are taken on the waiting side in install_new_memslots, so MMU notifiers
+  must not take either kvm->slots_lock or kvm->slots_arch_lock.
+
  On x86:
  
  - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h

index de58a0890b1ae878442c018acce661eaea14968d..5b6a69caccb58c81c197731e478d374724943828 100644 (file)
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -548,6 +548,11 @@ struct kvm {
         struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
         struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
  
+       /* Used to wait for completion of MMU notifiers.  */
+       spinlock_t mn_invalidate_lock;
+       unsigned long mn_active_invalidate_count;
+       struct rcuwait mn_memslots_update_rcuwait;
+
         /*
          * created_vcpus is protected by kvm->lock, and is incremented
          * at the beginning of KVM_CREATE_VCPU.  online_vcpus is only
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c

index 5cc79373827fa0794ab364e9a6510837ca09c788..8f9024d658666c277c02a6c0562a2322f43d6896 100644 (file)
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -604,11 +604,9 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
         trace_kvm_set_spte_hva(address);
  
         /*
-        * .change_pte() must be surrounded by .invalidate_range_{start,end}(),
-        * and so always runs with an elevated notifier count.  This obviates
-        * the need to bump the sequence count.
+        * .change_pte() must be surrounded by .invalidate_range_{start,end}().
          */
-       WARN_ON_ONCE(!kvm->mmu_notifier_count);
+       WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
  
         kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
  }
@@ -658,6 +656,18 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
  
         trace_kvm_unmap_hva_range(range->start, range->end);
  
+       /*
+        * Prevent memslot modification between range_start() and range_end()
+        * so that conditional locking yields the same result in both
+        * functions.  Without that guarantee, the mmu_notifier_count
+        * adjustments would be imbalanced.
+        *
+        * Pairs with the decrement in range_end().
+        */
+       spin_lock(&kvm->mn_invalidate_lock);
+       kvm->mn_active_invalidate_count++;
+       spin_unlock(&kvm->mn_invalidate_lock);
+
         __kvm_handle_hva_range(kvm, &hva_range);
  
         return 0;
@@ -694,9 +704,22 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
                 .flush_on_ret   = false,
                 .may_block      = mmu_notifier_range_blockable(range),
         };
+       bool wake;
  
         __kvm_handle_hva_range(kvm, &hva_range);
  
+       /* Pairs with the increment in range_start(). */
+       spin_lock(&kvm->mn_invalidate_lock);
+       wake = (--kvm->mn_active_invalidate_count == 0);
+       spin_unlock(&kvm->mn_invalidate_lock);
+
+       /*
+        * There can only be one waiter, since the wait happens under
+        * slots_lock.
+        */
+       if (wake)
+               rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+
         BUG_ON(kvm->mmu_notifier_count < 0);
  }
  
@@ -977,6 +1000,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
         mutex_init(&kvm->irq_lock);
         mutex_init(&kvm->slots_lock);
         mutex_init(&kvm->slots_arch_lock);
+       spin_lock_init(&kvm->mn_invalidate_lock);
+       rcuwait_init(&kvm->mn_memslots_update_rcuwait);
+
         INIT_LIST_HEAD(&kvm->devices);
  
         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@ -1099,6 +1125,16 @@ static void kvm_destroy_vm(struct kvm *kvm)
         kvm_coalesced_mmio_free(kvm);
  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
         mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+       /*
+        * At this point, pending calls to invalidate_range_start()
+        * have completed but no more MMU notifiers will run, so
+        * mn_active_invalidate_count may remain unbalanced.
+        * No threads can be waiting in install_new_memslots as the
+        * last reference on KVM has been dropped, but freeing
+        * memslots would deadlock without this manual intervention.
+        */
+       WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
+       kvm->mn_active_invalidate_count = 0;
  #else
         kvm_arch_flush_shadow_all(kvm);
  #endif
@@ -1360,7 +1396,21 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
         slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
  
+       /*
+        * Do not store the new memslots while there are invalidations in
+        * progress (preparatory change for the next commit).
+        */
+       spin_lock(&kvm->mn_invalidate_lock);
+       prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
+       while (kvm->mn_active_invalidate_count) {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               spin_unlock(&kvm->mn_invalidate_lock);
+               schedule();
+               spin_lock(&kvm->mn_invalidate_lock);
+       }
+       finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
         rcu_assign_pointer(kvm->memslots[as_id], slots);
+       spin_unlock(&kvm->mn_invalidate_lock);
  
         /*
          * Acquired in kvm_set_memslot. Must be released before synchronize
author	Paolo Bonzini <pbonzini@redhat.com>
	Thu, 27 May 2021 12:09:15 +0000 (08:09 -0400)
committer	Paolo Bonzini <pbonzini@redhat.com>
	Tue, 3 Aug 2021 07:44:03 +0000 (03:44 -0400)
Documentation/virt/kvm/locking.rst		patch \| blob \| history
include/linux/kvm_host.h		patch \| blob \| history
virt/kvm/kvm_main.c		patch \| blob \| history