KVM: x86/mmu: Zap _all_ roots when unmapping gfn range in TDP MMU
authorSean Christopherson <seanjc@google.com>
Wed, 15 Dec 2021 01:15:56 +0000 (01:15 +0000)
committerPaolo Bonzini <pbonzini@redhat.com>
Thu, 10 Feb 2022 18:47:07 +0000 (13:47 -0500)
Zap both valid and invalid roots when zapping/unmapping a gfn range, as
KVM must ensure it holds no references to the freed page after returning
from the unmap operation.  Most notably, the TDP MMU doesn't zap invalid
roots in mmu_notifier callbacks.  This leads to use-after-free and other
issues if the mmu_notifier runs to completion while an invalid root
zapper yields as KVM fails to honor the requirement that there must be
_no_ references to the page after the mmu_notifier returns.

The bug is most easily reproduced by hacking KVM to cause a collision
between set_nx_huge_pages() and kvm_mmu_notifier_release(), but the bug
exists between kvm_mmu_notifier_invalidate_range_start() and memslot
updates as well.  Invalidating a root ensures pages aren't accessible by
the guest, and KVM won't read or write page data itself, but KVM will
trigger e.g. kvm_set_pfn_dirty() when zapping SPTEs, and thus completing
a zap of an invalid root _after_ the mmu_notifier returns is fatal.

  WARNING: CPU: 24 PID: 1496 at arch/x86/kvm/../../../virt/kvm/kvm_main.c:173 [kvm]
  RIP: 0010:kvm_is_zone_device_pfn+0x96/0xa0 [kvm]
  Call Trace:
   <TASK>
   kvm_set_pfn_dirty+0xa8/0xe0 [kvm]
   __handle_changed_spte+0x2ab/0x5e0 [kvm]
   __handle_changed_spte+0x2ab/0x5e0 [kvm]
   __handle_changed_spte+0x2ab/0x5e0 [kvm]
   zap_gfn_range+0x1f3/0x310 [kvm]
   kvm_tdp_mmu_zap_invalidated_roots+0x50/0x90 [kvm]
   kvm_mmu_zap_all_fast+0x177/0x1a0 [kvm]
   set_nx_huge_pages+0xb4/0x190 [kvm]
   param_attr_store+0x70/0x100
   module_attr_store+0x19/0x30
   kernfs_fop_write_iter+0x119/0x1b0
   new_sync_write+0x11c/0x1b0
   vfs_write+0x1cc/0x270
   ksys_write+0x5f/0xe0
   do_syscall_64+0x38/0xc0
   entry_SYSCALL_64_after_hwframe+0x44/0xae
   </TASK>

Fixes: b7cccd397f31 ("KVM: x86/mmu: Fast invalidation for TDP MMU")
Cc: stable@vger.kernel.org
Cc: Ben Gardon <bgardon@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20211215011557.399940-4-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
arch/x86/kvm/mmu/tdp_mmu.c

index f71026d0512d2b7d633b9e8dc58ff782c5d3e54d..88408839d2604199f9c191ace9f477a36ff8cc66 100644 (file)
@@ -99,15 +99,18 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
 }
 
 /*
- * Finds the next valid root after root (or the first valid root if root
- * is NULL), takes a reference on it, and returns that next root. If root
- * is not NULL, this thread should have already taken a reference on it, and
- * that reference will be dropped. If no valid root is found, this
- * function will return NULL.
+ * Returns the next root after @prev_root (or the first root if @prev_root is
+ * NULL).  A reference to the returned root is acquired, and the reference to
+ * @prev_root is released (the caller obviously must hold a reference to
+ * @prev_root if it's non-NULL).
+ *
+ * If @only_valid is true, invalid roots are skipped.
+ *
+ * Returns NULL if the end of tdp_mmu_roots was reached.
  */
 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
                                              struct kvm_mmu_page *prev_root,
-                                             bool shared)
+                                             bool shared, bool only_valid)
 {
        struct kvm_mmu_page *next_root;
 
@@ -122,7 +125,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
                                                   typeof(*next_root), link);
 
        while (next_root) {
-               if (!next_root->role.invalid &&
+               if ((!only_valid || !next_root->role.invalid) &&
                    kvm_tdp_mmu_get_root(kvm, next_root))
                        break;
 
@@ -148,13 +151,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
  * mode. In the unlikely event that this thread must free a root, the lock
  * will be temporarily dropped and reacquired in write mode.
  */
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
-       for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);            \
-            _root;                                                     \
-            _root = tdp_mmu_next_root(_kvm, _root, _shared))           \
-               if (kvm_mmu_page_as_id(_root) != _as_id) {              \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
+       for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);       \
+            _root;                                                             \
+            _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))      \
+               if (kvm_mmu_page_as_id(_root) != _as_id) {                      \
                } else
 
+#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)   \
+       __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
+
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)         \
+       __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false)
+
 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)                             \
        list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,         \
                                lockdep_is_held_type(&kvm->mmu_lock, 0) ||      \
@@ -1224,7 +1233,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
 
        lockdep_assert_held_read(&kvm->mmu_lock);
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+       for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
                spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
                             slot->base_gfn + slot->npages, min_level);
 
@@ -1294,7 +1303,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
 
        lockdep_assert_held_read(&kvm->mmu_lock);
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+       for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
                spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
                                slot->base_gfn + slot->npages);
 
@@ -1419,7 +1428,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
 
        lockdep_assert_held_read(&kvm->mmu_lock);
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+       for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
                zap_collapsible_spte_range(kvm, root, slot);
 }