KVM: x86/mmu: Split huge pages mapped by the TDP MMU when dirty logging is enabled

author David Matlack <dmatlack@google.com>

Wed, 19 Jan 2022 23:07:36 +0000 (23:07 +0000)

committer Paolo Bonzini <pbonzini@redhat.com>

Thu, 10 Feb 2022 18:50:42 +0000 (13:50 -0500)
author David Matlack <dmatlack@google.com>
Wed, 19 Jan 2022 23:07:36 +0000 (23:07 +0000)
committer Paolo Bonzini <pbonzini@redhat.com>
Thu, 10 Feb 2022 18:50:42 +0000 (13:50 -0500)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt

index f5a27f0..5d80a0f 100644 (file)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2339,6 +2339,30 @@
         kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
                         Default is 0 (don't ignore, but inject #GP)
  
+       kvm.eager_page_split=
+                       [KVM,X86] Controls whether or not KVM will try to
+                       proactively split all huge pages during dirty logging.
+                       Eager page splitting reduces interruptions to vCPU
+                       execution by eliminating the write-protection faults
+                       and MMU lock contention that would otherwise be
+                       required to split huge pages lazily.
+
+                       VM workloads that rarely perform writes or that write
+                       only to a small region of VM memory may benefit from
+                       disabling eager page splitting to allow huge pages to
+                       still be used for reads.
+
+                       The behavior of eager page splitting depends on whether
+                       KVM_DIRTY_LOG_INITIALLY_SET is enabled or disabled. If
+                       disabled, all huge pages in a memslot will be eagerly
+                       split when dirty logging is enabled on that memslot. If
+                       enabled, huge pages will not be eagerly split.
+
+                       Eager page splitting currently only supports splitting
+                       huge pages mapped by the TDP MMU.
+
+                       Default is Y (on).
+
         kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
                                    Default is false (don't support).
  
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index 4b7eee9..8bfb069 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1587,6 +1587,9 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
                                       const struct kvm_memory_slot *memslot,
                                       int start_level);
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+                                      const struct kvm_memory_slot *memslot,
+                                      int target_level);
  void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                    const struct kvm_memory_slot *memslot);
  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c

index a75e4ae..308c8b2 100644 (file)
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5830,6 +5830,30 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
  }
  
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+                                      const struct kvm_memory_slot *memslot,
+                                      int target_level)
+{
+       u64 start = memslot->base_gfn;          /* first gfn of the memslot */
+       u64 end = start + memslot->npages;      /* one past the memslot's last gfn */
+
+       if (is_tdp_mmu_enabled(kvm)) {  /* nothing is split when the TDP MMU is off */
+               read_lock(&kvm->mmu_lock);
+               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+               read_unlock(&kvm->mmu_lock);
+       }
+
+       /*
+        * This function is a best-effort pass over the whole memslot: the
+        * split is attempted with mmu_lock held for read, and no status is
+        * reported back to the caller (the function returns void).
+        *
+        * No TLB flush is necessary here. KVM will flush TLBs after
+        * write-protecting and/or clearing dirty on the newly split SPTEs to
+        * ensure that guest writes are reflected in the dirty log before the
+        * ioctl to enable dirty logging on this memslot completes. Since the
+        * split SPTEs retain the write and dirty bits of the huge SPTE, it is
+        * safe for KVM to decide if a TLB flush is necessary based on the split
+        * SPTEs.
+        */
+}
+
  static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
                                          struct kvm_rmap_head *rmap_head,
                                          const struct kvm_memory_slot *slot)
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c

index 56411cf..4739b53 100644 (file)
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -192,6 +192,65 @@ out:
         return wrprot;
  }
  
+static u64 make_spte_executable(u64 spte)
+{
+       bool is_access_track = is_access_track_spte(spte);
+
+       if (is_access_track)    /* edit the restored form; re-marked below */
+               spte = restore_acc_track_spte(spte);
+
+       spte &= ~shadow_nx_mask;        /* clear the bits in shadow_nx_mask ... */
+       spte |= shadow_x_mask;          /* ... and set the bits in shadow_x_mask */
+
+       if (is_access_track)    /* return the SPTE in the same form it came in */
+               spte = mark_spte_for_access_track(spte);
+
+       return spte;
+}
+
+/*
+ * Construct an SPTE that maps a sub-page of the given huge page SPTE where
+ * `index` identifies which sub-page.
+ *
+ * This is used during huge page splitting to build the SPTEs that make up the
+ * new page table.
+ *
+ * @huge_spte:  the SPTE being split. It must be shadow-present and large;
+ *              otherwise a one-time warning fires and 0 is returned.
+ * @huge_level: the level huge_spte is mapped at. The child SPTE is built for
+ *              huge_level - 1.
+ * @index:      which entry of the new lower-level page table to build.
+ *
+ * The child starts out as a copy of huge_spte, so every bit that is not
+ * explicitly modified below is carried over unchanged. PT_PAGE_SIZE_MASK is
+ * cleared only when the child lands at PG_LEVEL_4K; at any higher level the
+ * child keeps the value huge_spte had.
+ */
+u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
+{
+       u64 child_spte;
+       int child_level;
+
+       if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
+               return 0;
+
+       if (WARN_ON_ONCE(!is_large_pte(huge_spte)))
+               return 0;
+
+       child_spte = huge_spte;
+       child_level = huge_level - 1;
+
+       /*
+        * The child_spte already has the base address of the huge page being
+        * split. So we just have to OR in the offset to the page at the next
+        * lower level for the given index.
+        *
+        * index * KVM_PAGES_PER_HPAGE(child_level) is shifted left by
+        * PAGE_SHIFT to turn it into an address offset. NOTE(review): ORing
+        * (rather than adding) assumes those offset bits are clear in
+        * huge_spte, i.e. that the huge page's base address is aligned to the
+        * huge page size - confirm.
+        */
+       child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
+
+       if (child_level == PG_LEVEL_4K) {
+               child_spte &= ~PT_PAGE_SIZE_MASK;
+
+               /*
+                * When splitting to a 4K page, mark the page executable as the
+                * NX hugepage mitigation no longer applies.
+                */
+               if (is_nx_huge_page_enabled())
+                       child_spte = make_spte_executable(child_spte);
+       }
+
+       return child_spte;
+}
+
+
  u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
  {
         u64 spte = SPTE_MMU_PRESENT_MASK;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h

index 7a219c3..73f1261 100644 (file)
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -415,6 +415,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
                u64 old_spte, bool prefetch, bool can_unsync,
                bool host_writable, u64 *new_spte);
+u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index);
  u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
  u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
  u64 mark_spte_for_access_track(u64 spte);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c

index 4c9a98a..6dfd6db 100644 (file)
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1257,6 +1257,179 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
         return spte_set;
  }
  
+static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+{
+       struct kvm_mmu_page *sp;
+
+       gfp |= __GFP_ZERO;      /* applies to both allocations below */
+
+       sp = kmem_cache_alloc(mmu_page_header_cache, gfp);      /* the kvm_mmu_page header */
+       if (!sp)
+               return NULL;
+
+       sp->spt = (void *)__get_free_page(gfp); /* one page for the page table itself */
+       if (!sp->spt) {
+               kmem_cache_free(mmu_page_header_cache, sp);     /* do not leak the header */
+               return NULL;
+       }
+
+       return sp;      /* a NULL return above means either allocation failed */
+}
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
+                                                      struct tdp_iter *iter)
+{
+       struct kvm_mmu_page *sp;
+
+       lockdep_assert_held_read(&kvm->mmu_lock);       /* caller holds mmu_lock for read */
+
+       /*
+        * Since we are allocating while under the MMU lock we have to be
+        * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
+        * reclaim and to avoid making any filesystem callbacks (which can end
+        * up invoking KVM MMU notifiers, resulting in a deadlock).
+        *
+        * If this allocation fails we drop the lock and retry with reclaim
+        * allowed.
+        *
+        * Contract with the caller:
+        *  - The fallback path also leaves and re-enters the RCU read-side
+        *    section, so the caller is expected to be inside rcu_read_lock().
+        *  - iter->yielded is set whenever the locks were dropped, even if the
+        *    second allocation fails as well.
+        *  - NULL is returned only when both attempts fail.
+        *  - On return mmu_lock is held for read and the RCU read-side section
+        *    is active again, regardless of the outcome.
+        */
+       sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
+       if (sp)
+               return sp;
+
+       rcu_read_unlock();
+       read_unlock(&kvm->mmu_lock);
+
+       iter->yielded = true;   /* tell the caller the locks were dropped */
+       sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
+
+       read_lock(&kvm->mmu_lock);      /* re-acquire in the reverse order of release */
+       rcu_read_lock();
+
+       return sp;
+}
+
+static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
+                                         struct tdp_iter *iter,
+                                         struct kvm_mmu_page *sp)
+{
+       const u64 huge_spte = iter->old_spte;   /* the iterator's view of the huge SPTE */
+       const int level = iter->level;          /* its level; children go one level down */
+       int ret, i;
+
+       tdp_mmu_init_child_sp(sp, iter);
+
+       /*
+        * No need for atomics when writing to sp->spt since the page table has
+        * not been linked in yet and thus is not reachable from any other CPU.
+        *
+        * Every one of the PT64_ENT_PER_PAGE entries is filled in, each
+        * derived from the same huge_spte with a different index.
+        */
+       for (i = 0; i < PT64_ENT_PER_PAGE; i++)
+               sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
+
+       /*
+        * Replace the huge spte with a pointer to the populated lower level
+        * page table. Since we are making this change without a TLB flush vCPUs
+        * will see a mix of the split mappings and the original huge mapping,
+        * depending on what's currently in their TLB. This is fine from a
+        * correctness standpoint since the translation will be the same either
+        * way.
+        *
+        * If linking fails the error is handed back unchanged and sp is not
+        * consumed: the caller keeps it and either retries or frees it.
+        */
+       ret = tdp_mmu_link_sp_atomic(kvm, iter, sp, false);
+       if (ret)
+               return ret;
+
+       /*
+        * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
+        * are overwriting from the page stats. But we have to manually update
+        * the page stats with the new present child pages.
+        */
+       kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
+
+       return 0;       /* success: sp is now linked into the page tables */
+}
+
+static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
+                                        struct kvm_mmu_page *root,
+                                        gfn_t start, gfn_t end,
+                                        int target_level)
+{
+       struct kvm_mmu_page *sp = NULL;         /* spare page table, kept across iterations */
+       struct tdp_iter iter;
+       int ret = 0;                            /* 0, or -ENOMEM if allocation fails */
+
+       rcu_read_lock();
+
+       /*
+        * Traverse the page table splitting all huge pages above the target
+        * level into one lower level. For example, if we encounter a 1GB page
+        * we split it into 512 2MB pages.
+        *
+        * Since the TDP iterator uses a pre-order traversal, we are guaranteed
+        * to visit an SPTE before ever visiting its children, which means we
+        * will correctly recursively split huge pages that are more than one
+        * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
+        * and then splitting each of those to 512 4KB pages).
+        *
+        * target_level + 1 is passed as the iterator's minimum level, and the
+        * walk covers the gfn range given by start and end.
+        *
+        * sp is allocated lazily, only once a huge SPTE is actually found, and
+        * is reset to NULL after a successful split so that the next huge SPTE
+        * triggers a fresh allocation. A failed split keeps sp for the retry.
+        */
+       for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
+retry:
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+                       continue;
+
+               if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
+                       continue;       /* only present huge SPTEs need splitting */
+
+               if (!sp) {
+                       sp = tdp_mmu_alloc_sp_for_split(kvm, &iter);
+                       if (!sp) {
+                               ret = -ENOMEM;
+                               break;
+                       }
+
+                       if (iter.yielded)       /* locks were dropped; sp is kept for later */
+                               continue;
+               }
+
+               if (tdp_mmu_split_huge_page_atomic(kvm, &iter, sp))
+                       goto retry;     /* link failed: re-examine this entry, reusing sp */
+
+               sp = NULL;      /* consumed: sp is now linked into the page tables */
+       }
+
+       rcu_read_unlock();
+
+       /*
+        * It's possible to exit the loop having never used the last sp if, for
+        * example, a vCPU doing HugePage NX splitting wins the race and
+        * installs its own sp in place of the last sp we tried to split.
+        *
+        * The same happens when sp was allocated after dropping the locks and
+        * no further huge SPTE was found before the walk ended.
+        */
+       if (sp)
+               tdp_mmu_free_sp(sp);
+
+
+       return ret;
+}
+
+/*
+ * Try to split all huge pages mapped by the TDP MMU down to the target level.
+ *
+ * Each root that for_each_tdp_mmu_root_yield_safe() yields for slot->as_id is
+ * walked over the gfn range given by start and end. The caller must hold
+ * mmu_lock for read.
+ *
+ * This is best effort ("try"): the walk stops at the first root that reports
+ * an error, and that error is not propagated to the caller.
+ *
+ * NOTE(review): the explicit kvm_tdp_mmu_put_root() before the break
+ * presumably drops the reference the iterator still holds on the current root
+ * when the loop is left early - confirm against the iterator macro.
+ */
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+                                     const struct kvm_memory_slot *slot,
+                                     gfn_t start, gfn_t end,
+                                     int target_level)
+{
+       struct kvm_mmu_page *root;
+       int r = 0;
+
+       lockdep_assert_held_read(&kvm->mmu_lock);
+
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) {
+               r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level);
+               if (r) {
+                       kvm_tdp_mmu_put_root(kvm, root, true);
+                       break;
+               }
+       }
+}
+
  /*
   * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
   * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h

index 6b9bdd6..fdb3a88 100644 (file)
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -67,6 +67,11 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                    struct kvm_memory_slot *slot, gfn_t gfn,
                                    int min_level);
  
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+                                     const struct kvm_memory_slot *slot,
+                                     gfn_t start, gfn_t end,
+                                     int target_level);
+
  static inline void kvm_tdp_mmu_walk_lockless_begin(void)
  {
         rcu_read_lock();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index a356b8a..ffef31f 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -192,6 +192,9 @@ bool __read_mostly enable_pmu = true;
  EXPORT_SYMBOL_GPL(enable_pmu);
  module_param(enable_pmu, bool, 0444);
  
+static bool __read_mostly eager_page_split = true;      /* kvm.eager_page_split; on by default */
+module_param(eager_page_split, bool, 0644);             /* mode 0644: owner may change it at runtime */
+
  /*
   * Restoring the host value for MSRs that are only consumed when running in
   * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -11970,6 +11973,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
                         return;
  
+               if (READ_ONCE(eager_page_split))
+                       kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
+
                 if (kvm_x86_ops.cpu_dirty_log_size) {
                         kvm_mmu_slot_leaf_clear_dirty(kvm, new);
                         kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
author	David Matlack <dmatlack@google.com>
	Wed, 19 Jan 2022 23:07:36 +0000 (23:07 +0000)
committer	Paolo Bonzini <pbonzini@redhat.com>
	Thu, 10 Feb 2022 18:50:42 +0000 (13:50 -0500)
Documentation/admin-guide/kernel-parameters.txt		patch \| blob \| history
arch/x86/include/asm/kvm_host.h		patch \| blob \| history
arch/x86/kvm/mmu/mmu.c		patch \| blob \| history
arch/x86/kvm/mmu/spte.c		patch \| blob \| history
arch/x86/kvm/mmu/spte.h		patch \| blob \| history
arch/x86/kvm/mmu/tdp_mmu.c		patch \| blob \| history
arch/x86/kvm/mmu/tdp_mmu.h		patch \| blob \| history
arch/x86/kvm/x86.c		patch \| blob \| history