From b14b2690c50e02145bb867dfcde8845eb17aa8a4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 29 Apr 2022 01:04:15 +0000 Subject: [PATCH] KVM: Rename/refactor kvm_is_reserved_pfn() to kvm_pfn_to_refcounted_page() Rename and refactor kvm_is_reserved_pfn() to kvm_pfn_to_refcounted_page() to better reflect what KVM is actually checking, and to eliminate extra pfn_to_page() lookups. The kvm_release_pfn_*() an kvm_try_get_pfn() helpers in particular benefit from "refouncted" nomenclature, as it's not all that obvious why KVM needs to get/put refcounts for some PG_reserved pages (ZERO_PAGE and ZONE_DEVICE). Add a comment to call out that the list of exceptions to PG_reserved is all but guaranteed to be incomplete. The list has mostly been compiled by people throwing noodles at KVM and finding out they stick a little too well, e.g. the ZERO_PAGE's refcount overflowed and ZONE_DEVICE pages didn't get freed. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220429010416.2788472-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 15 ++++++----- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 66 ++++++++++++++++++++++++++++++++++++---------- 4 files changed, 63 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b209aaf..7ebb97c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -534,6 +534,7 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) kvm_pfn_t pfn; u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; + struct page *page; if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) @@ -549,11 +550,13 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) pfn = spte_to_pfn(old_spte); /* - * KVM does not hold the refcount of the page used by - * kvm mmu, before reclaiming the page, we should - * unmap it from mmu first. + * KVM doesn't hold a reference to any pages mapped into the guest, and + * instead uses the mmu_notifier to ensure that KVM unmaps any pages + * before they are reclaimed. Sanity check that, if the pfn is backed + * by a refcounted page, the refcount is elevated. */ - WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); + page = kvm_pfn_to_refcounted_page(pfn); + WARN_ON(page && !page_count(page)); if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); @@ -2881,7 +2884,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (unlikely(fault->max_level == PG_LEVEL_4K)) return; - if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) + if (is_error_noslot_pfn(fault->pfn) || !kvm_pfn_to_refcounted_page(fault->pfn)) return; if (kvm_slot_dirty_track_enabled(slot)) @@ -5993,7 +5996,7 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + if (sp->role.direct && kvm_pfn_to_refcounted_page(pfn) && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, pfn, PG_LEVEL_NUM)) { pte_list_remove(kvm, rmap_head, sptep); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 8e8d412..1e777b7 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1751,7 +1751,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, */ pfn = spte_to_pfn(iter.old_spte); - if (kvm_is_reserved_pfn(pfn)) + if (!kvm_pfn_to_refcounted_page(pfn)) continue; max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6fb433a..a2bbdf3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1570,7 +1570,7 @@ void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); -bool kvm_is_reserved_pfn(kvm_pfn_t pfn); +struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn); bool kvm_is_zone_device_page(struct page *page); struct kvm_irq_ack_notifier { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aa1bcd5..0419609 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -182,19 +182,36 @@ bool kvm_is_zone_device_page(struct page *page) return is_zone_device_page(page); } -bool kvm_is_reserved_pfn(kvm_pfn_t pfn) +/* + * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted + * page, NULL otherwise. Note, the list of refcounted PG_reserved page types + * is likely incomplete, it has been compiled purely through people wanting to + * back guest with a certain type of memory and encountering issues. + */ +struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn) { + struct page *page; + + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_page(pfn); + if (!PageReserved(page)) + return page; + + /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */ + if (is_zero_pfn(pfn)) + return page; + /* * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting * perspective they are "normal" pages, albeit with slightly different * usage rules. */ - if (pfn_valid(pfn)) - return PageReserved(pfn_to_page(pfn)) && - !is_zero_pfn(pfn) && - !kvm_is_zone_device_page(pfn_to_page(pfn)); + if (kvm_is_zone_device_page(page)) + return page; - return true; + return NULL; } /* @@ -2501,9 +2518,12 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) static int kvm_try_get_pfn(kvm_pfn_t pfn) { - if (kvm_is_reserved_pfn(pfn)) + struct page *page = kvm_pfn_to_refcounted_page(pfn); + + if (!page) return 1; - return get_page_unless_zero(pfn_to_page(pfn)); + + return get_page_unless_zero(page); } static int hva_to_pfn_remapped(struct vm_area_struct *vma, @@ -2728,6 +2748,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); */ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { + struct page *page; kvm_pfn_t pfn; pfn = gfn_to_pfn(kvm, gfn); @@ -2735,10 +2756,11 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) if (is_error_noslot_pfn(pfn)) return KVM_ERR_PTR_BAD_PAGE; - if (kvm_is_reserved_pfn(pfn)) + page = kvm_pfn_to_refcounted_page(pfn); + if (!page) return KVM_ERR_PTR_BAD_PAGE; - return pfn_to_page(pfn); + return page; } EXPORT_SYMBOL_GPL(gfn_to_page); @@ -2841,8 +2863,16 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean); void kvm_release_pfn_clean(kvm_pfn_t pfn) { - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) - kvm_release_page_clean(pfn_to_page(pfn)); + struct page *page; + + if (is_error_noslot_pfn(pfn)) + return; + + page = kvm_pfn_to_refcounted_page(pfn); + if (!page) + return; + + kvm_release_page_clean(page); } EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); @@ -2857,8 +2887,16 @@ EXPORT_SYMBOL_GPL(kvm_release_page_dirty); void kvm_release_pfn_dirty(kvm_pfn_t pfn) { - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) - kvm_release_page_dirty(pfn_to_page(pfn)); + struct page *page; + + if (is_error_noslot_pfn(pfn)) + return; + + page = kvm_pfn_to_refcounted_page(pfn); + if (!page) + return; + + kvm_release_page_dirty(page); } EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); -- 2.7.4