1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5 #include "mmu_internal.h"
11 #include <asm/cmpxchg.h>
12 #include <trace/events/kvm.h>
14 /* Initializes the TDP MMU for the VM, if enabled. */
15 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
17 struct workqueue_struct *wq;
19 wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
23 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
24 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
25 kvm->arch.tdp_mmu_zap_wq = wq;
29 /* Arbitrarily returns true so that this may be used in if statements. */
30 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
34 lockdep_assert_held_read(&kvm->mmu_lock);
36 lockdep_assert_held_write(&kvm->mmu_lock);
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
44	 * Invalidate all roots, which, besides the obvious, schedules all roots
45 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
46 * ultimately frees all roots.
48 kvm_tdp_mmu_invalidate_all_roots(kvm);
51 * Destroying a workqueue also first flushes the workqueue, i.e. no
52 * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
54 destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
56 WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
57 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
60 * Ensure that all the outstanding RCU callbacks to free shadow pages
61 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
62 * can call kvm_tdp_mmu_put_root and create new callbacks.
67 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
69 free_page((unsigned long)sp->spt);
70 kmem_cache_free(mmu_page_header_cache, sp);
74 * This is called through call_rcu in order to free TDP page table memory
75	 * safely with respect to other kernel threads that may be operating on the memory.
77 * By only accessing TDP MMU page table memory in an RCU read critical
78 * section, and freeing it after a grace period, lockless access to that
79 * memory won't use it after it is freed.
81 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
83	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, rcu_head);
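/*
 * Illustrative sketch, not part of tdp_mmu.c: the bare call_rcu() +
 * container_of() pattern that tdp_mmu_free_sp_rcu_callback() follows.  The
 * "demo_*" names are hypothetical and this is a schematic kernel-style
 * fragment, not a drop-in implementation.
 */
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_node {
	struct list_head link;		/* lives on an RCU-protected list */
	struct rcu_head rcu_head;	/* embedded so the callback can recover the node */
	unsigned long payload;
};

static void demo_node_free_rcu(struct rcu_head *head)
{
	struct demo_node *node = container_of(head, struct demo_node, rcu_head);

	kfree(node);
}

static void demo_node_remove(struct demo_node *node)
{
	/*
	 * Unlink first so new readers can't find the node, then defer the
	 * actual free until a grace period has elapsed; readers that already
	 * hold a pointer from within rcu_read_lock() stay safe.
	 */
	list_del_rcu(&node->link);
	call_rcu(&node->rcu_head, demo_node_free_rcu);
}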
89 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
92 static void tdp_mmu_zap_root_work(struct work_struct *work)
94 struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
96 struct kvm *kvm = root->tdp_mmu_async_data;
98 read_lock(&kvm->mmu_lock);
101 * A TLB flush is not necessary as KVM performs a local TLB flush when
102	 * allocating a new root (see kvm_mmu_load()), and when migrating a vCPU
103 * to a different pCPU. Note, the local TLB flush on reuse also
104 * invalidates any paging-structure-cache entries, i.e. TLB entries for
105 * intermediate paging structures, that may be zapped, as such entries
106 * are associated with the ASID on both VMX and SVM.
108 tdp_mmu_zap_root(kvm, root, true);
111 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
112 * avoiding an infinite loop. By design, the root is reachable while
113 * it's being asynchronously zapped, thus a different task can put its
114 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
115 * asynchronously zapped root is unavoidable.
117 kvm_tdp_mmu_put_root(kvm, root, true);
119 read_unlock(&kvm->mmu_lock);
122 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
124 root->tdp_mmu_async_data = kvm;
125 INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
126 queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
129 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
132 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
134 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
138 * The TDP MMU itself holds a reference to each root until the root is
139	 * explicitly invalidated, i.e. the final reference should never be
140 * put for a valid root.
142 KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
144 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
145 list_del_rcu(&root->link);
146 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
147 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
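/*
 * Illustrative userspace analogue, not part of tdp_mmu.c: the reference
 * counting scheme above reduced to its core, i.e. refcount_dec_and_test()
 * style "last one out performs the teardown" semantics plus a tryget that
 * refuses to revive an object whose count already hit zero (the role
 * kvm_tdp_mmu_get_root() plays for the iterators below).  All "demo_*"
 * names are hypothetical and exist only for this sketch.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct demo_root {
	atomic_uint refcount;
	/* ... payload ... */
};

/* Analogue of kvm_tdp_mmu_get_root(): increment only if the count is non-zero. */
static bool demo_root_tryget(struct demo_root *root)
{
	unsigned int old = atomic_load(&root->refcount);

	do {
		if (old == 0)
			return false;	/* already being torn down, don't revive it */
	} while (!atomic_compare_exchange_weak(&root->refcount, &old, old + 1));

	return true;
}

/* Analogue of kvm_tdp_mmu_put_root(): only the final put performs teardown. */
static void demo_root_put(struct demo_root *root)
{
	/* fetch_sub returns the value *before* the decrement. */
	if (atomic_fetch_sub(&root->refcount, 1) != 1)
		return;

	/* Last reference dropped: unlink and free (deferred via RCU above). */
	free(root);
}

static struct demo_root *demo_root_alloc(void)
{
	struct demo_root *root = calloc(1, sizeof(*root));

	if (!root)
		return NULL;
	/*
	 * Mirror of kvm_tdp_mmu_get_vcpu_root_hpa(): start the count at two,
	 * one reference for the immediate user and one held by the owning
	 * structure until the object is explicitly invalidated.
	 */
	atomic_store(&root->refcount, 2);
	return root;
}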
151 * Returns the next root after @prev_root (or the first root if @prev_root is
152 * NULL). A reference to the returned root is acquired, and the reference to
153 * @prev_root is released (the caller obviously must hold a reference to
154 * @prev_root if it's non-NULL).
156 * If @only_valid is true, invalid roots are skipped.
158 * Returns NULL if the end of tdp_mmu_roots was reached.
160 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
161 struct kvm_mmu_page *prev_root,
162 bool shared, bool only_valid)
164 struct kvm_mmu_page *next_root;
169 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
171 typeof(*prev_root), link);
173 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
174 typeof(*next_root), link);
177 if ((!only_valid || !next_root->role.invalid) &&
178 kvm_tdp_mmu_get_root(next_root))
181 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
182 &next_root->link, typeof(*next_root), link);
188 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
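/*
 * Continuing the userspace sketch above (hypothetical "demo_*" names): the
 * hand-over-hand referencing that tdp_mmu_next_root() performs.  The next
 * element is pinned *before* the previous one is released, so the caller
 * always holds at least one reference and can drop locks between iterations
 * without the list walking out from under it.
 */
struct demo_root_node {
	struct demo_root_node *next;
	struct demo_root root;		/* the refcounted object from the sketch above */
};

static struct demo_root_node *demo_list_head;	/* assumed protected elsewhere */

static struct demo_root_node *demo_next_root(struct demo_root_node *prev)
{
	struct demo_root_node *next = prev ? prev->next : demo_list_head;

	/* Skip entries whose refcount already dropped to zero. */
	while (next && !demo_root_tryget(&next->root))
		next = next->next;

	if (prev)
		demo_root_put(&prev->root);	/* safe: 'next' (if any) is already pinned */

	return next;
}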
194 * Note: this iterator gets and puts references to the roots it iterates over.
195 * This makes it safe to release the MMU lock and yield within the loop, but
196 * if exiting the loop early, the caller must drop the reference to the most
197 * recent root. (Unless keeping a live reference is desirable.)
199 * If shared is set, this function is operating under the MMU lock in read
200 * mode. In the unlikely event that this thread must free a root, the lock
201 * will be temporarily dropped and reacquired in write mode.
203 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
204 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
206 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
207 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
208 kvm_mmu_page_as_id(_root) != _as_id) { \
211 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
212 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
214 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
215 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
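/*
 * Illustrative userspace sketch, not part of tdp_mmu.c, of the two macro
 * tricks used above: an assertion helper that "arbitrarily returns true" so
 * it can sit inside an if-condition, and the trailing "{ skip } else" that
 * filters entries while leaving the caller's loop body as the else-branch.
 * All names here are hypothetical.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static inline bool demo_assert_locked(bool locked)
{
	assert(locked);		/* stand-in for the lockdep assertion */
	return true;		/* constant true so the macro can chain it with && */
}

#define demo_for_each_even(_val, _arr, _len, _i)					\
	for ((_i) = 0; (_i) < (_len) && (((_val) = (_arr)[(_i)]), true); (_i)++)	\
		if (demo_assert_locked(true) && ((_val) % 2 != 0)) {			\
			/* filtered out: odd entries are skipped */			\
		} else

int main(void)
{
	int arr[] = { 1, 2, 3, 4, 5, 6 };
	int val, i;

	demo_for_each_even(val, arr, 6, i)
		printf("even entry: %d\n", val);	/* this statement is the "else" body */

	return 0;
}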
218 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
219 * the implication being that any flow that holds mmu_lock for read is
220 * inherently yield-friendly and should use the yield-safe variant above.
221 * Holding mmu_lock for write obviates the need for RCU protection as the list
222 * is guaranteed to be stable.
224 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
225 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
226 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
227 kvm_mmu_page_as_id(_root) != _as_id) { \
230 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
232 struct kvm_mmu_page *sp;
234 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
235 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
240 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
241 gfn_t gfn, union kvm_mmu_page_role role)
243 INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
245 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
250 sp->tdp_mmu_page = true;
252 trace_kvm_mmu_get_page(sp, true);
255 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
256 struct tdp_iter *iter)
258 struct kvm_mmu_page *parent_sp;
259 union kvm_mmu_page_role role;
261 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
263 role = parent_sp->role;
266 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
269 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
271 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
272 struct kvm *kvm = vcpu->kvm;
273 struct kvm_mmu_page *root;
275 lockdep_assert_held_write(&kvm->mmu_lock);
278 * Check for an existing root before allocating a new one. Note, the
279 * role check prevents consuming an invalid root.
281 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
282 if (root->role.word == role.word &&
283 kvm_tdp_mmu_get_root(root))
287 root = tdp_mmu_alloc_sp(vcpu);
288 tdp_mmu_init_sp(root, NULL, 0, role);
291 * TDP MMU roots are kept until they are explicitly invalidated, either
292 * by a memslot update or by the destruction of the VM. Initialize the
293 * refcount to two; one reference for the vCPU, and one reference for
294 * the TDP MMU itself, which is held until the root is invalidated and
295 * is ultimately put by tdp_mmu_zap_root_work().
297 refcount_set(&root->tdp_mmu_root_count, 2);
299 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
300 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
301 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
304 return __pa(root->spt);
307 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
308 u64 old_spte, u64 new_spte, int level,
311 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
313 kvm_account_pgtable_pages((void *)sp->spt, +1);
314 atomic64_inc(&kvm->arch.tdp_mmu_pages);
317 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
319 kvm_account_pgtable_pages((void *)sp->spt, -1);
320 atomic64_dec(&kvm->arch.tdp_mmu_pages);
324 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
327 * @sp: the page to be removed
328 * @shared: This operation may not be running under the exclusive use of
329 * the MMU lock and the operation must synchronize with other
330 * threads that might be adding or removing pages.
332 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
335 tdp_unaccount_mmu_page(kvm, sp);
337 if (!sp->nx_huge_page_disallowed)
341 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
343 lockdep_assert_held_write(&kvm->mmu_lock);
345 sp->nx_huge_page_disallowed = false;
346 untrack_possible_nx_huge_page(kvm, sp);
349 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
353 * handle_removed_pt() - handle a page table removed from the TDP structure
356 * @pt: the page removed from the paging structure
357 * @shared: This operation may not be running under the exclusive use
358 * of the MMU lock and the operation must synchronize with other
359 * threads that might be modifying SPTEs.
361 * Given a page table that has been removed from the TDP paging structure,
362 * iterates through the page table to clear SPTEs and free child page tables.
364 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
365 * protection. Since this thread removed it from the paging structure,
366 * this thread will be responsible for ensuring the page is freed. Hence the
367 * early rcu_dereferences in the function.
369 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
371 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
372 int level = sp->role.level;
373 gfn_t base_gfn = sp->gfn;
376 trace_kvm_mmu_prepare_zap_page(sp);
378 tdp_mmu_unlink_sp(kvm, sp, shared);
380 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
381 tdp_ptep_t sptep = pt + i;
382 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
387 * Set the SPTE to a nonpresent value that other
388 * threads will not overwrite. If the SPTE was
389 * already marked as removed then another thread
390 * handling a page fault could overwrite it, so
391	 * keep rewriting the SPTE until the value being
392	 * replaced is something other than the removed SPTE value.
395 old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
396 if (!is_removed_spte(old_spte))
402 * If the SPTE is not MMU-present, there is no backing
403 * page associated with the SPTE and so no side effects
404 * that need to be recorded, and exclusive ownership of
405 * mmu_lock ensures the SPTE can't be made present.
406 * Note, zapping MMIO SPTEs is also unnecessary as they
407	 * are guarded by the memslots generation, not by being unreachable.
410 old_spte = kvm_tdp_mmu_read_spte(sptep);
411 if (!is_shadow_present_pte(old_spte))
415 * Use the common helper instead of a raw WRITE_ONCE as
416 * the SPTE needs to be updated atomically if it can be
417 * modified by a different vCPU outside of mmu_lock.
418 * Even though the parent SPTE is !PRESENT, the TLB
419 * hasn't yet been flushed, and both Intel and AMD
420 * document that A/D assists can use upper-level PxE
421 * entries that are cached in the TLB, i.e. the CPU can
422 * still access the page and mark it dirty.
424 * No retry is needed in the atomic update path as the
425 * sole concern is dropping a Dirty bit, i.e. no other
426 * task can zap/remove the SPTE as mmu_lock is held for
427 * write. Marking the SPTE as a removed SPTE is not
428 * strictly necessary for the same reason, but using
429	 * the removed SPTE value keeps the shared/exclusive
430 * paths consistent and allows the handle_changed_spte()
431 * call below to hardcode the new value to REMOVED_SPTE.
433 * Note, even though dropping a Dirty bit is the only
434 * scenario where a non-atomic update could result in a
435 * functional bug, simply checking the Dirty bit isn't
436 * sufficient as a fast page fault could read the upper
437 * level SPTE before it is zapped, and then make this
438 * target SPTE writable, resume the guest, and set the
439	 * Dirty bit between reading the SPTE above and writing it here.
442 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
443 REMOVED_SPTE, level);
445 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
446 old_spte, REMOVED_SPTE, level, shared);
449 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
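/*
 * Illustrative userspace analogue, not part of tdp_mmu.c, of the freeze step
 * above: atomically exchange the entry for a reserved "removed" marker and
 * retry while the previous value was already the marker, so that exactly one
 * thread observes (and therefore handles the side effects of) the real old
 * value.  DEMO_REMOVED and demo_freeze_entry() are hypothetical.
 */
#include <stdatomic.h>
#include <stdint.h>

#define DEMO_REMOVED	((uint64_t)0x5a0)	/* reserved non-present marker value */

static uint64_t demo_freeze_entry(_Atomic uint64_t *entry)
{
	uint64_t old;

	do {
		old = atomic_exchange(entry, DEMO_REMOVED);
	} while (old == DEMO_REMOVED);	/* another thread holds the freeze; try again */

	return old;	/* caller now owns whatever side effects 'old' implies */
}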
453 * handle_changed_spte - handle bookkeeping associated with an SPTE change
455 * @as_id: the address space of the paging structure the SPTE was a part of
456 * @gfn: the base GFN that was mapped by the SPTE
457 * @old_spte: The value of the SPTE before the change
458 * @new_spte: The value of the SPTE after the change
459 * @level: the level of the PT the SPTE is part of in the paging structure
460 * @shared: This operation may not be running under the exclusive use of
461 * the MMU lock and the operation must synchronize with other
462 * threads that might be modifying SPTEs.
464 * Handle bookkeeping that might result from the modification of a SPTE. Note,
465 * dirty logging updates are handled in common code, not here (see make_spte()
466 * and fast_pf_fix_direct_spte()).
468 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
469 u64 old_spte, u64 new_spte, int level,
472 bool was_present = is_shadow_present_pte(old_spte);
473 bool is_present = is_shadow_present_pte(new_spte);
474 bool was_leaf = was_present && is_last_spte(old_spte, level);
475 bool is_leaf = is_present && is_last_spte(new_spte, level);
476 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
478 WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
479 WARN_ON_ONCE(level < PG_LEVEL_4K);
480 WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
483 * If this warning were to trigger it would indicate that there was a
484 * missing MMU notifier or a race with some notifier handler.
485 * A present, leaf SPTE should never be directly replaced with another
486 * present leaf SPTE pointing to a different PFN. A notifier handler
487 * should be zapping the SPTE before the main MM's page table is
488 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
489 * thread before replacement.
491 if (was_leaf && is_leaf && pfn_changed) {
492 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
493 "SPTE with another present leaf SPTE mapping a\n"
495 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
496 as_id, gfn, old_spte, new_spte, level);
499	 * Crash the host to prevent error propagation and guest data corruption.
505 if (old_spte == new_spte)
508 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
511 check_spte_writable_invariants(new_spte);
514 * The only times a SPTE should be changed from a non-present to
515 * non-present state is when an MMIO entry is installed/modified/
516 * removed. In that case, there is nothing to do here.
518 if (!was_present && !is_present) {
520 * If this change does not involve a MMIO SPTE or removed SPTE,
521 * it is unexpected. Log the change, though it should not
522 * impact the guest since both the former and current SPTEs
525 if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
526 !is_mmio_spte(new_spte) &&
527 !is_removed_spte(new_spte)))
528 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
529 "should not be replaced with another,\n"
530 "different nonpresent SPTE, unless one or both\n"
531 "are MMIO SPTEs, or the new SPTE is\n"
532 "a temporary removed SPTE.\n"
533 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
534 as_id, gfn, old_spte, new_spte, level);
538 if (is_leaf != was_leaf)
539 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
541 if (was_leaf && is_dirty_spte(old_spte) &&
542 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
543 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
546 * Recursively handle child PTs if the change removed a subtree from
547 * the paging structure. Note the WARN on the PFN changing without the
548 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
549 * pages are kernel allocations and should never be migrated.
551 if (was_present && !was_leaf &&
552 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
553 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
555 if (was_leaf && is_accessed_spte(old_spte) &&
556 (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
557 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
561 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
562 * and handle the associated bookkeeping. Do not mark the page dirty
563 * in KVM's dirty bitmaps.
565 * If setting the SPTE fails because it has changed, iter->old_spte will be
566 * refreshed to the current value of the spte.
569 * @iter: a tdp_iter instance currently on the SPTE that should be set
570 * @new_spte: The value the SPTE should be set to
572 * * 0 - If the SPTE was set.
573 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
574 * no side-effects other than setting iter->old_spte to the last
575 * known value of the spte.
577 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
578 struct tdp_iter *iter,
581 u64 *sptep = rcu_dereference(iter->sptep);
584 * The caller is responsible for ensuring the old SPTE is not a REMOVED
585 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
586 * and pre-checking before inserting a new SPTE is advantageous as it
587 * avoids unnecessary work.
589 WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
591 lockdep_assert_held_read(&kvm->mmu_lock);
594 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
595 * does not hold the mmu_lock. On failure, i.e. if a different logical
596 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
597 * the current value, so the caller operates on fresh data, e.g. if it
598 * retries tdp_mmu_set_spte_atomic()
600 if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
603 handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
604 new_spte, iter->level, true);
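/*
 * Illustrative userspace analogue, not part of tdp_mmu.c, of the
 * try_cmpxchg64() usage above: C11 atomic_compare_exchange_strong() writes
 * the value it actually found back into 'expected' on failure, which is
 * exactly how iter->old_spte gets refreshed so the caller can retry against
 * fresh data.  The "demo_*" names and values are hypothetical.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Returns 0 on success, -1 (think -EBUSY) if the entry changed under us. */
static int demo_set_entry_atomic(_Atomic uint64_t *entry, uint64_t *old_val,
				 uint64_t new_val)
{
	if (!atomic_compare_exchange_strong(entry, old_val, new_val))
		return -1;	/* *old_val now holds the current entry value */
	return 0;
}

int main(void)
{
	_Atomic uint64_t entry = 0x1000;
	uint64_t expected = 0x1000;

	if (!demo_set_entry_atomic(&entry, &expected, 0x2000))
		printf("installed: %#llx\n", (unsigned long long)atomic_load(&entry));

	expected = 0x1000;	/* deliberately stale: the entry is now 0x2000 */
	if (demo_set_entry_atomic(&entry, &expected, 0x3000))
		printf("lost the race, refreshed old value: %#llx\n",
		       (unsigned long long)expected);
	return 0;
}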
609 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
610 struct tdp_iter *iter)
615 * Freeze the SPTE by setting it to a special,
616 * non-present value. This will stop other threads from
617 * immediately installing a present entry in its place
618 * before the TLBs are flushed.
620 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
624 kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
627 * No other thread can overwrite the removed SPTE as they must either
628 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
629 * overwrite the special removed SPTE value. No bookkeeping is needed
630 * here since the SPTE is going from non-present to non-present. Use
631 * the raw write helper to avoid an unnecessary check on volatile bits.
633 __kvm_tdp_mmu_write_spte(iter->sptep, 0);
640 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
642 * @as_id: Address space ID, i.e. regular vs. SMM
643 * @sptep: Pointer to the SPTE
644 * @old_spte: The current value of the SPTE
645 * @new_spte: The new value that will be set for the SPTE
646 * @gfn: The base GFN that was (or will be) mapped by the SPTE
647 * @level: The level _containing_ the SPTE (its parent PT's level)
649 * Returns the old SPTE value, which _may_ be different than @old_spte if the
650	 * SPTE had volatile bits.
652 static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
653 u64 old_spte, u64 new_spte, gfn_t gfn, int level)
655 lockdep_assert_held_write(&kvm->mmu_lock);
658 * No thread should be using this function to set SPTEs to or from the
659 * temporary removed SPTE value.
660 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
661 * should be used. If operating under the MMU lock in write mode, the
662 * use of the removed SPTE should not be necessary.
664 WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
666 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
668 handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
672 static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
675 WARN_ON_ONCE(iter->yielded);
676 iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
677 iter->old_spte, new_spte,
678 iter->gfn, iter->level);
681 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
682 for_each_tdp_pte(_iter, _root, _start, _end)
684 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
685 tdp_root_for_each_pte(_iter, _root, _start, _end) \
686 if (!is_shadow_present_pte(_iter.old_spte) || \
687 !is_last_spte(_iter.old_spte, _iter.level)) \
691 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
692 for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
695 * Yield if the MMU lock is contended or this thread needs to return control
698 * If this function should yield and flush is set, it will perform a remote
699 * TLB flush before yielding.
701 * If this function yields, iter->yielded is set and the caller must skip to
702 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
703 * over the paging structures to allow the iterator to continue its traversal
704 * from the paging structure root.
706 * Returns true if this function yielded.
708 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
709 struct tdp_iter *iter,
710 bool flush, bool shared)
712 WARN_ON_ONCE(iter->yielded);
714 /* Ensure forward progress has been made before yielding. */
715 if (iter->next_last_level_gfn == iter->yielded_gfn)
718 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
720 kvm_flush_remote_tlbs(kvm);
725 cond_resched_rwlock_read(&kvm->mmu_lock);
727 cond_resched_rwlock_write(&kvm->mmu_lock);
731 WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
733 iter->yielded = true;
736 return iter->yielded;
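/*
 * Illustrative userspace sketch, not part of tdp_mmu.c, of the cooperative
 * yield above: a long walk that, when contention is signaled, drops the lock
 * so waiters can make progress, reacquires it, and tells the caller it
 * yielded so the walk can be restarted from a safe point.  The "demo_*"
 * names are hypothetical.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_rwlock_t demo_lock = PTHREAD_RWLOCK_INITIALIZER;
static atomic_bool demo_contended;	/* set by writers waiting on the lock */

struct demo_iter {
	unsigned long pos;
	bool yielded;
};

/* Called with demo_lock held for read; returns true if the lock was dropped. */
static bool demo_iter_cond_yield(struct demo_iter *iter)
{
	if (!atomic_load(&demo_contended))
		return false;

	pthread_rwlock_unlock(&demo_lock);	/* let the waiter in */
	pthread_rwlock_rdlock(&demo_lock);

	iter->yielded = true;	/* caller must restart the walk from iter->pos */
	return true;
}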
739 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
742 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
743 * a gpa range that would exceed the max gfn, and KVM does not create
744 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
745 * the slow emulation path every time.
747 return kvm_mmu_max_gfn() + 1;
750 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
751 bool shared, int zap_level)
753 struct tdp_iter iter;
755 gfn_t end = tdp_mmu_max_gfn_exclusive();
758 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
760 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
763 if (!is_shadow_present_pte(iter.old_spte))
766 if (iter.level > zap_level)
770 tdp_mmu_iter_set_spte(kvm, &iter, 0);
771 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
776 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
781 * The root must have an elevated refcount so that it's reachable via
782 * mmu_notifier callbacks, which allows this path to yield and drop
783 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
784 * must drop all references to relevant pages prior to completing the
785 * callback. Dropping mmu_lock with an unreachable root would result
786 * in zapping SPTEs after a relevant mmu_notifier callback completes
787 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
788 * dirty accessed bits to the SPTE's associated struct page.
790 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
792 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
797 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
798 * split the zap into two passes. On the first pass, zap at the 1gb
799 * level, and then zap top-level SPs on the second pass. "1gb" is not
800 * arbitrary, as KVM must be able to zap a 1gb shadow page without
801 * inducing a stall to allow in-place replacement with a 1gb hugepage.
803 * Because zapping a SP recurses on its children, stepping down to
804 * PG_LEVEL_4K in the iterator itself is unnecessary.
806 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
807 __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
812 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
817 * This helper intentionally doesn't allow zapping a root shadow page,
818 * which doesn't have a parent page table and thus no associated entry.
820 if (WARN_ON_ONCE(!sp->ptep))
823 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
824 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
827 tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
828 sp->gfn, sp->role.level + 1);
834	 * If can_yield is true, this function will release the MMU lock and reschedule if the
835 * scheduler needs the CPU or there is contention on the MMU lock. If this
836 * function cannot yield, it will not release the MMU lock or reschedule and
837 * the caller must ensure it does not supply too large a GFN range, or the
838 * operation can cause a soft lockup.
840 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
841 gfn_t start, gfn_t end, bool can_yield, bool flush)
843 struct tdp_iter iter;
845 end = min(end, tdp_mmu_max_gfn_exclusive());
847 lockdep_assert_held_write(&kvm->mmu_lock);
851 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
853 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
858 if (!is_shadow_present_pte(iter.old_spte) ||
859 !is_last_spte(iter.old_spte, iter.level))
862 tdp_mmu_iter_set_spte(kvm, &iter, 0);
869 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
870 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
876 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
877 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
878 * more SPTEs were zapped since the MMU lock was last acquired.
880 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
881 bool can_yield, bool flush)
883 struct kvm_mmu_page *root;
885 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
886 flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
891 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
893 struct kvm_mmu_page *root;
897 * Zap all roots, including invalid roots, as all SPTEs must be dropped
898 * before returning to the caller. Zap directly even if the root is
899 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
900 * all that expensive and mmu_lock is already held, which means the
901 * worker has yielded, i.e. flushing the work instead of zapping here
902 * isn't guaranteed to be any faster.
904	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
905 * is being destroyed or the userspace VMM has exited. In both cases,
906 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
908 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
909 for_each_tdp_mmu_root_yield_safe(kvm, root, i)
910 tdp_mmu_zap_root(kvm, root, false);
915	 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast zap" completes.
918 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
920 flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
924 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
925 * is about to be zapped, e.g. in response to a memslots update. The actual
926 * zapping is performed asynchronously. Using a separate workqueue makes it
927 * easy to ensure that the destruction is performed before the "fast zap"
928 * completes, without keeping a separate list of invalidated roots; the list is
929 * effectively the list of work items in the workqueue.
931 * Note, the asynchronous worker is gifted the TDP MMU's reference.
932 * See kvm_tdp_mmu_get_vcpu_root_hpa().
934 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
936 struct kvm_mmu_page *root;
939 * mmu_lock must be held for write to ensure that a root doesn't become
940 * invalid while there are active readers (invalidating a root while
941 * there are active readers may or may not be problematic in practice,
942 * but it's uncharted territory and not supported).
944 * Waive the assertion if there are no users of @kvm, i.e. the VM is
945 * being destroyed after all references have been put, or if no vCPUs
946 * have been created (which means there are no roots), i.e. the VM is
947 * being destroyed in an error path of KVM_CREATE_VM.
949 if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
950 refcount_read(&kvm->users_count) && kvm->created_vcpus)
951 lockdep_assert_held_write(&kvm->mmu_lock);
954 * As above, mmu_lock isn't held when destroying the VM! There can't
955 * be other references to @kvm, i.e. nothing else can invalidate roots
956 * or be consuming roots, but walking the list of roots does need to be
957 * guarded against roots being deleted by the asynchronous zap worker.
961 list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
962 if (!root->role.invalid) {
963 root->role.invalid = true;
964 tdp_mmu_schedule_zap_root(kvm, root);
972 * Installs a last-level SPTE to handle a TDP page fault.
973 * (NPT/EPT violation/misconfiguration)
975 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
976 struct kvm_page_fault *fault,
977 struct tdp_iter *iter)
979 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
981 int ret = RET_PF_FIXED;
984 if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
987 if (unlikely(!fault->slot))
988 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
990 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
991 fault->pfn, iter->old_spte, fault->prefetch, true,
992 fault->map_writable, &new_spte);
994 if (new_spte == iter->old_spte)
995 ret = RET_PF_SPURIOUS;
996 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
998 else if (is_shadow_present_pte(iter->old_spte) &&
999 !is_last_spte(iter->old_spte, iter->level))
1000 kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
1003 * If the page fault was caused by a write but the page is write
1004 * protected, emulation is needed. If the emulation was skipped,
1005 * the vCPU would have the same fault again.
1009 ret = RET_PF_EMULATE;
1012 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1013 if (unlikely(is_mmio_spte(new_spte))) {
1014 vcpu->stat.pf_mmio_spte_created++;
1015 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1017 ret = RET_PF_EMULATE;
1019 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1020 rcu_dereference(iter->sptep));
1027 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1028 * provided page table.
1030 * @kvm: kvm instance
1031 * @iter: a tdp_iter instance currently on the SPTE that should be set
1032 * @sp: The new TDP page table to install.
1033 * @shared: This operation is running under the MMU lock in read mode.
1035 * Returns: 0 if the new page table was installed. Non-0 if the page table
1036 * could not be installed (e.g. the atomic compare-exchange failed).
1038 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1039 struct kvm_mmu_page *sp, bool shared)
1041 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1045 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1049 tdp_mmu_iter_set_spte(kvm, iter, spte);
1052 tdp_account_mmu_page(kvm, sp);
1057 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1058 struct kvm_mmu_page *sp, bool shared);
1061 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1062 * page tables and SPTEs to translate the faulting guest physical address.
1064 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1066 struct kvm_mmu *mmu = vcpu->arch.mmu;
1067 struct kvm *kvm = vcpu->kvm;
1068 struct tdp_iter iter;
1069 struct kvm_mmu_page *sp;
1070 int ret = RET_PF_RETRY;
1072 kvm_mmu_hugepage_adjust(vcpu, fault);
1074 trace_kvm_mmu_spte_requested(fault);
1078 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1081 if (fault->nx_huge_page_workaround_enabled)
1082 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1085 * If SPTE has been frozen by another thread, just give up and
1086 * retry, avoiding unnecessary page table allocation and free.
1088 if (is_removed_spte(iter.old_spte))
1091 if (iter.level == fault->goal_level)
1092 goto map_target_level;
1094 /* Step down into the lower level page table if it exists. */
1095 if (is_shadow_present_pte(iter.old_spte) &&
1096 !is_large_pte(iter.old_spte))
1100 * The SPTE is either non-present or points to a huge page that
1101 * needs to be split.
1103 sp = tdp_mmu_alloc_sp(vcpu);
1104 tdp_mmu_init_child_sp(sp, &iter);
1106 sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1108 if (is_shadow_present_pte(iter.old_spte))
1109 r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1111 r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1114 * Force the guest to retry if installing an upper level SPTE
1115 * failed, e.g. because a different task modified the SPTE.
1118 tdp_mmu_free_sp(sp);
1122 if (fault->huge_page_disallowed &&
1123 fault->req_level >= iter.level) {
1124 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1125 if (sp->nx_huge_page_disallowed)
1126 track_possible_nx_huge_page(kvm, sp);
1127 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1132 * The walk aborted before reaching the target level, e.g. because the
1133 * iterator detected an upper level SPTE was frozen during traversal.
1135 WARN_ON_ONCE(iter.level == fault->goal_level);
1139 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1146 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1149 return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1150 range->end, range->may_block, flush);
1153 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1154 struct kvm_gfn_range *range);
1156 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1157 struct kvm_gfn_range *range,
1158 tdp_handler_t handler)
1160 struct kvm_mmu_page *root;
1161 struct tdp_iter iter;
1165 * Don't support rescheduling, none of the MMU notifiers that funnel
1166 * into this helper allow blocking; it'd be dead, wasteful code.
1168 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1171 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1172 ret |= handler(kvm, &iter, range);
1181	 * Mark the SPTEs mapping the range of GFNs [start, end) unaccessed and return non-zero
1182 * if any of the GFNs in the range have been accessed.
1184 * No need to mark the corresponding PFN as accessed as this call is coming
1185 * from the clear_young() or clear_flush_young() notifier, which uses the
1186 * return value to determine if the page has been accessed.
1188 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1189 struct kvm_gfn_range *range)
1193 /* If we have a non-accessed entry we don't need to change the pte. */
1194 if (!is_accessed_spte(iter->old_spte))
1197 if (spte_ad_enabled(iter->old_spte)) {
1198 iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
1200 shadow_accessed_mask,
1202 new_spte = iter->old_spte & ~shadow_accessed_mask;
1205 * Capture the dirty status of the page, so that it doesn't get
1206 * lost when the SPTE is marked for access tracking.
1208 if (is_writable_pte(iter->old_spte))
1209 kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
1211 new_spte = mark_spte_for_access_track(iter->old_spte);
1212 iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
1213 iter->old_spte, new_spte,
1217 trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1218 iter->old_spte, new_spte);
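/*
 * Illustrative userspace analogue, not part of tdp_mmu.c, of the A/D-enabled
 * branch above: clear the accessed bit with a single atomic AND and use the
 * returned prior value to decide whether the page had been touched.  The bit
 * position and "demo_*" names are hypothetical.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define DEMO_ACCESSED	((uint64_t)1 << 5)

/* Returns true if the entry was marked accessed before the bit was cleared. */
static bool demo_test_and_clear_accessed(_Atomic uint64_t *entry)
{
	uint64_t old = atomic_fetch_and(entry, ~DEMO_ACCESSED);

	return old & DEMO_ACCESSED;
}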
1222 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1224 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1227 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1228 struct kvm_gfn_range *range)
1230 return is_accessed_spte(iter->old_spte);
1233 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1235 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1238 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1239 struct kvm_gfn_range *range)
1243 /* Huge pages aren't expected to be modified without first being zapped. */
1244 WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
1246 if (iter->level != PG_LEVEL_4K ||
1247 !is_shadow_present_pte(iter->old_spte))
1251 * Note, when changing a read-only SPTE, it's not strictly necessary to
1252 * zero the SPTE before setting the new PFN, but doing so preserves the
1253	 * invariant that the PFN of a present leaf SPTE can never change.
1254 * See handle_changed_spte().
1256 tdp_mmu_iter_set_spte(kvm, iter, 0);
1258 if (!pte_write(range->arg.pte)) {
1259 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1260 pte_pfn(range->arg.pte));
1262 tdp_mmu_iter_set_spte(kvm, iter, new_spte);
1269 * Handle the changed_pte MMU notifier for the TDP MMU.
1270	 * data is a pointer to the new pte_t mapping the HVA specified by the MMU notifier.
1272 * Returns non-zero if a flush is needed before releasing the MMU lock.
1274 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1277 * No need to handle the remote TLB flush under RCU protection, the
1278 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1279 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
1281 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1285 * Remove write access from all SPTEs at or above min_level that map GFNs
1286	 * [start, end). Returns true if an SPTE has been changed and the TLBs need to be flushed.
1289 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1290 gfn_t start, gfn_t end, int min_level)
1292 struct tdp_iter iter;
1294 bool spte_set = false;
1298 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1300 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1302 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1305 if (!is_shadow_present_pte(iter.old_spte) ||
1306 !is_last_spte(iter.old_spte, iter.level) ||
1307 !(iter.old_spte & PT_WRITABLE_MASK))
1310 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1312 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1323 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1324 * only affect leaf SPTEs down to min_level.
1325 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1327 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1328 const struct kvm_memory_slot *slot, int min_level)
1330 struct kvm_mmu_page *root;
1331 bool spte_set = false;
1333 lockdep_assert_held_read(&kvm->mmu_lock);
1335 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1336 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1337 slot->base_gfn + slot->npages, min_level);
1342 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1344 struct kvm_mmu_page *sp;
1348 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1352 sp->spt = (void *)__get_free_page(gfp);
1354 kmem_cache_free(mmu_page_header_cache, sp);
1361 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1362 struct tdp_iter *iter,
1365 struct kvm_mmu_page *sp;
1368 * Since we are allocating while under the MMU lock we have to be
1369 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1370 * reclaim and to avoid making any filesystem callbacks (which can end
1371 * up invoking KVM MMU notifiers, resulting in a deadlock).
1373	 * If this allocation fails we drop the lock and retry with reclaim allowed.
1376 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1383 read_unlock(&kvm->mmu_lock);
1385 write_unlock(&kvm->mmu_lock);
1387 iter->yielded = true;
1388 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1391 read_lock(&kvm->mmu_lock);
1393 write_lock(&kvm->mmu_lock);
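/*
 * Illustrative userspace sketch, not part of tdp_mmu.c, of the allocation
 * strategy above: first try a source that cannot block while the lock is
 * held, and only if that fails drop the lock, perform a blocking allocation,
 * and signal the caller that the lock was released in between.  The
 * "demo_*" names, including the non-blocking demo_cache_pop(), are
 * hypothetical.
 */
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical non-blocking allocator (e.g. a pre-filled freelist). */
static void *demo_cache_pop(void)
{
	return NULL;	/* pretend the nowait cache is empty for this sketch */
}

/* Called with demo_lock held; *dropped_lock tells the caller to revalidate. */
static void *demo_alloc_for_split(int *dropped_lock)
{
	void *obj = demo_cache_pop();	/* GFP_NOWAIT analogue: never sleeps */

	*dropped_lock = 0;
	if (obj)
		return obj;

	pthread_mutex_unlock(&demo_lock);
	obj = malloc(4096);		/* GFP_KERNEL analogue: may block and reclaim */
	pthread_mutex_lock(&demo_lock);

	*dropped_lock = 1;		/* anything observed before must be rechecked */
	return obj;
}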
1400 /* Note, the caller is responsible for initializing @sp. */
1401 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1402 struct kvm_mmu_page *sp, bool shared)
1404 const u64 huge_spte = iter->old_spte;
1405 const int level = iter->level;
1409 * No need for atomics when writing to sp->spt since the page table has
1410 * not been linked in yet and thus is not reachable from any other CPU.
1412 for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1413 sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1416 * Replace the huge spte with a pointer to the populated lower level
1417 * page table. Since we are making this change without a TLB flush vCPUs
1418 * will see a mix of the split mappings and the original huge mapping,
1419 * depending on what's currently in their TLB. This is fine from a
1420	 * correctness standpoint since the translation will be the same either way.
1423 ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1428	 * tdp_mmu_link_sp() will handle subtracting the huge page we
1429 * are overwriting from the page stats. But we have to manually update
1430 * the page stats with the new present child pages.
1432 kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1435 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1439 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1440 struct kvm_mmu_page *root,
1441 gfn_t start, gfn_t end,
1442 int target_level, bool shared)
1444 struct kvm_mmu_page *sp = NULL;
1445 struct tdp_iter iter;
1451 * Traverse the page table splitting all huge pages above the target
1452 * level into one lower level. For example, if we encounter a 1GB page
1453 * we split it into 512 2MB pages.
1455 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1456 * to visit an SPTE before ever visiting its children, which means we
1457 * will correctly recursively split huge pages that are more than one
1458 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1459 * and then splitting each of those to 512 4KB pages).
1461 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1463 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1466 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1470 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1473 trace_kvm_mmu_split_huge_page(iter.gfn,
1483 tdp_mmu_init_child_sp(sp, &iter);
1485 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1494 * It's possible to exit the loop having never used the last sp if, for
1495 * example, a vCPU doing HugePage NX splitting wins the race and
1496 * installs its own sp in place of the last sp we tried to split.
1499 tdp_mmu_free_sp(sp);
1506 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1508 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1509 const struct kvm_memory_slot *slot,
1510 gfn_t start, gfn_t end,
1511 int target_level, bool shared)
1513 struct kvm_mmu_page *root;
1516 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1518 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1519 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1521 kvm_tdp_mmu_put_root(kvm, root, shared);
1528 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1529 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1530 * If AD bits are not enabled, this will require clearing the writable bit on
1531	 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1534 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1535 gfn_t start, gfn_t end)
1537 u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
1538 struct tdp_iter iter;
1539 bool spte_set = false;
1543 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1545 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1548 if (!is_shadow_present_pte(iter.old_spte))
1551 KVM_MMU_WARN_ON(kvm_ad_enabled() &&
1552 spte_ad_need_write_protect(iter.old_spte));
1554 if (!(iter.old_spte & dbit))
1557 if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
1568 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1569 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1570 * If AD bits are not enabled, this will require clearing the writable bit on
1571	 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1574 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1575 const struct kvm_memory_slot *slot)
1577 struct kvm_mmu_page *root;
1578 bool spte_set = false;
1580 lockdep_assert_held_read(&kvm->mmu_lock);
1582 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1583 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1584 slot->base_gfn + slot->npages);
1590 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1591 * set in mask, starting at gfn. The given memslot is expected to contain all
1592 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1593 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1594 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1596 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1597 gfn_t gfn, unsigned long mask, bool wrprot)
1599 u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
1601 struct tdp_iter iter;
1603 lockdep_assert_held_write(&kvm->mmu_lock);
1607 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1608 gfn + BITS_PER_LONG) {
1612 KVM_MMU_WARN_ON(kvm_ad_enabled() &&
1613 spte_ad_need_write_protect(iter.old_spte));
1615 if (iter.level > PG_LEVEL_4K ||
1616 !(mask & (1UL << (iter.gfn - gfn))))
1619 mask &= ~(1UL << (iter.gfn - gfn));
1621 if (!(iter.old_spte & dbit))
1624 iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1625 iter.old_spte, dbit,
1628 trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
1630 iter.old_spte & ~dbit);
1631 kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
1638 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1639 * set in mask, starting at gfn. The given memslot is expected to contain all
1640 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1641 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1642 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1644 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1645 struct kvm_memory_slot *slot,
1646 gfn_t gfn, unsigned long mask,
1649 struct kvm_mmu_page *root;
1651 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1652 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
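/*
 * Illustrative userspace sketch, not part of tdp_mmu.c, of how the dirty-log
 * mask above is consumed: only gfns whose bit is set are visited, and each
 * bit is cleared once handled, mirroring the __ffs()-bounded walk and the
 * "mask &= ~(1UL << (iter.gfn - gfn))" step.  Values are made up.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t base_gfn = 0x1000;
	unsigned long mask = 0x16;	/* bits 1, 2 and 4: three dirty pages */

	while (mask) {
		unsigned int bit = __builtin_ctzl(mask);	/* analogue of __ffs() */

		printf("clear dirty state for gfn %#llx\n",
		       (unsigned long long)(base_gfn + bit));
		mask &= mask - 1;	/* consume the lowest set bit */
	}
	return 0;
}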
1655 static void zap_collapsible_spte_range(struct kvm *kvm,
1656 struct kvm_mmu_page *root,
1657 const struct kvm_memory_slot *slot)
1659 gfn_t start = slot->base_gfn;
1660 gfn_t end = start + slot->npages;
1661 struct tdp_iter iter;
1662 int max_mapping_level;
1666 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
1668 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1671 if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1672 !is_shadow_present_pte(iter.old_spte))
1676	 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
1677 * a large page size, then its parent would have been zapped
1678 * instead of stepping down.
1680 if (is_last_spte(iter.old_spte, iter.level))
1684 * If iter.gfn resides outside of the slot, i.e. the page for
1685 * the current level overlaps but is not contained by the slot,
1686 * then the SPTE can't be made huge. More importantly, trying
1687 * to query that info from slot->arch.lpage_info will cause an
1688 * out-of-bounds access.
1690 if (iter.gfn < start || iter.gfn >= end)
1693 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1694 iter.gfn, PG_LEVEL_NUM);
1695 if (max_mapping_level < iter.level)
1698 /* Note, a successful atomic zap also does a remote TLB flush. */
1699 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1707 * Zap non-leaf SPTEs (and free their associated page tables) which could
1708 * be replaced by huge pages, for GFNs within the slot.
1710 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1711 const struct kvm_memory_slot *slot)
1713 struct kvm_mmu_page *root;
1715 lockdep_assert_held_read(&kvm->mmu_lock);
1717 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1718 zap_collapsible_spte_range(kvm, root, slot);
1722 * Removes write access on the last level SPTE mapping this GFN and unsets the
1723 * MMU-writable bit to ensure future writes continue to be intercepted.
1724 * Returns true if an SPTE was set and a TLB flush is needed.
1726 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1727 gfn_t gfn, int min_level)
1729 struct tdp_iter iter;
1731 bool spte_set = false;
1733 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1737 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1738 if (!is_shadow_present_pte(iter.old_spte) ||
1739 !is_last_spte(iter.old_spte, iter.level))
1742 new_spte = iter.old_spte &
1743 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1745 if (new_spte == iter.old_spte)
1748 tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
1758 * Removes write access on the last level SPTE mapping this GFN and unsets the
1759 * MMU-writable bit to ensure future writes continue to be intercepted.
1760 * Returns true if an SPTE was set and a TLB flush is needed.
1762 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1763 struct kvm_memory_slot *slot, gfn_t gfn,
1766 struct kvm_mmu_page *root;
1767 bool spte_set = false;
1769 lockdep_assert_held_write(&kvm->mmu_lock);
1770 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1771 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1777 * Return the level of the lowest level SPTE added to sptes.
1778 * That SPTE may be non-present.
1780 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1782 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1785 struct tdp_iter iter;
1786 struct kvm_mmu *mmu = vcpu->arch.mmu;
1787 gfn_t gfn = addr >> PAGE_SHIFT;
1790 *root_level = vcpu->arch.mmu->root_role.level;
1792 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1794 sptes[leaf] = iter.old_spte;
1801 * Returns the last level spte pointer of the shadow page walk for the given
1802	 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1803 * walk could be performed, returns NULL and *spte does not contain valid data.
1806 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1807 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1809 * WARNING: This function is only intended to be called during fast_page_fault.
1811 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1814 struct tdp_iter iter;
1815 struct kvm_mmu *mmu = vcpu->arch.mmu;
1816 gfn_t gfn = addr >> PAGE_SHIFT;
1817 tdp_ptep_t sptep = NULL;
1819 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1820 *spte = iter.old_spte;
1825 * Perform the rcu_dereference to get the raw spte pointer value since
1826 * we are passing it up to fast_page_fault, which is shared with the
1827	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu annotation.
1830 * This is safe since fast_page_fault obeys the contracts of this
1831 * function as well as all TDP MMU contracts around modifying SPTEs
1832 * outside of mmu_lock.
1834 return rcu_dereference(sptep);