arch/x86/kvm/mmu/spte.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine driver for Linux
   4  *
   5  * Macros and functions to access KVM PTEs (also known as SPTEs)
   6  *
   7  * Copyright (C) 2006 Qumranet, Inc.
   8  * Copyright 2020 Red Hat, Inc. and/or its affiliates.
   9  */
  10
  11
  12 #include <linux/kvm_host.h>
  13 #include "mmu.h"
  14 #include "mmu_internal.h"
  15 #include "x86.h"
  16 #include "spte.h"
  17
  18 #include <asm/e820/api.h>
  19
  20 u64 __read_mostly shadow_nx_mask;
  21 u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
  22 u64 __read_mostly shadow_user_mask;
  23 u64 __read_mostly shadow_accessed_mask;
  24 u64 __read_mostly shadow_dirty_mask;
  25 u64 __read_mostly shadow_mmio_value;
  26 u64 __read_mostly shadow_mmio_access_mask;
  27 u64 __read_mostly shadow_present_mask;
  28 u64 __read_mostly shadow_me_mask;
  29 u64 __read_mostly shadow_acc_track_mask;
  30
  31 u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
  32 u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
  33
  34 u8 __read_mostly shadow_phys_bits;
  35
  36 static u64 generation_mmio_spte_mask(u64 gen)
  37 {
  38         u64 mask;
  39
  40         WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
  41         BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
  42
  43         mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
  44         mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
  45         return mask;
  46 }
  47
  48 u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
  49 {
  50         u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
  51         u64 mask = generation_mmio_spte_mask(gen);
  52         u64 gpa = gfn << PAGE_SHIFT;
  53
  54         access &= shadow_mmio_access_mask;
  55         mask |= shadow_mmio_value | access;
  56         mask |= gpa | shadow_nonpresent_or_rsvd_mask;
  57         mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
  58                 << shadow_nonpresent_or_rsvd_mask_len;
  59
  60         return mask;
  61 }
  62
  63 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
  64 {
  65         if (pfn_valid(pfn))
  66                 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
  67                         /*
  68                          * Some reserved pages, such as those from NVDIMM
  69                          * DAX devices, are not for MMIO, and can be mapped
  70                          * with cached memory type for better performance.
  71                          * However, the above check misconceives those pages
  72                          * as MMIO, and results in KVM mapping them with UC
  73                          * memory type, which would hurt the performance.
  74                          * Therefore, we check the host memory type in addition
  75                          * and only treat UC/UC-/WC pages as MMIO.
  76                          */
  77                         (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
  78
  79         return !e820__mapped_raw_any(pfn_to_hpa(pfn),
  80                                      pfn_to_hpa(pfn + 1) - 1,
  81                                      E820_TYPE_RAM);
  82 }
  83
  84 int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
  85                      gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative,
  86                      bool can_unsync, bool host_writable, bool ad_disabled,
  87                      u64 *new_spte)
  88 {
  89         u64 spte = 0;
  90         int ret = 0;
  91
  92         if (ad_disabled)
  93                 spte |= SPTE_AD_DISABLED_MASK;
  94         else if (kvm_vcpu_ad_need_write_protect(vcpu))
  95                 spte |= SPTE_AD_WRPROT_ONLY_MASK;
  96
  97         /*
  98          * For the EPT case, shadow_present_mask is 0 if hardware
  99          * supports exec-only page table entries.  In that case,
 100          * ACC_USER_MASK and shadow_user_mask are used to represent
 101          * read access.  See FNAME(gpte_access) in paging_tmpl.h.
 102          */
 103         spte |= shadow_present_mask;
 104         if (!speculative)
 105                 spte |= spte_shadow_accessed_mask(spte);
 106
 107         if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
 108             is_nx_huge_page_enabled()) {
 109                 pte_access &= ~ACC_EXEC_MASK;
 110         }
 111
 112         if (pte_access & ACC_EXEC_MASK)
 113                 spte |= shadow_x_mask;
 114         else
 115                 spte |= shadow_nx_mask;
 116
 117         if (pte_access & ACC_USER_MASK)
 118                 spte |= shadow_user_mask;
 119
 120         if (level > PG_LEVEL_4K)
 121                 spte |= PT_PAGE_SIZE_MASK;
 122         if (tdp_enabled)
 123                 spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
 124                         kvm_is_mmio_pfn(pfn));
 125
 126         if (host_writable)
 127                 spte |= SPTE_HOST_WRITEABLE;
 128         else
 129                 pte_access &= ~ACC_WRITE_MASK;
 130
 131         if (!kvm_is_mmio_pfn(pfn))
 132                 spte |= shadow_me_mask;
 133
 134         spte |= (u64)pfn << PAGE_SHIFT;
 135
 136         if (pte_access & ACC_WRITE_MASK) {
 137                 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 138
 139                 /*
 140                  * Optimization: for pte sync, if spte was writable the hash
 141                  * lookup is unnecessary (and expensive). Write protection
 142                  * is responsibility of mmu_get_page / kvm_sync_page.
 143                  * Same reasoning can be applied to dirty page accounting.
 144                  */
 145                 if (!can_unsync && is_writable_pte(old_spte))
 146                         goto out;
 147
 148                 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
 149                         pgprintk("%s: found shadow page for %llx, marking ro\n",
 150                                  __func__, gfn);
 151                         ret |= SET_SPTE_WRITE_PROTECTED_PT;
 152                         pte_access &= ~ACC_WRITE_MASK;
 153                         spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
 154                 }
 155         }
 156
 157         if (pte_access & ACC_WRITE_MASK)
 158                 spte |= spte_shadow_dirty_mask(spte);
 159
 160         if (speculative)
 161                 spte = mark_spte_for_access_track(spte);
 162
 163 out:
 164         *new_spte = spte;
 165         return ret;
 166 }
 167
 168 u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
 169 {
 170         u64 spte;
 171
 172         spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
 173                shadow_user_mask | shadow_x_mask | shadow_me_mask;
 174
 175         if (ad_disabled)
 176                 spte |= SPTE_AD_DISABLED_MASK;
 177         else
 178                 spte |= shadow_accessed_mask;
 179
 180         return spte;
 181 }
 182
 183 u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
 184 {
 185         u64 new_spte;
 186
 187         new_spte = old_spte & ~PT64_BASE_ADDR_MASK;
 188         new_spte |= (u64)new_pfn << PAGE_SHIFT;
 189
 190         new_spte &= ~PT_WRITABLE_MASK;
 191         new_spte &= ~SPTE_HOST_WRITEABLE;
 192
 193         new_spte = mark_spte_for_access_track(new_spte);
 194
 195         return new_spte;
 196 }
 197
 198 static u8 kvm_get_shadow_phys_bits(void)
 199 {
 200         /*
 201          * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
 202          * in CPU detection code, but the processor treats those reduced bits as
 203          * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
 204          * the physical address bits reported by CPUID.
 205          */
 206         if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
 207                 return cpuid_eax(0x80000008) & 0xff;
 208
 209         /*
 210          * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
 211          * custom CPUID.  Proceed with whatever the kernel found since these features
 212          * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
 213          */
 214         return boot_cpu_data.x86_phys_bits;
 215 }
 216
 217 u64 mark_spte_for_access_track(u64 spte)
 218 {
 219         if (spte_ad_enabled(spte))
 220                 return spte & ~shadow_accessed_mask;
 221
 222         if (is_access_track_spte(spte))
 223                 return spte;
 224
 225         /*
 226          * Making an Access Tracking PTE will result in removal of write access
 227          * from the PTE. So, verify that we will be able to restore the write
 228          * access in the fast page fault path later on.
 229          */
 230         WARN_ONCE((spte & PT_WRITABLE_MASK) &&
 231                   !spte_can_locklessly_be_made_writable(spte),
 232                   "kvm: Writable SPTE is not locklessly dirty-trackable\n");
 233
 234         WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
 235                           shadow_acc_track_saved_bits_shift),
 236                   "kvm: Access Tracking saved bit locations are not zero\n");
 237
 238         spte |= (spte & shadow_acc_track_saved_bits_mask) <<
 239                 shadow_acc_track_saved_bits_shift;
 240         spte &= ~shadow_acc_track_mask;
 241
 242         return spte;
 243 }
 244
 245 void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
 246 {
 247         BUG_ON((u64)(unsigned)access_mask != access_mask);
 248         WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
 249         WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
 250         shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
 251         shadow_mmio_access_mask = access_mask;
 252 }
 253 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 254
 255 /*
 256  * Sets the shadow PTE masks used by the MMU.
 257  *
 258  * Assumptions:
 259  *  - Setting either @accessed_mask or @dirty_mask requires setting both
 260  *  - At least one of @accessed_mask or @acc_track_mask must be set
 261  */
 262 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 263                 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
 264                 u64 acc_track_mask, u64 me_mask)
 265 {
 266         BUG_ON(!dirty_mask != !accessed_mask);
 267         BUG_ON(!accessed_mask && !acc_track_mask);
 268         BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
 269
 270         shadow_user_mask = user_mask;
 271         shadow_accessed_mask = accessed_mask;
 272         shadow_dirty_mask = dirty_mask;
 273         shadow_nx_mask = nx_mask;
 274         shadow_x_mask = x_mask;
 275         shadow_present_mask = p_mask;
 276         shadow_acc_track_mask = acc_track_mask;
 277         shadow_me_mask = me_mask;
 278 }
 279 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 280
 281 void kvm_mmu_reset_all_pte_masks(void)
 282 {
 283         u8 low_phys_bits;
 284
 285         shadow_user_mask = 0;
 286         shadow_accessed_mask = 0;
 287         shadow_dirty_mask = 0;
 288         shadow_nx_mask = 0;
 289         shadow_x_mask = 0;
 290         shadow_present_mask = 0;
 291         shadow_acc_track_mask = 0;
 292
 293         shadow_phys_bits = kvm_get_shadow_phys_bits();
 294
 295         /*
 296          * If the CPU has 46 or less physical address bits, then set an
 297          * appropriate mask to guard against L1TF attacks. Otherwise, it is
 298          * assumed that the CPU is not vulnerable to L1TF.
 299          *
 300          * Some Intel CPUs address the L1 cache using more PA bits than are
 301          * reported by CPUID. Use the PA width of the L1 cache when possible
 302          * to achieve more effective mitigation, e.g. if system RAM overlaps
 303          * the most significant bits of legal physical address space.
 304          */
 305         shadow_nonpresent_or_rsvd_mask = 0;
 306         low_phys_bits = boot_cpu_data.x86_phys_bits;
 307         if (boot_cpu_has_bug(X86_BUG_L1TF) &&
 308             !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
 309                           52 - shadow_nonpresent_or_rsvd_mask_len)) {
 310                 low_phys_bits = boot_cpu_data.x86_cache_bits
 311                         - shadow_nonpresent_or_rsvd_mask_len;
 312                 shadow_nonpresent_or_rsvd_mask =
 313                         rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
 314         }
 315
 316         shadow_nonpresent_or_rsvd_lower_gfn_mask =
 317                 GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
 318 }