static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
+
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
+{
+ shadow_mmio_mask = mmio_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+
+static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+{
+ access &= ACC_WRITE_MASK | ACC_USER_MASK;
+
+ mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+}
+
+static bool is_mmio_spte(u64 spte)
+{
+ return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+ return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+ return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+{
+ if (unlikely(is_noslot_pfn(pfn))) {
+ mark_mmio_spte(sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
static inline u64 rsvd_bits(int s, int e)
{
static int is_shadow_present_pte(u64 pte)
{
- return pte & PT_PRESENT_MASK;
+ return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
}
static int is_large_pte(u64 pte)
{
return ACCESS_ONCE(*sptep);
}
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+ /* It is valid if the spte is zapped. */
+ return spte == 0ull;
+}
#else
union split_spte {
struct {
return spte.spte;
}
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+ union split_spte sspte = (union split_spte)spte;
+ u32 high_mmio_mask = shadow_mmio_mask >> 32;
+
+ /* It is valid if the spte is zapped. */
+ if (spte == 0ull)
+ return true;
+
+ /* It is valid if the spte is being zapped. */
+ if (sspte.spte_low == 0ull &&
+ (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
+ return true;
+
+ return false;
+}
#endif
static bool spte_has_volatile_bits(u64 spte)
child = page_header(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
}
- }
+ } else if (is_mmio_spte(pte))
+ mmu_spte_clear_no_track(spte);
if (is_large_pte(pte))
--kvm->stat.lpages;
u64 spte, entry = *sptep;
int ret = 0;
+ if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+ return 0;
+
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
kvm_mmu_flush_tlb(vcpu);
}
+ if (unlikely(is_mmio_spte(*sptep) && emulate))
+ *emulate = 1;
+
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
is_large_pte(*sptep)? "2MB" : "4kB",
static bool mmu_invalid_pfn(pfn_t pfn)
{
- return unlikely(is_invalid_pfn(pfn) || is_noslot_pfn(pfn));
+ return unlikely(is_invalid_pfn(pfn));
}
static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
goto exit;
}
- if (unlikely(is_noslot_pfn(pfn))) {
+ if (unlikely(is_noslot_pfn(pfn)))
vcpu_cache_mmio_info(vcpu, gva, gfn, access);
- *ret_val = 1;
- goto exit;
- }
ret = false;
exit:
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
}
+static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ if (direct)
+ return vcpu_match_mmio_gpa(vcpu, addr);
+
+ return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+
+/*
+ * On direct hosts, the last spte is only allows two states
+ * for mmio page fault:
+ * - It is the mmio spte
+ * - It is zapped or it is being zapped.
+ *
+ * This function completely checks the spte when the last spte
+ * is not the mmio spte.
+ */
+static bool check_direct_spte_mmio_pf(u64 spte)
+{
+ return __check_direct_spte_mmio_pf(spte);
+}
+
+static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 spte = 0ull;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+ if (!is_shadow_present_pte(spte))
+ break;
+ walk_shadow_page_lockless_end(vcpu);
+
+ return spte;
+}
+
+/*
+ * If it is a real mmio page fault, return 1 and emulat the instruction
+ * directly, return 0 to let CPU fault again on the address, -1 is
+ * returned if bug is detected.
+ */
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ u64 spte;
+
+ if (quickly_check_mmio_pf(vcpu, addr, direct))
+ return 1;
+
+ spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
+
+ if (is_mmio_spte(spte)) {
+ gfn_t gfn = get_mmio_spte_gfn(spte);
+ unsigned access = get_mmio_spte_access(spte);
+
+ if (direct)
+ addr = 0;
+ vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+ return 1;
+ }
+
+ /*
+ * It's ok if the gva is remapped by other cpus on shadow guest,
+ * it's a BUG if the gfn is not a mmio page.
+ */
+ if (direct && !check_direct_spte_mmio_pf(spte))
+ return -1;
+
+ /*
+ * If the page table is zapped by other cpus, let CPU fault again on
+ * the address.
+ */
+ return 0;
+}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
+ u32 error_code, bool direct)
+{
+ int ret;
+
+ ret = handle_mmio_page_fault_common(vcpu, addr, direct);
+ WARN_ON(ret < 0);
+ return ret;
+}
+
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code, bool prefault)
{
int r;
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return handle_mmio_page_fault(vcpu, gva, error_code, true);
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
}
+static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
+ int *nr_present)
+{
+ if (unlikely(is_mmio_spte(*sptep))) {
+ if (gfn != get_mmio_spte_gfn(*sptep)) {
+ mmu_spte_clear_no_track(sptep);
+ return true;
+ }
+
+ (*nr_present)++;
+ mark_mmio_spte(sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return handle_mmio_page_fault(vcpu, addr, error_code,
+ mmu_is_nested(vcpu));
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
--vcpu->kvm->stat.lpages;
drop_spte(vcpu->kvm, sptep);
need_flush = 1;
- }
+ } else if (is_mmio_spte(*sptep))
+ mmu_spte_clear_no_track(sptep);
break;
}
gpa_t pte_gpa;
gfn_t gfn;
- if (!is_shadow_present_pte(sp->spt[i]))
+ if (!sp->spt[i])
continue;
pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
sizeof(pt_element_t)))
return -EINVAL;
- gfn = gpte_to_gfn(gpte);
-
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
continue;
}
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+
+ if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+ continue;
+
if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i]);
vcpu->kvm->tlbs_dirty++;
}
nr_present++;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
- true);
+
host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,