KVM: MMU: allow more pages to become unsync at gfn mapping time

[platform/kernel/linux-rpi.git] / arch / x86 / kvm / mmu.c
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c

index 3699613..ba119da 100644 (file)
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,6 +7,7 @@
   * MMU support
   *
   * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   *
   * Authors:
   *   Yaniv Kamay  <yaniv@qumranet.com>
@@ -32,6 +33,7 @@
  #include <linux/compiler.h>
  #include <linux/srcu.h>
  #include <linux/slab.h>
+#include <linux/uaccess.h>
  
  #include <asm/page.h>
  #include <asm/cmpxchg.h>
@@ -304,10 +306,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
         return 0;
  }
  
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
+                                 struct kmem_cache *cache)
  {
         while (mc->nobjs)
-               kfree(mc->objects[--mc->nobjs]);
+               kmem_cache_free(cache, mc->objects[--mc->nobjs]);
  }
  
  static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -355,10 +358,11 @@ out:
  
  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
  {
-       mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
-       mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
         mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
-       mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+                               mmu_page_header_cache);
  }
  
  static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -379,7 +383,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
  
  static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
  {
-       kfree(pc);
+       kmem_cache_free(pte_chain_cache, pc);
  }
  
  static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -390,7 +394,7 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
  
  static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
  {
-       kfree(rd);
+       kmem_cache_free(rmap_desc_cache, rd);
  }
  
  /*
@@ -897,7 +901,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
         list_del(&sp->link);
         __free_page(virt_to_page(sp->spt));
         __free_page(virt_to_page(sp->gfns));
-       kfree(sp);
+       kmem_cache_free(mmu_page_header_cache, sp);
         ++kvm->arch.n_free_mmu_pages;
  }
  
@@ -1166,26 +1170,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
         return __mmu_unsync_walk(sp, pvec);
  }
  
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
-{
-       unsigned index;
-       struct hlist_head *bucket;
-       struct kvm_mmu_page *sp;
-       struct hlist_node *node;
-
-       pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
-       index = kvm_page_table_hashfn(gfn);
-       bucket = &kvm->arch.mmu_page_hash[index];
-       hlist_for_each_entry(sp, node, bucket, hash_link)
-               if (sp->gfn == gfn && !sp->role.direct
-                   && !sp->role.invalid) {
-                       pgprintk("%s: found role %x\n",
-                                __func__, sp->role.word);
-                       return sp;
-               }
-       return NULL;
-}
-
  static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
         WARN_ON(!sp->unsync);
@@ -1196,16 +1180,20 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
  
  static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
  
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+                          bool clear_unsync)
  {
         if (sp->role.cr4_pae != !!is_pae(vcpu)) {
                 kvm_mmu_zap_page(vcpu->kvm, sp);
                 return 1;
         }
  
-       if (rmap_write_protect(vcpu->kvm, sp->gfn))
-               kvm_flush_remote_tlbs(vcpu->kvm);
-       kvm_unlink_unsync_page(vcpu->kvm, sp);
+       if (clear_unsync) {
+               if (rmap_write_protect(vcpu->kvm, sp->gfn))
+                       kvm_flush_remote_tlbs(vcpu->kvm);
+               kvm_unlink_unsync_page(vcpu->kvm, sp);
+       }
+
         if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
                 kvm_mmu_zap_page(vcpu->kvm, sp);
                 return 1;
@@ -1215,6 +1203,23 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
         return 0;
  }
  
+static void mmu_convert_notrap(struct kvm_mmu_page *sp);
+static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
+                                  struct kvm_mmu_page *sp)
+{
+       int ret;
+
+       ret = __kvm_sync_page(vcpu, sp, false); /* false: skip the write-protect + kvm_unlink_unsync_page step */
+       if (!ret)
+               mmu_convert_notrap(sp); /* reached only when __kvm_sync_page() returned 0 */
+       return ret;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+       return __kvm_sync_page(vcpu, sp, true); /* true: run rmap_write_protect() and kvm_unlink_unsync_page() first */
+}
+
  struct mmu_page_path {
         struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
         unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1313,7 +1318,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
         unsigned index;
         unsigned quadrant;
         struct hlist_head *bucket;
-       struct kvm_mmu_page *sp;
+       struct kvm_mmu_page *sp, *unsync_sp = NULL;
         struct hlist_node *node, *tmp;
  
         role = vcpu->arch.mmu.base_role;
@@ -1332,20 +1337,30 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
         hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
                 if (sp->gfn == gfn) {
                         if (sp->unsync)
-                               if (kvm_sync_page(vcpu, sp))
-                                       continue;
+                               unsync_sp = sp;
  
                         if (sp->role.word != role.word)
                                 continue;
  
+                       if (!direct && unsync_sp &&
+                             kvm_sync_page_transient(vcpu, unsync_sp)) {
+                               unsync_sp = NULL;
+                               break;
+                       }
+
                         mmu_page_add_parent_pte(vcpu, sp, parent_pte);
                         if (sp->unsync_children) {
                                 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
                                 kvm_mmu_mark_parents_unsync(sp);
-                       }
+                       } else if (sp->unsync)
+                               kvm_mmu_mark_parents_unsync(sp);
+
                         trace_kvm_mmu_get_page(sp, false);
                         return sp;
                 }
+       if (!direct && unsync_sp)
+               kvm_sync_page(vcpu, unsync_sp);
+
         ++vcpu->kvm->stat.mmu_cache_miss;
         sp = kvm_mmu_alloc_page(vcpu, parent_pte);
         if (!sp)
@@ -1503,6 +1518,8 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
         if (sp->unsync)
                 kvm_unlink_unsync_page(kvm, sp);
         if (!sp->root_count) {
+               /* Count self */
+               ret++;
                 hlist_del(&sp->hash_link);
                 kvm_mmu_free_page(kvm, sp);
         } else {
@@ -1539,7 +1556,6 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
                         page = container_of(kvm->arch.active_mmu_pages.prev,
                                             struct kvm_mmu_page, link);
                         used_pages -= kvm_mmu_zap_page(kvm, page);
-                       used_pages--;
                 }
                 kvm_nr_mmu_pages = used_pages;
                 kvm->arch.n_free_mmu_pages = 0;
@@ -1723,47 +1739,61 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
  }
  EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
  
-static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+       trace_kvm_mmu_unsync_page(sp);
+       ++vcpu->kvm->stat.mmu_unsync;   /* bumped once per page marked unsync here */
+       sp->unsync = 1;
+
+       kvm_mmu_mark_parents_unsync(sp);
+       mmu_convert_notrap(sp);
+}
+
+static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
  {
-       unsigned index;
         struct hlist_head *bucket;
         struct kvm_mmu_page *s;
         struct hlist_node *node, *n;
+       unsigned index;
  
-       index = kvm_page_table_hashfn(sp->gfn);
+       index = kvm_page_table_hashfn(gfn);
         bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-       /* don't unsync if pagetable is shadowed with multiple roles */
+
         hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
-               if (s->gfn != sp->gfn || s->role.direct)
+               if (s->gfn != gfn || s->role.direct || s->unsync ||
+                     s->role.invalid)
                         continue;
-               if (s->role.word != sp->role.word)
-                       return 1;
+               WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+               __kvm_unsync_page(vcpu, s);
         }
-       trace_kvm_mmu_unsync_page(sp);
-       ++vcpu->kvm->stat.mmu_unsync;
-       sp->unsync = 1;
-
-       kvm_mmu_mark_parents_unsync(sp);
-
-       mmu_convert_notrap(sp);
-       return 0;
  }
  
  static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
                                   bool can_unsync)
  {
-       struct kvm_mmu_page *shadow;
+       unsigned index;
+       struct hlist_head *bucket;
+       struct kvm_mmu_page *s;
+       struct hlist_node *node, *n;
+       bool need_unsync = false;
+
+       index = kvm_page_table_hashfn(gfn);
+       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+               if (s->gfn != gfn || s->role.direct || s->role.invalid)
+                       continue;
  
-       shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-       if (shadow) {
-               if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+               if (s->role.level != PT_PAGE_TABLE_LEVEL)
                         return 1;
-               if (shadow->unsync)
-                       return 0;
-               if (can_unsync && oos_shadow)
-                       return kvm_unsync_page(vcpu, shadow);
-               return 1;
+
+               if (!need_unsync && !s->unsync) {
+                       if (!can_unsync || !oos_shadow)
+                               return 1;
+                       need_unsync = true;
+               }
         }
+       if (need_unsync)
+               kvm_unsync_pages(vcpu, gfn);
         return 0;
  }
  
@@ -1809,6 +1839,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 if (level > PT_PAGE_TABLE_LEVEL &&
                     has_wrprotected_page(vcpu->kvm, gfn, level)) {
                         ret = 1;
+                       rmap_remove(vcpu->kvm, sptep);
                         spte = shadow_trap_nonpresent_pte;
                         goto set_pte;
                 }
@@ -1960,6 +1991,27 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
         return pt_write;
  }
  
+static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+{
+       char buf[1];
+       void __user *hva;
+       int r;
+
+       /* Read one byte through the gfn's hva: touching the page is what is meant to raise SIGBUS */
+       hva = (void __user *)gfn_to_hva(kvm, gfn);
+       r = copy_from_user(buf, hva, 1);        /* r is never read; only the access itself matters */
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+       kvm_release_pfn_clean(pfn);
+       if (is_hwpoison_pfn(pfn)) {
+               kvm_send_hwpoison_signal(kvm, gfn);
+               return 0;       /* hwpoison pfn: the signal path was taken */
+       }
+       return 1;       /* any other error pfn */
+}
+
  static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
  {
         int r;
@@ -1983,10 +2035,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
         pfn = gfn_to_pfn(vcpu->kvm, gfn);
  
         /* mmio */
-       if (is_error_pfn(pfn)) {
-               kvm_release_pfn_clean(pfn);
-               return 1;
-       }
+       if (is_error_pfn(pfn))
+               return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
  
         spin_lock(&vcpu->kvm->mmu_lock);
         if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2073,6 +2123,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
                         root_gfn = 0;
                 }
                 spin_lock(&vcpu->kvm->mmu_lock);
+               kvm_mmu_free_some_pages(vcpu);
                 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
                                       PT64_ROOT_LEVEL, direct,
                                       ACC_ALL, NULL);
@@ -2103,6 +2154,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
                         root_gfn = i << 30;
                 }
                 spin_lock(&vcpu->kvm->mmu_lock);
+               kvm_mmu_free_some_pages(vcpu);
                 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
                                       PT32_ROOT_LEVEL, direct,
                                       ACC_ALL, NULL);
@@ -2198,10 +2250,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
         mmu_seq = vcpu->kvm->mmu_notifier_seq;
         smp_rmb();
         pfn = gfn_to_pfn(vcpu->kvm, gfn);
-       if (is_error_pfn(pfn)) {
-               kvm_release_pfn_clean(pfn);
-               return 1;
-       }
+       if (is_error_pfn(pfn))
+               return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
         spin_lock(&vcpu->kvm->mmu_lock);
         if (mmu_notifier_retry(vcpu, mmu_seq))
                 goto out_unlock;
@@ -2457,10 +2507,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
  static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
  {
         ASSERT(vcpu);
-       if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+       if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               /* mmu.free() should set root_hpa = INVALID_PAGE */
                 vcpu->arch.mmu.free(vcpu);
-               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-       }
  }
  
  int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -2477,9 +2526,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
         r = mmu_topup_memory_caches(vcpu);
         if (r)
                 goto out;
-       spin_lock(&vcpu->kvm->mmu_lock);
-       kvm_mmu_free_some_pages(vcpu);
-       spin_unlock(&vcpu->kvm->mmu_lock);
         r = mmu_alloc_roots(vcpu);
         spin_lock(&vcpu->kvm->mmu_lock);
         mmu_sync_roots(vcpu);
@@ -2795,11 +2841,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
                 return 1;
         case EMULATE_DO_MMIO:
                 ++vcpu->stat.mmio_exits;
-               return 0;
+               /* fall through */
         case EMULATE_FAIL:
-               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-               vcpu->run->internal.ndata = 0;
                 return 0;
         default:
                 BUG();
@@ -2923,10 +2966,10 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
  
         page = container_of(kvm->arch.active_mmu_pages.prev,
                             struct kvm_mmu_page, link);
-       return kvm_mmu_zap_page(kvm, page) + 1;
+       return kvm_mmu_zap_page(kvm, page);
  }
  
-static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
+static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
  {
         struct kvm *kvm;
         struct kvm *kvm_freed = NULL;