Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

author Linus Torvalds <torvalds@linux-foundation.org>

Wed, 9 Jun 2021 20:09:57 +0000 (13:09 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 9 Jun 2021 20:09:57 +0000 (13:09 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 9 Jun 2021 20:09:57 +0000 (13:09 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 9 Jun 2021 20:09:57 +0000 (13:09 -0700)
diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst

index 5bfe28b..20d85da 100644 (file)
--- a/Documentation/virt/kvm/mmu.rst
+++ b/Documentation/virt/kvm/mmu.rst
@@ -171,8 +171,8 @@ Shadow pages contain the following information:
      shadow pages) so role.quadrant takes values in the range 0..3.  Each
      quadrant maps 1GB virtual address space.
    role.access:
-    Inherited guest access permissions in the form uwx.  Note execute
-    permission is positive, not negative.
+    Inherited guest access permissions from the parent ptes in the form uwx.
+    Note execute permission is positive, not negative.
    role.invalid:
      The page is invalid and should not be used.  It is a root page that is
      currently pinned (by a cpu hardware register pointing to it); once it is
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c

index 8120e86..6d72d8f 100644 (file)
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1494,6 +1494,15 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
  
  static void cancel_hv_timer(struct kvm_lapic *apic);
  
+static void cancel_apic_timer(struct kvm_lapic *apic)
+{
+       hrtimer_cancel(&apic->lapic_timer.timer);
+       preempt_disable();
+       if (apic->lapic_timer.hv_timer_in_use)
+               cancel_hv_timer(apic);
+       preempt_enable();
+}
+
  static void apic_update_lvtt(struct kvm_lapic *apic)
  {
         u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
@@ -1502,11 +1511,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
         if (apic->lapic_timer.timer_mode != timer_mode) {
                 if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
                                 APIC_LVT_TIMER_TSCDEADLINE)) {
-                       hrtimer_cancel(&apic->lapic_timer.timer);
-                       preempt_disable();
-                       if (apic->lapic_timer.hv_timer_in_use)
-                               cancel_hv_timer(apic);
-                       preempt_enable();
+                       cancel_apic_timer(apic);
                         kvm_lapic_set_reg(apic, APIC_TMICT, 0);
                         apic->lapic_timer.period = 0;
                         apic->lapic_timer.tscdeadline = 0;
@@ -2092,7 +2097,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
                 if (apic_lvtt_tscdeadline(apic))
                         break;
  
-               hrtimer_cancel(&apic->lapic_timer.timer);
+               cancel_apic_timer(apic);
                 kvm_lapic_set_reg(apic, APIC_TMICT, val);
                 start_apic_timer(apic);
                 break;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h

index 70b7e44..823a591 100644 (file)
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -90,8 +90,8 @@ struct guest_walker {
         gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
         pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
         bool pte_writable[PT_MAX_FULL_LEVELS];
-       unsigned pt_access;
-       unsigned pte_access;
+       unsigned int pt_access[PT_MAX_FULL_LEVELS];
+       unsigned int pte_access;
         gfn_t gfn;
         struct x86_exception fault;
  };
@@ -418,13 +418,15 @@ retry_walk:
                 }
  
                 walker->ptes[walker->level - 1] = pte;
+
+               /* Convert to ACC_*_MASK flags for struct guest_walker.  */
+               walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
         } while (!is_last_gpte(mmu, walker->level, pte));
  
         pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
         accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
  
         /* Convert to ACC_*_MASK flags for struct guest_walker.  */
-       walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
         walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
         errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
         if (unlikely(errcode))
@@ -463,7 +465,8 @@ retry_walk:
         }
  
         pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
-                __func__, (u64)pte, walker->pte_access, walker->pt_access);
+                __func__, (u64)pte, walker->pte_access,
+                walker->pt_access[walker->level - 1]);
         return 1;
  
  error:
@@ -643,7 +646,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
         bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
         struct kvm_mmu_page *sp = NULL;
         struct kvm_shadow_walk_iterator it;
-       unsigned direct_access, access = gw->pt_access;
+       unsigned int direct_access, access;
         int top_level, level, req_level, ret;
         gfn_t base_gfn = gw->gfn;
  
@@ -675,6 +678,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
                 sp = NULL;
                 if (!is_shadow_present_pte(*it.sptep)) {
                         table_gfn = gw->table_gfn[it.level - 2];
+                       access = gw->pt_access[it.level - 2];
                         sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
                                               false, access);
                 }
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c

index 5bc887e..e0ce5da 100644 (file)
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1103,10 +1103,9 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
         struct sev_data_send_start data;
         int ret;
  
+       memset(&data, 0, sizeof(data));
         data.handle = sev->handle;
         ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
-       if (ret < 0)
-               return ret;
  
         params->session_len = data.session_len;
         if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
@@ -1215,10 +1214,9 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
         struct sev_data_send_update_data data;
         int ret;
  
+       memset(&data, 0, sizeof(data));
         data.handle = sev->handle;
         ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
-       if (ret < 0)
-               return ret;
  
         params->hdr_len = data.hdr_len;
         params->trans_len = data.trans_len;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h

index a61c015..4f83914 100644 (file)
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1550,16 +1550,16 @@ TRACE_EVENT(kvm_nested_vmenter_failed,
         TP_ARGS(msg, err),
  
         TP_STRUCT__entry(
-               __field(const char *, msg)
+               __string(msg, msg)
                 __field(u32, err)
         ),
  
         TP_fast_assign(
-               __entry->msg = msg;
+               __assign_str(msg, msg);
                 __entry->err = err;
         ),
  
-       TP_printk("%s%s", __entry->msg, !__entry->err ? "" :
+       TP_printk("%s%s", __get_str(msg), !__entry->err ? "" :
                 __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS))
  );
  
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index b594275..6d3955a 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3072,6 +3072,19 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
  static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
  {
         ++vcpu->stat.tlb_flush;
+
+       if (!tdp_enabled) {
+               /*
+                * A TLB flush on behalf of the guest is equivalent to
+                * INVPCID(all), toggling CR4.PGE, etc., which requires
+                * a forced sync of the shadow page tables.  Unload the
+                * entire MMU here and the subsequent load will sync the
+                * shadow page tables, and also flush the TLB.
+                */
+               kvm_mmu_unload(vcpu);
+               return;
+       }
+
         static_call(kvm_x86_tlb_flush_guest)(vcpu);
  }
  
@@ -3101,9 +3114,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
          * expensive IPIs.
          */
         if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+               u8 st_preempted = xchg(&st->preempted, 0);
+
                 trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
-                                      st->preempted & KVM_VCPU_FLUSH_TLB);
-               if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+                                      st_preempted & KVM_VCPU_FLUSH_TLB);
+               if (st_preempted & KVM_VCPU_FLUSH_TLB)
                         kvm_vcpu_flush_tlb_guest(vcpu);
         } else {
                 st->preempted = 0;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h

index 76102ef..8583ed3 100644 (file)
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1185,7 +1185,15 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
  static inline unsigned long
  __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
  {
-       return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+       /*
+        * The index was checked originally in search_memslots.  To prevent
+        * a malicious guest from building a Spectre gadget out of e.g. page
+        * table walks, do not let the processor speculate loads outside
+        * the guest's registered memslots.
+        */
+       unsigned long offset = gfn - slot->base_gfn;
+       offset = array_index_nospec(offset, slot->npages);
+       return slot->userspace_addr + offset * PAGE_SIZE;
  }
  
  static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h

index fcd8e38..3573956 100644 (file)
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -43,6 +43,7 @@ enum vm_guest_mode {
         VM_MODE_P40V48_4K,
         VM_MODE_P40V48_64K,
         VM_MODE_PXXV48_4K,      /* For 48bits VA but ANY bits PA */
+       VM_MODE_P47V64_4K,
         NUM_VM_MODES,
  };
  
@@ -60,7 +61,7 @@ enum vm_guest_mode {
  
  #elif defined(__s390x__)
  
-#define VM_MODE_DEFAULT                        VM_MODE_P52V48_4K
+#define VM_MODE_DEFAULT                        VM_MODE_P47V64_4K
  #define MIN_PAGE_SHIFT                 12U
  #define ptes_per_page(page_size)       ((page_size) / 16)
  
@@ -285,10 +286,11 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me
                                             uint32_t num_percpu_pages, void *guest_code,
                                             uint32_t vcpuids[]);
  
-/* Like vm_create_default_with_vcpus, but accepts mode as a parameter */
+/* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory size as parameters */
  struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
-                                   uint64_t extra_mem_pages, uint32_t num_percpu_pages,
-                                   void *guest_code, uint32_t vcpuids[]);
+                                   uint64_t slot0_mem_pages, uint64_t extra_mem_pages,
+                                   uint32_t num_percpu_pages, void *guest_code,
+                                   uint32_t vcpuids[]);
  
  /*
   * Adds a vCPU with reasonable defaults (e.g. a stack)
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c

index 1c4753f..82171f1 100644 (file)
--- a/tools/testing/selftests/kvm/kvm_page_table_test.c
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -268,7 +268,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
  
         /* Create a VM with enough guest pages */
         guest_num_pages = test_mem_size / guest_page_size;
-       vm = vm_create_with_vcpus(mode, nr_vcpus,
+       vm = vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES,
                                   guest_num_pages, 0, guest_code, NULL);
  
         /* Align down GPA of the testing memslot */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c

index 28e528c..5c70596 100644 (file)
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -175,6 +175,7 @@ const char *vm_guest_mode_string(uint32_t i)
                 [VM_MODE_P40V48_4K]     = "PA-bits:40,  VA-bits:48,  4K pages",
                 [VM_MODE_P40V48_64K]    = "PA-bits:40,  VA-bits:48, 64K pages",
                 [VM_MODE_PXXV48_4K]     = "PA-bits:ANY, VA-bits:48,  4K pages",
+               [VM_MODE_P47V64_4K]     = "PA-bits:47,  VA-bits:64,  4K pages",
         };
         _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
                        "Missing new mode strings?");
@@ -192,6 +193,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = {
         { 40, 48,  0x1000, 12 },
         { 40, 48, 0x10000, 16 },
         {  0,  0,  0x1000, 12 },
+       { 47, 64,  0x1000, 12 },
  };
  _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
                "Missing new mode params?");
@@ -277,6 +279,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
                 TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms");
  #endif
                 break;
+       case VM_MODE_P47V64_4K:
+               vm->pgtable_levels = 5;
+               break;
         default:
                 TEST_FAIL("Unknown guest mode, mode: 0x%x", mode);
         }
@@ -308,21 +313,50 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
         return vm;
  }
  
+/*
+ * VM Create with customized parameters
+ *
+ * Input Args:
+ *   mode - VM Mode (e.g. VM_MODE_P52V48_4K)
+ *   nr_vcpus - VCPU count
+ *   slot0_mem_pages - Slot0 physical memory size
+ *   extra_mem_pages - Non-slot0 physical memory total size
+ *   num_percpu_pages - Per-cpu physical memory pages
+ *   guest_code - Guest entry point
+ *   vcpuids - VCPU IDs
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Pointer to opaque structure that describes the created VM.
+ *
+ * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K),
+ * with customized slot0 memory size, at least 512 pages currently.
+ * extra_mem_pages is only used to calculate the maximum page table size,
+ * no real memory allocation for non-slot0 memory in this function.
+ */
  struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
-                                   uint64_t extra_mem_pages, uint32_t num_percpu_pages,
-                                   void *guest_code, uint32_t vcpuids[])
+                                   uint64_t slot0_mem_pages, uint64_t extra_mem_pages,
+                                   uint32_t num_percpu_pages, void *guest_code,
+                                   uint32_t vcpuids[])
  {
+       uint64_t vcpu_pages, extra_pg_pages, pages;
+       struct kvm_vm *vm;
+       int i;
+
+       /* Force slot0 memory size to be no smaller than DEFAULT_GUEST_PHY_PAGES */
+       if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
+               slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES;
+
         /* The maximum page table size for a memory region will be when the
          * smallest pages are used. Considering each page contains x page
          * table descriptors, the total extra size for page tables (for extra
          * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller
          * than N/x*2.
          */
-       uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus;
-       uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2;
-       uint64_t pages = DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages + extra_pg_pages;
-       struct kvm_vm *vm;
-       int i;
+       vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus;
+       extra_pg_pages = (slot0_mem_pages + extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2;
+       pages = slot0_mem_pages + vcpu_pages + extra_pg_pages;
  
         TEST_ASSERT(nr_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS),
                     "nr_vcpus = %d too large for host, max-vcpus = %d",
@@ -354,8 +388,8 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me
                                             uint32_t num_percpu_pages, void *guest_code,
                                             uint32_t vcpuids[])
  {
-       return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, extra_mem_pages,
-                                   num_percpu_pages, guest_code, vcpuids);
+       return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, DEFAULT_GUEST_PHY_PAGES,
+                                   extra_mem_pages, num_percpu_pages, guest_code, vcpuids);
  }
  
  struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c

index abf3818..7397ca2 100644 (file)
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
@@ -69,7 +69,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
         TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0,
                     "Guest memory size is not guest page size aligned.");
  
-       vm = vm_create_with_vcpus(mode, vcpus,
+       vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES,
                                   (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size,
                                   0, guest_code, NULL);
  
diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c

index 9307f25..1123965 100644 (file)
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -267,7 +267,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots,
         data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots);
         TEST_ASSERT(data->hva_slots, "malloc() fail");
  
-       data->vm = vm_create_default(VCPU_ID, 1024, guest_code);
+       data->vm = vm_create_default(VCPU_ID, mempages, guest_code);
  
         pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n",
                 max_mem_slots - 1, data->pages_per_slot, rempages);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 9 Jun 2021 20:09:57 +0000 (13:09 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 9 Jun 2021 20:09:57 +0000 (13:09 -0700)
Documentation/virt/kvm/mmu.rst		patch \| blob \| history
arch/x86/kvm/lapic.c		patch \| blob \| history
arch/x86/kvm/mmu/paging_tmpl.h		patch \| blob \| history
arch/x86/kvm/svm/sev.c		patch \| blob \| history
arch/x86/kvm/trace.h		patch \| blob \| history
arch/x86/kvm/x86.c		patch \| blob \| history
include/linux/kvm_host.h		patch \| blob \| history
tools/testing/selftests/kvm/include/kvm_util.h		patch \| blob \| history
tools/testing/selftests/kvm/kvm_page_table_test.c		patch \| blob \| history
tools/testing/selftests/kvm/lib/kvm_util.c		patch \| blob \| history
tools/testing/selftests/kvm/lib/perf_test_util.c		patch \| blob \| history
tools/testing/selftests/kvm/memslot_perf_test.c		patch \| blob \| history