Merge tag 'kvm-s390-master-5.15-2' of git://git.kernel.org/pub/scm/linux/kernel/git...
author		Paolo Bonzini <pbonzini@redhat.com>	Mon, 25 Oct 2021 13:08:56 +0000 (09:08 -0400)
committer	Paolo Bonzini <pbonzini@redhat.com>	Mon, 25 Oct 2021 13:08:56 +0000 (09:08 -0400)
KVM: s390: Fixes for interrupt delivery

Two bugs that might result in CPUs not being woken up when interrupts are
pending.

13 files changed:
arch/arm64/kvm/hyp/include/nvhe/gfp.h
arch/arm64/kvm/hyp/nvhe/mem_protect.c
arch/arm64/kvm/hyp/nvhe/page_alloc.c
arch/arm64/kvm/mmu.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/kvmclock.h
arch/x86/kernel/kvmclock.c
arch/x86/kvm/cpuid.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/x86.c
drivers/ptp/ptp_kvm_x86.c
tools/testing/selftests/kvm/include/x86_64/processor.h
tools/testing/selftests/kvm/rseq_test.c

diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
index fb0f523..0a048dc 100644
@@ -24,6 +24,7 @@ struct hyp_pool {
 
 /* Allocation */
 void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order);
+void hyp_split_page(struct hyp_page *page);
 void hyp_get_page(struct hyp_pool *pool, void *addr);
 void hyp_put_page(struct hyp_pool *pool, void *addr);
 
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index bacd493..34eeb52 100644
@@ -35,7 +35,18 @@ const u8 pkvm_hyp_id = 1;
 
 static void *host_s2_zalloc_pages_exact(size_t size)
 {
-       return hyp_alloc_pages(&host_s2_pool, get_order(size));
+       void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size));
+
+       hyp_split_page(hyp_virt_to_page(addr));
+
+       /*
+        * The size of concatenated PGDs is always a power-of-two multiple of PAGE_SIZE,
+        * so there should be no need to free any of the tail pages to make the
+        * allocation exact.
+        */
+       WARN_ON(size != (PAGE_SIZE << get_order(size)));
+
+       return addr;
 }
 
 static void *host_s2_zalloc_page(void *pool)
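
Note: host_s2_zalloc_pages_exact() hands out a higher-order block, but the host stage-2 page-table code takes and drops references on the constituent pages individually. Splitting the block with hyp_split_page() (added in page_alloc.c below) gives every page its own order-0 refcount so those per-page puts stay balanced; the WARN_ON() records the assumption, spelled out in the comment, that concatenated PGD allocations already span an exact power-of-two number of pages, so no tail pages ever need to be returned.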
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index 41fc25b..0bd7701 100644
@@ -152,6 +152,7 @@ static inline void hyp_page_ref_inc(struct hyp_page *p)
 
 static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
 {
+       BUG_ON(!p->refcount);
        p->refcount--;
        return (p->refcount == 0);
 }
@@ -193,6 +194,20 @@ void hyp_get_page(struct hyp_pool *pool, void *addr)
        hyp_spin_unlock(&pool->lock);
 }
 
+void hyp_split_page(struct hyp_page *p)
+{
+       unsigned short order = p->order;
+       unsigned int i;
+
+       p->order = 0;
+       for (i = 1; i < (1 << order); i++) {
+               struct hyp_page *tail = p + i;
+
+               tail->order = 0;
+               hyp_set_page_refcounted(tail);
+       }
+}
+
 void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order)
 {
        unsigned short i = order;
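
A minimal sketch (not part of the patch) of what the split buys, using only the pool and helpers visible above: after hyp_split_page(), every page of an order-2 allocation carries its own order-0 refcount, so per-page puts stay balanced and no longer trip the new BUG_ON() underflow check.

        /* Hypothetical illustration: allocate 4 contiguous pages, then release them singly. */
        void *addr = hyp_alloc_pages(&host_s2_pool, 2);     /* head page: order 2, refcount 1 */

        hyp_split_page(hyp_virt_to_page(addr));             /* head and tails now order 0, refcount 1 each */

        hyp_put_page(&host_s2_pool, addr + PAGE_SIZE);      /* tail pages can be dropped individually */
        hyp_put_page(&host_s2_pool, addr + 2 * PAGE_SIZE);  /* without corrupting the head's count */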
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1a94a7c..69bd173 100644
@@ -1529,8 +1529,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                 * when updating the PG_mte_tagged page flag, see
                 * sanitise_mte_tags for more details.
                 */
-               if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED)
-                       return -EINVAL;
+               if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
+                       ret = -EINVAL;
+                       break;
+               }
 
                if (vma->vm_flags & VM_PFNMAP) {
                        /* IO region dirty page logging not allowed */
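
Note: kvm_arch_prepare_memory_region() takes the mmap read lock before walking the VMAs, and returning -EINVAL from inside the walk skipped the common exit path that drops it. Setting ret and breaking out of the loop (the same pattern the VM_PFNMAP error case below uses) lets the MTE-with-VM_SHARED rejection go through that unlock path as well.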
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f8f48a7..7077137 100644
@@ -1097,7 +1097,7 @@ struct kvm_arch {
        u64 cur_tsc_generation;
        int nr_vcpus_matched_tsc;
 
-       spinlock_t pvclock_gtod_sync_lock;
+       raw_spinlock_t pvclock_gtod_sync_lock;
        bool use_master_clock;
        u64 master_kernel_ns;
        u64 master_cycle_now;
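
Note: pvclock_gtod_sync_lock becomes a raw_spinlock_t, which stays a genuine spinning lock even on PREEMPT_RT, where an ordinary spinlock_t is turned into a sleeping lock; the x86.c hunk further down converts every lock/unlock site to the raw_spin_* variants to match. The likely motivation is that this lock is taken from contexts that must not sleep.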
diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h
index eceea92..6c57651 100644
@@ -2,6 +2,20 @@
 #ifndef _ASM_X86_KVM_CLOCK_H
 #define _ASM_X86_KVM_CLOCK_H
 
+#include <linux/percpu.h>
+
 extern struct clocksource kvm_clock;
 
+DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+
+static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
+{
+       return &this_cpu_read(hv_clock_per_cpu)->pvti;
+}
+
+static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
+{
+       return this_cpu_read(hv_clock_per_cpu);
+}
+
 #endif /* _ASM_X86_KVM_CLOCK_H */
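
hv_clock_per_cpu and its two accessors move from kvmclock.c into this header (the per-CPU symbol is exported in the kvmclock.c hunk below) so that other in-tree code, such as the ptp_kvm driver further down, can read the calling CPU's pvclock area. A rough sketch of a reader, assuming the seqcount helpers from <asm/pvclock.h> and that preemption is already disabled:

        struct pvclock_vcpu_time_info *src = this_cpu_pvti();
        unsigned int version;
        u64 tsc;

        do {
                version = pvclock_read_begin(src);  /* snapshot the pvclock update sequence count */
                tsc = rdtsc_ordered();              /* sample the TSC inside the retry loop */
        } while (pvclock_read_retry(src, version));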
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ad273e5..73c74b9 100644
@@ -49,18 +49,9 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
 static struct pvclock_vsyscall_time_info
                        hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
 static struct pvclock_wall_clock wall_clock __bss_decrypted;
-static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
 static struct pvclock_vsyscall_time_info *hvclock_mem;
-
-static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
-{
-       return &this_cpu_read(hv_clock_per_cpu)->pvti;
-}
-
-static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
-{
-       return this_cpu_read(hv_clock_per_cpu);
-}
+DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);
 
 /*
  * The wallclock is the time of day when we booted. Since then, some time may
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index fe03bd9..751aa85 100644
@@ -65,8 +65,8 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
        for (i = 0; i < nent; i++) {
                e = &entries[i];
 
-               if (e->function == function && (e->index == index ||
-                   !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
+               if (e->function == function &&
+                   (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index))
                        return e;
        }
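
Note: the match condition in cpuid_entry2_find() is reordered so that KVM_CPUID_FLAG_SIGNIFCANT_INDEX is consulted before the index comparison: an entry whose index is not significant matches whatever index the caller passed, and the index value is only compared when the flag says it matters. The two orderings are logically equivalent here; the new form simply reads in the intended precedence.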
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index c36b5fe..e672493 100644
@@ -2583,7 +2583,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
                return -EINVAL;
 
        return kvm_sev_es_string_io(&svm->vcpu, size, port,
-                                   svm->ghcb_sa, svm->ghcb_sa_len, in);
+                                   svm->ghcb_sa, svm->ghcb_sa_len / size, in);
 }
 
 void sev_es_init_vmcb(struct vcpu_svm *svm)
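
Note: ghcb_sa_len is a byte count, while the count argument of kvm_sev_es_string_io() is the number of I/O repetitions. Take a hypothetical "rep outsw" of 4 words: the guest places 8 bytes in the GHCB scratch area, so ghcb_sa_len = 8 and size = 2; passing 8 would have emulated eight 2-byte transfers, whereas 8 / 2 = 4 is the intended repetition count.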
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index aabd3a2..5c82eff 100644
@@ -2542,7 +2542,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        kvm_vcpu_write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-       spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
        if (!matched) {
                kvm->arch.nr_vcpus_matched_tsc = 0;
        } else if (!already_matched) {
@@ -2550,7 +2550,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        }
 
        kvm_track_tsc_matching(vcpu);
-       spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
 }
 
 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2780,9 +2780,9 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
        kvm_make_mclock_inprogress_request(kvm);
 
        /* no guest entries from this point */
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        pvclock_update_vm_gtod_copy(kvm);
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2800,15 +2800,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
        unsigned long flags;
        u64 ret;
 
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        if (!ka->use_master_clock) {
-               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
                return get_kvmclock_base_ns() + ka->kvmclock_offset;
        }
 
        hv_clock.tsc_timestamp = ka->master_cycle_now;
        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        /* both __this_cpu_read() and rdtsc() should be on the same cpu */
        get_cpu();
@@ -2902,13 +2902,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
         * If the host uses TSC clock, then passthrough TSC as stable
         * to the guest.
         */
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        use_master_clock = ka->use_master_clock;
        if (use_master_clock) {
                host_tsc = ka->master_cycle_now;
                kernel_ns = ka->master_kernel_ns;
        }
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
@@ -6100,13 +6100,13 @@ set_pit2_out:
                 * is slightly ahead) here we risk going negative on unsigned
                 * 'system_time' when 'user_ns.clock' is very small.
                 */
-               spin_lock_irq(&ka->pvclock_gtod_sync_lock);
+               raw_spin_lock_irq(&ka->pvclock_gtod_sync_lock);
                if (kvm->arch.use_master_clock)
                        now_ns = ka->master_kernel_ns;
                else
                        now_ns = get_kvmclock_base_ns();
                ka->kvmclock_offset = user_ns.clock - now_ns;
-               spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
+               raw_spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
 
                kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
                break;
@@ -8139,9 +8139,9 @@ static void kvm_hyperv_tsc_notifier(void)
        list_for_each_entry(kvm, &vm_list, vm_list) {
                struct kvm_arch *ka = &kvm->arch;
 
-               spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
                pvclock_update_vm_gtod_copy(kvm);
-               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
                kvm_for_each_vcpu(cpu, vcpu, kvm)
                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -11182,7 +11182,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        raw_spin_lock_init(&kvm->arch.tsc_write_lock);
        mutex_init(&kvm->arch.apic_map_lock);
-       spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+       raw_spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
        kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
        pvclock_update_vm_gtod_copy(kvm);
@@ -11392,7 +11392,8 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
                int level = i + 1;
                int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
 
-               WARN_ON(slot->arch.rmap[i]);
+               if (slot->arch.rmap[i])
+                       continue;
 
                slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
                if (!slot->arch.rmap[i]) {
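
Note: memslot_rmap_alloc() previously warned if a level's rmap array was already present and then allocated over it anyway. Skipping such levels instead lets the function be called for a slot whose rmaps are already partially set up, presumably via the lazy rmap-allocation path, without leaking the existing arrays.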
diff --git a/drivers/ptp/ptp_kvm_x86.c b/drivers/ptp/ptp_kvm_x86.c
index 3dd519d..d0096cd 100644
@@ -15,8 +15,6 @@
 #include <linux/ptp_clock_kernel.h>
 #include <linux/ptp_kvm.h>
 
-struct pvclock_vsyscall_time_info *hv_clock;
-
 static phys_addr_t clock_pair_gpa;
 static struct kvm_clock_pairing clock_pair;
 
@@ -28,8 +26,7 @@ int kvm_arch_ptp_init(void)
                return -ENODEV;
 
        clock_pair_gpa = slow_virt_to_phys(&clock_pair);
-       hv_clock = pvclock_get_pvti_cpu0_va();
-       if (!hv_clock)
+       if (!pvclock_get_pvti_cpu0_va())
                return -ENODEV;
 
        ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
@@ -64,10 +61,8 @@ int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec,
        struct pvclock_vcpu_time_info *src;
        unsigned int version;
        long ret;
-       int cpu;
 
-       cpu = smp_processor_id();
-       src = &hv_clock[cpu].pvti;
+       src = this_cpu_pvti();
 
        do {
                /*
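
Note: kvm_arch_ptp_get_crosststamp() now reads the calling CPU's pvclock area through this_cpu_pvti() instead of indexing from the CPU 0 pointer returned by pvclock_get_pvti_cpu0_va(). The per-CPU areas are not one contiguous array: only the first HVC_BOOT_ARRAY_SIZE CPUs live in hv_clock_boot (see kvmclock.c above), so indexing by processor id could read the wrong slot, or past the array, on larger systems.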
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index eba8bd0..05e65ca 100644
@@ -315,7 +315,7 @@ static inline void set_xmm(int n, unsigned long val)
 #define GET_XMM(__xmm)                                                 \
 ({                                                                     \
        unsigned long __val;                                            \
-       asm volatile("movq %%"#__xmm", %0" : "=r"(__val) : : #__xmm);   \
+       asm volatile("movq %%"#__xmm", %0" : "=r"(__val));              \
        __val;                                                          \
 })
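
Note: the xmm register is only read by this asm, not modified, so it is dropped from the clobber list; the output operand __val is the only thing written.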
 
diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c
index c5e0dd6..4158da0 100644
@@ -10,6 +10,7 @@
 #include <signal.h>
 #include <syscall.h>
 #include <sys/ioctl.h>
+#include <sys/sysinfo.h>
 #include <asm/barrier.h>
 #include <linux/atomic.h>
 #include <linux/rseq.h>
@@ -39,6 +40,7 @@ static __thread volatile struct rseq __rseq = {
 
 static pthread_t migration_thread;
 static cpu_set_t possible_mask;
+static int min_cpu, max_cpu;
 static bool done;
 
 static atomic_t seq_cnt;
@@ -57,20 +59,37 @@ static void sys_rseq(int flags)
        TEST_ASSERT(!r, "rseq failed, errno = %d (%s)", errno, strerror(errno));
 }
 
+static int next_cpu(int cpu)
+{
+       /*
+        * Advance to the next CPU, skipping those that weren't in the original
+        * affinity set.  Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's
+        * data storage is considered opaque.  Note, if this task is pinned
+        * to a small set of discontiguous CPUs, e.g. 2 and 1023, this loop will
+        * burn a lot of cycles and the test will take longer than normal to
+        * complete.
+        */
+       do {
+               cpu++;
+               if (cpu > max_cpu) {
+                       cpu = min_cpu;
+                       TEST_ASSERT(CPU_ISSET(cpu, &possible_mask),
+                                   "Min CPU = %d must always be usable", cpu);
+                       break;
+               }
+       } while (!CPU_ISSET(cpu, &possible_mask));
+
+       return cpu;
+}
+
 static void *migration_worker(void *ign)
 {
        cpu_set_t allowed_mask;
-       int r, i, nr_cpus, cpu;
+       int r, i, cpu;
 
        CPU_ZERO(&allowed_mask);
 
-       nr_cpus = CPU_COUNT(&possible_mask);
-
-       for (i = 0; i < NR_TASK_MIGRATIONS; i++) {
-               cpu = i % nr_cpus;
-               if (!CPU_ISSET(cpu, &possible_mask))
-                       continue;
-
+       for (i = 0, cpu = min_cpu; i < NR_TASK_MIGRATIONS; i++, cpu = next_cpu(cpu)) {
                CPU_SET(cpu, &allowed_mask);
 
                /*
@@ -154,6 +173,36 @@ static void *migration_worker(void *ign)
        return NULL;
 }
 
+static int calc_min_max_cpu(void)
+{
+       int i, cnt, nproc;
+
+       if (CPU_COUNT(&possible_mask) < 2)
+               return -EINVAL;
+
+       /*
+        * CPU_SET doesn't provide a FOR_EACH helper; get the min/max CPU that
+        * this task is affined to in order to reduce the time spent querying
+        * unusable CPUs, e.g. if this task is pinned to a small percentage of
+        * total CPUs.
+        */
+       nproc = get_nprocs_conf();
+       min_cpu = -1;
+       max_cpu = -1;
+       cnt = 0;
+
+       for (i = 0; i < nproc; i++) {
+               if (!CPU_ISSET(i, &possible_mask))
+                       continue;
+               if (min_cpu == -1)
+                       min_cpu = i;
+               max_cpu = i;
+               cnt++;
+       }
+
+       return (cnt < 2) ? -EINVAL : 0;
+}
+
 int main(int argc, char *argv[])
 {
        int r, i, snapshot;
@@ -167,8 +216,8 @@ int main(int argc, char *argv[])
        TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
                    strerror(errno));
 
-       if (CPU_COUNT(&possible_mask) < 2) {
-               print_skip("Only one CPU, task migration not possible\n");
+       if (calc_min_max_cpu()) {
+               print_skip("Only one usable CPU, task migration not possible");
                exit(KSFT_SKIP);
        }
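
Note: the old scheme, cpu = i % nr_cpus with nr_cpus = CPU_COUNT(&possible_mask), silently dropped migrations whenever the affinity mask was sparse. Pinned to CPUs 0 and 2, for example, nr_cpus is 2, so the loop only ever computed CPU 0 or 1, skipped every odd iteration, and never migrated to CPU 2 at all. Walking from min_cpu through next_cpu(), with calc_min_max_cpu() precomputing the usable range, guarantees that each of the NR_TASK_MIGRATIONS iterations lands on a CPU the task can actually run on, and the skip message now reflects that at least two usable CPUs are required.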