Merge tag 'kvm-arm-for-4.1-take2' of git://git.kernel.org/pub/scm/linux/kernel/git...
author	Paolo Bonzini <pbonzini@redhat.com>
Wed, 22 Apr 2015 15:08:12 +0000 (17:08 +0200)
committer	Paolo Bonzini <pbonzini@redhat.com>
Wed, 22 Apr 2015 15:08:12 +0000 (17:08 +0200)
KVM/ARM changes for v4.1, take #2:

Rather small this time:

- a fix for a nasty bug with virtual IRQ injection
- a fix for irqfd

30 files changed:
Documentation/virtual/kvm/api.txt
arch/powerpc/include/asm/archrandom.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/time.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/time.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/kvm/book3s_hv_rm_xics.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_pr_papr.c
arch/powerpc/kvm/book3s_xics.c
arch/powerpc/kvm/book3s_xics.h
arch/powerpc/kvm/powerpc.c
arch/powerpc/lib/locks.c
arch/powerpc/platforms/powernv/rng.c
arch/s390/kvm/kvm-s390.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
include/uapi/linux/kvm.h
virt/kvm/kvm_main.c

Documentation/virtual/kvm/api.txt
index bc9f6fe..9fa2bf8 100644
@@ -3573,3 +3573,20 @@ struct {
 @ar   - access register number
 
 KVM handlers should exit to userspace with rc = -EREMOTE.
+
+
+8. Other capabilities.
+----------------------
+
+This section lists capabilities that give information about other
+features of the KVM implementation.
+
+8.1 KVM_CAP_PPC_HWRNG
+
+Architectures: ppc
+
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that the kernel has an implementation of the
+H_RANDOM hypercall backed by a hardware random-number generator.
+If present, the kernel H_RANDOM handler can be enabled for guest use
+with the KVM_CAP_PPC_ENABLE_HCALL capability.
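
For illustration only (not part of the patch): a minimal userspace sketch that checks this capability on a VM file descriptor and then enables the in-kernel H_RANDOM handler as described above. It assumes headers carrying the KVM_CAP_PPC_HWRNG definition added by this series; the H_RANDOM hypercall number (0x300) is taken from PAPR (asm/hvcall.h).

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int enable_h_random(int vm_fd)
    {
            struct kvm_enable_cap cap;

            /* KVM_CHECK_EXTENSION returns a positive value if supported */
            if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_HWRNG) <= 0)
                    return -1;              /* no hardware RNG backing the hcall */

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_PPC_ENABLE_HCALL;
            cap.args[0] = 0x300;            /* H_RANDOM (assumed from hvcall.h) */
            cap.args[1] = 1;                /* 1 = enable, 0 = disable */
            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }
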
arch/powerpc/include/asm/archrandom.h
index bde5311..0cc6eed 100644
@@ -30,8 +30,6 @@ static inline int arch_has_random(void)
        return !!ppc_md.get_random_long;
 }
 
-int powernv_get_random_long(unsigned long *v);
-
 static inline int arch_get_random_seed_long(unsigned long *v)
 {
        return 0;
@@ -47,4 +45,13 @@ static inline int arch_has_random_seed(void)
 
 #endif /* CONFIG_ARCH_RANDOM */
 
+#ifdef CONFIG_PPC_POWERNV
+int powernv_hwrng_present(void);
+int powernv_get_random_long(unsigned long *v);
+int powernv_get_random_real_mode(unsigned long *v);
+#else
+static inline int powernv_hwrng_present(void) { return 0; }
+static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; }
+#endif
+
 #endif /* _ASM_POWERPC_ARCHRANDOM_H */
arch/powerpc/include/asm/kvm_book3s.h
index 942c7b1..578e550 100644
@@ -292,6 +292,9 @@ static inline bool kvmppc_supports_magic_page(struct kvm_vcpu *vcpu)
        return !is_kvmppc_hv_enabled(vcpu->kvm);
 }
 
+extern int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu);
+extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3                        0x113724FA
arch/powerpc/include/asm/kvm_book3s_64.h
index 2d81e20..2b84e48 100644
@@ -85,6 +85,20 @@ static inline long try_lock_hpte(__be64 *hpte, unsigned long bits)
        return old == 0;
 }
 
+static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
+{
+       hpte_v &= ~HPTE_V_HVLOCK;
+       asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
+       hpte[0] = cpu_to_be64(hpte_v);
+}
+
+/* Without barrier */
+static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
+{
+       hpte_v &= ~HPTE_V_HVLOCK;
+       hpte[0] = cpu_to_be64(hpte_v);
+}
+
 static inline int __hpte_actual_psize(unsigned int lp, int psize)
 {
        int i, shift;
@@ -422,6 +436,10 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
        return rcu_dereference_raw_notrace(kvm->memslots);
 }
 
+extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
+
+extern void kvmhv_rm_send_ipi(int cpu);
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
arch/powerpc/include/asm/kvm_host.h
index 8ef0512..d67a838 100644
@@ -227,10 +227,8 @@ struct kvm_arch {
        unsigned long host_sdr1;
        int tlbie_lock;
        unsigned long lpcr;
-       unsigned long rmor;
-       struct kvm_rma_info *rma;
        unsigned long vrma_slb_v;
-       int rma_setup_done;
+       int hpte_setup_done;
        u32 hpt_order;
        atomic_t vcpus_running;
        u32 online_vcores;
@@ -239,6 +237,8 @@ struct kvm_arch {
        atomic_t hpte_mod_interest;
        cpumask_t need_tlb_flush;
        int hpt_cma_alloc;
+       struct dentry *debugfs_dir;
+       struct dentry *htab_dentry;
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
        struct mutex hpt_mutex;
@@ -263,18 +263,15 @@ struct kvm_arch {
 
 /*
  * Struct for a virtual core.
- * Note: entry_exit_count combines an entry count in the bottom 8 bits
- * and an exit count in the next 8 bits.  This is so that we can
- * atomically increment the entry count iff the exit count is 0
- * without taking the lock.
+ * Note: entry_exit_map combines a bitmap of threads that have entered
+ * in the bottom 8 bits and a bitmap of threads that have exited in the
+ * next 8 bits.  This is so that we can atomically set the entry bit
+ * iff the exit map is 0 without taking a lock.
  */
 struct kvmppc_vcore {
        int n_runnable;
-       int n_busy;
        int num_threads;
-       int entry_exit_count;
-       int n_woken;
-       int nap_count;
+       int entry_exit_map;
        int napping_threads;
        int first_vcpuid;
        u16 pcpu;
@@ -299,13 +296,14 @@ struct kvmppc_vcore {
        ulong conferring_threads;
 };
 
-#define VCORE_ENTRY_COUNT(vc)  ((vc)->entry_exit_count & 0xff)
-#define VCORE_EXIT_COUNT(vc)   ((vc)->entry_exit_count >> 8)
+#define VCORE_ENTRY_MAP(vc)    ((vc)->entry_exit_map & 0xff)
+#define VCORE_EXIT_MAP(vc)     ((vc)->entry_exit_map >> 8)
+#define VCORE_IS_EXITING(vc)   (VCORE_EXIT_MAP(vc) != 0)
 
 /* Values for vcore_state */
 #define VCORE_INACTIVE 0
 #define VCORE_SLEEPING 1
-#define VCORE_STARTING 2
+#define VCORE_PREEMPT  2
 #define VCORE_RUNNING  3
 #define VCORE_EXITING  4
 
@@ -368,6 +366,14 @@ struct kvmppc_slb {
        u8 base_page_size;      /* MMU_PAGE_xxx */
 };
 
+/* Struct used to accumulate timing information in HV real mode code */
+struct kvmhv_tb_accumulator {
+       u64     seqcount;       /* used to synchronize access, also count * 2 */
+       u64     tb_total;       /* total time in timebase ticks */
+       u64     tb_min;         /* min time */
+       u64     tb_max;         /* max time */
+};
+
 # ifdef CONFIG_PPC_FSL_BOOK3E
 #define KVMPPC_BOOKE_IAC_NUM   2
 #define KVMPPC_BOOKE_DAC_NUM   2
@@ -656,6 +662,19 @@ struct kvm_vcpu_arch {
 
        u32 emul_inst;
 #endif
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       struct kvmhv_tb_accumulator *cur_activity;      /* What we're timing */
+       u64     cur_tb_start;                   /* when it started */
+       struct kvmhv_tb_accumulator rm_entry;   /* real-mode entry code */
+       struct kvmhv_tb_accumulator rm_intr;    /* real-mode intr handling */
+       struct kvmhv_tb_accumulator rm_exit;    /* real-mode exit code */
+       struct kvmhv_tb_accumulator guest_time; /* guest execution */
+       struct kvmhv_tb_accumulator cede_time;  /* time napping inside guest */
+
+       struct dentry *debugfs_dir;
+       struct dentry *debugfs_timings;
+#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
 };
 
 #define VCPU_FPR(vcpu, i)      (vcpu)->arch.fp.fpr[i][TS_FPROFFSET]
arch/powerpc/include/asm/kvm_ppc.h
index 46bf652..b8475da 100644
@@ -302,6 +302,8 @@ static inline bool is_kvmppc_hv_enabled(struct kvm *kvm)
        return kvm->arch.kvm_ops == kvmppc_hv_ops;
 }
 
+extern int kvmppc_hwrng_present(void);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
arch/powerpc/include/asm/time.h
index 03cbada..10fc784 100644
@@ -211,5 +211,8 @@ extern void secondary_cpu_time_init(void);
 
 DECLARE_PER_CPU(u64, decrementers_next_tb);
 
+/* Convert timebase ticks to nanoseconds */
+unsigned long long tb_to_ns(unsigned long long tb_ticks);
+
 #endif /* __KERNEL__ */
 #endif /* __POWERPC_TIME_H */
arch/powerpc/kernel/asm-offsets.c
index 4717859..0034b6b 100644
@@ -37,6 +37,7 @@
 #include <asm/thread_info.h>
 #include <asm/rtas.h>
 #include <asm/vdso_datapage.h>
+#include <asm/dbell.h>
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
 #include <asm/lppaca.h>
@@ -459,6 +460,19 @@ int main(void)
        DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
        DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
 #endif
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       DEFINE(VCPU_TB_RMENTRY, offsetof(struct kvm_vcpu, arch.rm_entry));
+       DEFINE(VCPU_TB_RMINTR, offsetof(struct kvm_vcpu, arch.rm_intr));
+       DEFINE(VCPU_TB_RMEXIT, offsetof(struct kvm_vcpu, arch.rm_exit));
+       DEFINE(VCPU_TB_GUEST, offsetof(struct kvm_vcpu, arch.guest_time));
+       DEFINE(VCPU_TB_CEDE, offsetof(struct kvm_vcpu, arch.cede_time));
+       DEFINE(VCPU_CUR_ACTIVITY, offsetof(struct kvm_vcpu, arch.cur_activity));
+       DEFINE(VCPU_ACTIVITY_START, offsetof(struct kvm_vcpu, arch.cur_tb_start));
+       DEFINE(TAS_SEQCOUNT, offsetof(struct kvmhv_tb_accumulator, seqcount));
+       DEFINE(TAS_TOTAL, offsetof(struct kvmhv_tb_accumulator, tb_total));
+       DEFINE(TAS_MIN, offsetof(struct kvmhv_tb_accumulator, tb_min));
+       DEFINE(TAS_MAX, offsetof(struct kvmhv_tb_accumulator, tb_max));
+#endif
        DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
        DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
        DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
@@ -492,7 +506,6 @@ int main(void)
        DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
        DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls));
        DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
-       DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
        DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
        DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
        DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
@@ -550,8 +563,7 @@ int main(void)
        DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop));
        DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort));
        DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
-       DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
-       DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
+       DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_map));
        DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
        DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
        DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm));
@@ -748,5 +760,7 @@ int main(void)
                        offsetof(struct paca_struct, subcore_sibling_mask));
 #endif
 
+       DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
+
        return 0;
 }
arch/powerpc/kernel/time.c
index 2d7b33f..56f4484 100644
@@ -608,6 +608,12 @@ void arch_suspend_enable_irqs(void)
 }
 #endif
 
+unsigned long long tb_to_ns(unsigned long long ticks)
+{
+       return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift;
+}
+EXPORT_SYMBOL_GPL(tb_to_ns);
+
 /*
  * Scheduler clock - returns current time in nanosec units.
  *
arch/powerpc/kvm/Kconfig
index 11850f3..2963e4d 100644
@@ -110,6 +110,20 @@ config KVM_BOOK3S_64_PR
          processor, including emulating 32-bit processors on a 64-bit
          host.
 
+config KVM_BOOK3S_HV_EXIT_TIMING
+       bool "Detailed timing for hypervisor real-mode code"
+       depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS
+       ---help---
+         Calculate time taken for each vcpu in the real-mode guest entry,
+         exit, and interrupt handling code, plus time spent in the guest
+         and in nap mode due to idle (cede) while other threads are still
+         in the guest.  The total, minimum and maximum times in nanoseconds
+         together with the number of executions are reported in debugfs in
+         kvm/vm#/vcpu#/timings.  The overhead is of the order of 30 - 40
+         ns per exit on POWER8.
+
+         If unsure, say N.
+
 config KVM_BOOKE_HV
        bool
 
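
For illustration only (not part of this merge): each line of the kvm/vm#/vcpu#/timings file described in the help text above follows the "%s: %llu %llu %llu %llu" format emitted by debugfs_timings_read() further down in this series, that is, the timer name, the number of occurrences, then the total, minimum and maximum times in nanoseconds. A hypothetical userspace helper that parses one such line could look like:

    #include <stdio.h>

    /*
     * Hypothetical parser for one line of kvm/vm#/vcpu#/timings, e.g.
     * "rm_entry: <count> <total_ns> <min_ns> <max_ns>".  The field order
     * mirrors the format string used by debugfs_timings_read().
     * 'name' must point to a buffer of at least 16 bytes.
     */
    static int parse_timing_line(const char *line, char *name,
                                 unsigned long long *count,
                                 unsigned long long *total_ns,
                                 unsigned long long *min_ns,
                                 unsigned long long *max_ns)
    {
            return sscanf(line, "%15[^:]: %llu %llu %llu %llu",
                          name, count, total_ns, min_ns, max_ns) == 5 ? 0 : -1;
    }
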
arch/powerpc/kvm/book3s.c
index cfbcdc6..453a8a4 100644
@@ -821,6 +821,82 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 #endif
 }
 
+int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+       unsigned long size = kvmppc_get_gpr(vcpu, 4);
+       unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+       u64 buf;
+       int ret;
+
+       if (!is_power_of_2(size) || (size > sizeof(buf)))
+               return H_TOO_HARD;
+
+       ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+       if (ret != 0)
+               return H_TOO_HARD;
+
+       switch (size) {
+       case 1:
+               kvmppc_set_gpr(vcpu, 4, *(u8 *)&buf);
+               break;
+
+       case 2:
+               kvmppc_set_gpr(vcpu, 4, be16_to_cpu(*(__be16 *)&buf));
+               break;
+
+       case 4:
+               kvmppc_set_gpr(vcpu, 4, be32_to_cpu(*(__be32 *)&buf));
+               break;
+
+       case 8:
+               kvmppc_set_gpr(vcpu, 4, be64_to_cpu(*(__be64 *)&buf));
+               break;
+
+       default:
+               BUG();
+       }
+
+       return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_load);
+
+int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+       unsigned long size = kvmppc_get_gpr(vcpu, 4);
+       unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+       unsigned long val = kvmppc_get_gpr(vcpu, 6);
+       u64 buf;
+       int ret;
+
+       switch (size) {
+       case 1:
+               *(u8 *)&buf = val;
+               break;
+
+       case 2:
+               *(__be16 *)&buf = cpu_to_be16(val);
+               break;
+
+       case 4:
+               *(__be32 *)&buf = cpu_to_be32(val);
+               break;
+
+       case 8:
+               *(__be64 *)&buf = cpu_to_be64(val);
+               break;
+
+       default:
+               return H_TOO_HARD;
+       }
+
+       ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+       if (ret != 0)
+               return H_TOO_HARD;
+
+       return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store);
+
 int kvmppc_core_check_processor_compat(void)
 {
        /*
arch/powerpc/kvm/book3s_64_mmu_hv.c
index 534acb3..d6fe308 100644
@@ -27,6 +27,7 @@
 #include <linux/srcu.h>
 #include <linux/anon_inodes.h>
 #include <linux/file.h>
+#include <linux/debugfs.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -116,12 +117,12 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
        long order;
 
        mutex_lock(&kvm->lock);
-       if (kvm->arch.rma_setup_done) {
-               kvm->arch.rma_setup_done = 0;
-               /* order rma_setup_done vs. vcpus_running */
+       if (kvm->arch.hpte_setup_done) {
+               kvm->arch.hpte_setup_done = 0;
+               /* order hpte_setup_done vs. vcpus_running */
                smp_mb();
                if (atomic_read(&kvm->arch.vcpus_running)) {
-                       kvm->arch.rma_setup_done = 1;
+                       kvm->arch.hpte_setup_done = 1;
                        goto out;
                }
        }
@@ -338,9 +339,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
        v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
        gr = kvm->arch.revmap[index].guest_rpte;
 
-       /* Unlock the HPTE */
-       asm volatile("lwsync" : : : "memory");
-       hptep[0] = cpu_to_be64(v);
+       unlock_hpte(hptep, v);
        preempt_enable();
 
        gpte->eaddr = eaddr;
@@ -469,8 +468,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
        hpte[1] = be64_to_cpu(hptep[1]);
        hpte[2] = r = rev->guest_rpte;
-       asm volatile("lwsync" : : : "memory");
-       hptep[0] = cpu_to_be64(hpte[0]);
+       unlock_hpte(hptep, hpte[0]);
        preempt_enable();
 
        if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
@@ -621,7 +619,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
        hptep[1] = cpu_to_be64(r);
        eieio();
-       hptep[0] = cpu_to_be64(hpte[0]);
+       __unlock_hpte(hptep, hpte[0]);
        asm volatile("ptesync" : : : "memory");
        preempt_enable();
        if (page && hpte_is_writable(r))
@@ -642,7 +640,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        return ret;
 
  out_unlock:
-       hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+       __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
        preempt_enable();
        goto out_put;
 }
@@ -771,7 +769,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
                        }
                }
                unlock_rmap(rmapp);
-               hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+               __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
        }
        return 0;
 }
@@ -857,7 +855,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
                        }
                        ret = 1;
                }
-               hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+               __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
        } while ((i = j) != head);
 
        unlock_rmap(rmapp);
@@ -974,8 +972,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 
                /* Now check and modify the HPTE */
                if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
-                       /* unlock and continue */
-                       hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+                       __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
                        continue;
                }
 
@@ -996,9 +993,9 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
                                npages_dirty = n;
                        eieio();
                }
-               v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
+               v &= ~HPTE_V_ABSENT;
                v |= HPTE_V_VALID;
-               hptep[0] = cpu_to_be64(v);
+               __unlock_hpte(hptep, v);
        } while ((i = j) != head);
 
        unlock_rmap(rmapp);
@@ -1218,8 +1215,7 @@ static long record_hpte(unsigned long flags, __be64 *hptp,
                        r &= ~HPTE_GR_MODIFIED;
                        revp->guest_rpte = r;
                }
-               asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
-               hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+               unlock_hpte(hptp, be64_to_cpu(hptp[0]));
                preempt_enable();
                if (!(valid == want_valid && (first_pass || dirty)))
                        ok = 0;
@@ -1339,20 +1335,20 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
        unsigned long tmp[2];
        ssize_t nb;
        long int err, ret;
-       int rma_setup;
+       int hpte_setup;
 
        if (!access_ok(VERIFY_READ, buf, count))
                return -EFAULT;
 
        /* lock out vcpus from running while we're doing this */
        mutex_lock(&kvm->lock);
-       rma_setup = kvm->arch.rma_setup_done;
-       if (rma_setup) {
-               kvm->arch.rma_setup_done = 0;   /* temporarily */
-               /* order rma_setup_done vs. vcpus_running */
+       hpte_setup = kvm->arch.hpte_setup_done;
+       if (hpte_setup) {
+               kvm->arch.hpte_setup_done = 0;  /* temporarily */
+               /* order hpte_setup_done vs. vcpus_running */
                smp_mb();
                if (atomic_read(&kvm->arch.vcpus_running)) {
-                       kvm->arch.rma_setup_done = 1;
+                       kvm->arch.hpte_setup_done = 1;
                        mutex_unlock(&kvm->lock);
                        return -EBUSY;
                }
@@ -1405,7 +1401,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
                                       "r=%lx\n", ret, i, v, r);
                                goto out;
                        }
-                       if (!rma_setup && is_vrma_hpte(v)) {
+                       if (!hpte_setup && is_vrma_hpte(v)) {
                                unsigned long psize = hpte_base_page_size(v, r);
                                unsigned long senc = slb_pgsize_encoding(psize);
                                unsigned long lpcr;
@@ -1414,7 +1410,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
                                        (VRMA_VSID << SLB_VSID_SHIFT_1T);
                                lpcr = senc << (LPCR_VRMASD_SH - 4);
                                kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
-                               rma_setup = 1;
+                               hpte_setup = 1;
                        }
                        ++i;
                        hptp += 2;
@@ -1430,9 +1426,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
        }
 
  out:
-       /* Order HPTE updates vs. rma_setup_done */
+       /* Order HPTE updates vs. hpte_setup_done */
        smp_wmb();
-       kvm->arch.rma_setup_done = rma_setup;
+       kvm->arch.hpte_setup_done = hpte_setup;
        mutex_unlock(&kvm->lock);
 
        if (err)
@@ -1495,6 +1491,141 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
        return ret;
 }
 
+struct debugfs_htab_state {
+       struct kvm      *kvm;
+       struct mutex    mutex;
+       unsigned long   hpt_index;
+       int             chars_left;
+       int             buf_index;
+       char            buf[64];
+};
+
+static int debugfs_htab_open(struct inode *inode, struct file *file)
+{
+       struct kvm *kvm = inode->i_private;
+       struct debugfs_htab_state *p;
+
+       p = kzalloc(sizeof(*p), GFP_KERNEL);
+       if (!p)
+               return -ENOMEM;
+
+       kvm_get_kvm(kvm);
+       p->kvm = kvm;
+       mutex_init(&p->mutex);
+       file->private_data = p;
+
+       return nonseekable_open(inode, file);
+}
+
+static int debugfs_htab_release(struct inode *inode, struct file *file)
+{
+       struct debugfs_htab_state *p = file->private_data;
+
+       kvm_put_kvm(p->kvm);
+       kfree(p);
+       return 0;
+}
+
+static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
+                                size_t len, loff_t *ppos)
+{
+       struct debugfs_htab_state *p = file->private_data;
+       ssize_t ret, r;
+       unsigned long i, n;
+       unsigned long v, hr, gr;
+       struct kvm *kvm;
+       __be64 *hptp;
+
+       ret = mutex_lock_interruptible(&p->mutex);
+       if (ret)
+               return ret;
+
+       if (p->chars_left) {
+               n = p->chars_left;
+               if (n > len)
+                       n = len;
+               r = copy_to_user(buf, p->buf + p->buf_index, n);
+               n -= r;
+               p->chars_left -= n;
+               p->buf_index += n;
+               buf += n;
+               len -= n;
+               ret = n;
+               if (r) {
+                       if (!n)
+                               ret = -EFAULT;
+                       goto out;
+               }
+       }
+
+       kvm = p->kvm;
+       i = p->hpt_index;
+       hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+       for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
+               if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
+                       continue;
+
+               /* lock the HPTE so it's stable and read it */
+               preempt_disable();
+               while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+                       cpu_relax();
+               v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
+               hr = be64_to_cpu(hptp[1]);
+               gr = kvm->arch.revmap[i].guest_rpte;
+               unlock_hpte(hptp, v);
+               preempt_enable();
+
+               if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+                       continue;
+
+               n = scnprintf(p->buf, sizeof(p->buf),
+                             "%6lx %.16lx %.16lx %.16lx\n",
+                             i, v, hr, gr);
+               p->chars_left = n;
+               if (n > len)
+                       n = len;
+               r = copy_to_user(buf, p->buf, n);
+               n -= r;
+               p->chars_left -= n;
+               p->buf_index = n;
+               buf += n;
+               len -= n;
+               ret += n;
+               if (r) {
+                       if (!ret)
+                               ret = -EFAULT;
+                       goto out;
+               }
+       }
+       p->hpt_index = i;
+
+ out:
+       mutex_unlock(&p->mutex);
+       return ret;
+}
+
+ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
+                          size_t len, loff_t *ppos)
+{
+       return -EACCES;
+}
+
+static const struct file_operations debugfs_htab_fops = {
+       .owner   = THIS_MODULE,
+       .open    = debugfs_htab_open,
+       .release = debugfs_htab_release,
+       .read    = debugfs_htab_read,
+       .write   = debugfs_htab_write,
+       .llseek  = generic_file_llseek,
+};
+
+void kvmppc_mmu_debugfs_init(struct kvm *kvm)
+{
+       kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
+                                                   kvm->arch.debugfs_dir, kvm,
+                                                   &debugfs_htab_fops);
+}
+
 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
arch/powerpc/kvm/book3s_hv.c
index de74756..48d3c5d 100644
@@ -32,6 +32,7 @@
 #include <linux/page-flags.h>
 #include <linux/srcu.h>
 #include <linux/miscdevice.h>
+#include <linux/debugfs.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -50,6 +51,7 @@
 #include <asm/hvcall.h>
 #include <asm/switch_to.h>
 #include <asm/smp.h>
+#include <asm/dbell.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -83,9 +85,35 @@ static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
+static bool kvmppc_ipi_thread(int cpu)
+{
+       /* On POWER8 for IPIs to threads in the same core, use msgsnd */
+       if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+               preempt_disable();
+               if (cpu_first_thread_sibling(cpu) ==
+                   cpu_first_thread_sibling(smp_processor_id())) {
+                       unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+                       msg |= cpu_thread_in_core(cpu);
+                       smp_mb();
+                       __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+                       preempt_enable();
+                       return true;
+               }
+               preempt_enable();
+       }
+
+#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
+       if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) {
+               xics_wake_cpu(cpu);
+               return true;
+       }
+#endif
+
+       return false;
+}
+
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
-       int me;
        int cpu = vcpu->cpu;
        wait_queue_head_t *wqp;
 
@@ -95,20 +123,12 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
                ++vcpu->stat.halt_wakeup;
        }
 
-       me = get_cpu();
+       if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid))
+               return;
 
        /* CPU points to the first thread of the core */
-       if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
-#ifdef CONFIG_PPC_ICP_NATIVE
-               int real_cpu = cpu + vcpu->arch.ptid;
-               if (paca[real_cpu].kvm_hstate.xics_phys)
-                       xics_wake_cpu(real_cpu);
-               else
-#endif
-               if (cpu_online(cpu))
-                       smp_send_reschedule(cpu);
-       }
-       put_cpu();
+       if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
+               smp_send_reschedule(cpu);
 }
 
 /*
@@ -706,6 +726,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
                /* Send the error out to userspace via KVM_RUN */
                return rc;
+       case H_LOGICAL_CI_LOAD:
+               ret = kvmppc_h_logical_ci_load(vcpu);
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_LOGICAL_CI_STORE:
+               ret = kvmppc_h_logical_ci_store(vcpu);
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
        case H_SET_MODE:
                ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
                                        kvmppc_get_gpr(vcpu, 5),
@@ -740,6 +770,8 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
        case H_CONFER:
        case H_REGISTER_VPA:
        case H_SET_MODE:
+       case H_LOGICAL_CI_LOAD:
+       case H_LOGICAL_CI_STORE:
 #ifdef CONFIG_KVM_XICS
        case H_XIRR:
        case H_CPPR:
@@ -1410,6 +1442,154 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
        return vcore;
 }
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+static struct debugfs_timings_element {
+       const char *name;
+       size_t offset;
+} timings[] = {
+       {"rm_entry",    offsetof(struct kvm_vcpu, arch.rm_entry)},
+       {"rm_intr",     offsetof(struct kvm_vcpu, arch.rm_intr)},
+       {"rm_exit",     offsetof(struct kvm_vcpu, arch.rm_exit)},
+       {"guest",       offsetof(struct kvm_vcpu, arch.guest_time)},
+       {"cede",        offsetof(struct kvm_vcpu, arch.cede_time)},
+};
+
+#define N_TIMINGS      (sizeof(timings) / sizeof(timings[0]))
+
+struct debugfs_timings_state {
+       struct kvm_vcpu *vcpu;
+       unsigned int    buflen;
+       char            buf[N_TIMINGS * 100];
+};
+
+static int debugfs_timings_open(struct inode *inode, struct file *file)
+{
+       struct kvm_vcpu *vcpu = inode->i_private;
+       struct debugfs_timings_state *p;
+
+       p = kzalloc(sizeof(*p), GFP_KERNEL);
+       if (!p)
+               return -ENOMEM;
+
+       kvm_get_kvm(vcpu->kvm);
+       p->vcpu = vcpu;
+       file->private_data = p;
+
+       return nonseekable_open(inode, file);
+}
+
+static int debugfs_timings_release(struct inode *inode, struct file *file)
+{
+       struct debugfs_timings_state *p = file->private_data;
+
+       kvm_put_kvm(p->vcpu->kvm);
+       kfree(p);
+       return 0;
+}
+
+static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
+                                   size_t len, loff_t *ppos)
+{
+       struct debugfs_timings_state *p = file->private_data;
+       struct kvm_vcpu *vcpu = p->vcpu;
+       char *s, *buf_end;
+       struct kvmhv_tb_accumulator tb;
+       u64 count;
+       loff_t pos;
+       ssize_t n;
+       int i, loops;
+       bool ok;
+
+       if (!p->buflen) {
+               s = p->buf;
+               buf_end = s + sizeof(p->buf);
+               for (i = 0; i < N_TIMINGS; ++i) {
+                       struct kvmhv_tb_accumulator *acc;
+
+                       acc = (struct kvmhv_tb_accumulator *)
+                               ((unsigned long)vcpu + timings[i].offset);
+                       ok = false;
+                       for (loops = 0; loops < 1000; ++loops) {
+                               count = acc->seqcount;
+                               if (!(count & 1)) {
+                                       smp_rmb();
+                                       tb = *acc;
+                                       smp_rmb();
+                                       if (count == acc->seqcount) {
+                                               ok = true;
+                                               break;
+                                       }
+                               }
+                               udelay(1);
+                       }
+                       if (!ok)
+                               snprintf(s, buf_end - s, "%s: stuck\n",
+                                       timings[i].name);
+                       else
+                               snprintf(s, buf_end - s,
+                                       "%s: %llu %llu %llu %llu\n",
+                                       timings[i].name, count / 2,
+                                       tb_to_ns(tb.tb_total),
+                                       tb_to_ns(tb.tb_min),
+                                       tb_to_ns(tb.tb_max));
+                       s += strlen(s);
+               }
+               p->buflen = s - p->buf;
+       }
+
+       pos = *ppos;
+       if (pos >= p->buflen)
+               return 0;
+       if (len > p->buflen - pos)
+               len = p->buflen - pos;
+       n = copy_to_user(buf, p->buf + pos, len);
+       if (n) {
+               if (n == len)
+                       return -EFAULT;
+               len -= n;
+       }
+       *ppos = pos + len;
+       return len;
+}
+
+static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
+                                    size_t len, loff_t *ppos)
+{
+       return -EACCES;
+}
+
+static const struct file_operations debugfs_timings_ops = {
+       .owner   = THIS_MODULE,
+       .open    = debugfs_timings_open,
+       .release = debugfs_timings_release,
+       .read    = debugfs_timings_read,
+       .write   = debugfs_timings_write,
+       .llseek  = generic_file_llseek,
+};
+
+/* Create a debugfs directory for the vcpu */
+static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
+{
+       char buf[16];
+       struct kvm *kvm = vcpu->kvm;
+
+       snprintf(buf, sizeof(buf), "vcpu%u", id);
+       if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
+               return;
+       vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
+       if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir))
+               return;
+       vcpu->arch.debugfs_timings =
+               debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir,
+                                   vcpu, &debugfs_timings_ops);
+}
+
+#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
+{
+}
+#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+
 static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
                                                   unsigned int id)
 {
@@ -1479,6 +1659,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
        vcpu->arch.cpu_type = KVM_CPU_3S_64;
        kvmppc_sanity_check(vcpu);
 
+       debugfs_vcpu_init(vcpu, id);
+
        return vcpu;
 
 free_vcpu:
@@ -1566,8 +1748,10 @@ static int kvmppc_grab_hwthread(int cpu)
        tpaca = &paca[cpu];
 
        /* Ensure the thread won't go into the kernel if it wakes */
-       tpaca->kvm_hstate.hwthread_req = 1;
        tpaca->kvm_hstate.kvm_vcpu = NULL;
+       tpaca->kvm_hstate.napping = 0;
+       smp_wmb();
+       tpaca->kvm_hstate.hwthread_req = 1;
 
        /*
         * If the thread is already executing in the kernel (e.g. handling
@@ -1610,35 +1794,41 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
        }
        cpu = vc->pcpu + vcpu->arch.ptid;
        tpaca = &paca[cpu];
-       tpaca->kvm_hstate.kvm_vcpu = vcpu;
        tpaca->kvm_hstate.kvm_vcore = vc;
        tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
        vcpu->cpu = vc->pcpu;
+       /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */
        smp_wmb();
-#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
-       if (cpu != smp_processor_id()) {
-               xics_wake_cpu(cpu);
-               if (vcpu->arch.ptid)
-                       ++vc->n_woken;
-       }
-#endif
+       tpaca->kvm_hstate.kvm_vcpu = vcpu;
+       if (cpu != smp_processor_id())
+               kvmppc_ipi_thread(cpu);
 }
 
-static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
+static void kvmppc_wait_for_nap(void)
 {
-       int i;
+       int cpu = smp_processor_id();
+       int i, loops;
 
-       HMT_low();
-       i = 0;
-       while (vc->nap_count < vc->n_woken) {
-               if (++i >= 1000000) {
-                       pr_err("kvmppc_wait_for_nap timeout %d %d\n",
-                              vc->nap_count, vc->n_woken);
-                       break;
+       for (loops = 0; loops < 1000000; ++loops) {
+               /*
+                * Check if all threads are finished.
+                * We set the vcpu pointer when starting a thread
+                * and the thread clears it when finished, so we look
+                * for any threads that still have a non-NULL vcpu ptr.
+                */
+               for (i = 1; i < threads_per_subcore; ++i)
+                       if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+                               break;
+               if (i == threads_per_subcore) {
+                       HMT_medium();
+                       return;
                }
-               cpu_relax();
+               HMT_low();
        }
        HMT_medium();
+       for (i = 1; i < threads_per_subcore; ++i)
+               if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+                       pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
 }
 
 /*
@@ -1700,63 +1890,103 @@ static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc)
        mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE);
 }
 
+static void prepare_threads(struct kvmppc_vcore *vc)
+{
+       struct kvm_vcpu *vcpu, *vnext;
+
+       list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+                                arch.run_list) {
+               if (signal_pending(vcpu->arch.run_task))
+                       vcpu->arch.ret = -EINTR;
+               else if (vcpu->arch.vpa.update_pending ||
+                        vcpu->arch.slb_shadow.update_pending ||
+                        vcpu->arch.dtl.update_pending)
+                       vcpu->arch.ret = RESUME_GUEST;
+               else
+                       continue;
+               kvmppc_remove_runnable(vc, vcpu);
+               wake_up(&vcpu->arch.cpu_run);
+       }
+}
+
+static void post_guest_process(struct kvmppc_vcore *vc)
+{
+       u64 now;
+       long ret;
+       struct kvm_vcpu *vcpu, *vnext;
+
+       now = get_tb();
+       list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+                                arch.run_list) {
+               /* cancel pending dec exception if dec is positive */
+               if (now < vcpu->arch.dec_expires &&
+                   kvmppc_core_pending_dec(vcpu))
+                       kvmppc_core_dequeue_dec(vcpu);
+
+               trace_kvm_guest_exit(vcpu);
+
+               ret = RESUME_GUEST;
+               if (vcpu->arch.trap)
+                       ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
+                                                   vcpu->arch.run_task);
+
+               vcpu->arch.ret = ret;
+               vcpu->arch.trap = 0;
+
+               if (vcpu->arch.ceded) {
+                       if (!is_kvmppc_resume_guest(ret))
+                               kvmppc_end_cede(vcpu);
+                       else
+                               kvmppc_set_timer(vcpu);
+               }
+               if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
+                       kvmppc_remove_runnable(vc, vcpu);
+                       wake_up(&vcpu->arch.cpu_run);
+               }
+       }
+}
+
 /*
  * Run a set of guest threads on a physical core.
  * Called with vc->lock held.
  */
-static void kvmppc_run_core(struct kvmppc_vcore *vc)
+static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-       struct kvm_vcpu *vcpu, *vnext;
-       long ret;
-       u64 now;
-       int i, need_vpa_update;
+       struct kvm_vcpu *vcpu;
+       int i;
        int srcu_idx;
-       struct kvm_vcpu *vcpus_to_update[threads_per_core];
 
-       /* don't start if any threads have a signal pending */
-       need_vpa_update = 0;
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-               if (signal_pending(vcpu->arch.run_task))
-                       return;
-               if (vcpu->arch.vpa.update_pending ||
-                   vcpu->arch.slb_shadow.update_pending ||
-                   vcpu->arch.dtl.update_pending)
-                       vcpus_to_update[need_vpa_update++] = vcpu;
-       }
+       /*
+        * Remove from the list any threads that have a signal pending
+        * or need a VPA update done
+        */
+       prepare_threads(vc);
+
+       /* if the runner is no longer runnable, let the caller pick a new one */
+       if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
+               return;
 
        /*
-        * Initialize *vc, in particular vc->vcore_state, so we can
-        * drop the vcore lock if necessary.
+        * Initialize *vc.
         */
-       vc->n_woken = 0;
-       vc->nap_count = 0;
-       vc->entry_exit_count = 0;
+       vc->entry_exit_map = 0;
        vc->preempt_tb = TB_NIL;
-       vc->vcore_state = VCORE_STARTING;
        vc->in_guest = 0;
        vc->napping_threads = 0;
        vc->conferring_threads = 0;
 
        /*
-        * Updating any of the vpas requires calling kvmppc_pin_guest_page,
-        * which can't be called with any spinlocks held.
-        */
-       if (need_vpa_update) {
-               spin_unlock(&vc->lock);
-               for (i = 0; i < need_vpa_update; ++i)
-                       kvmppc_update_vpas(vcpus_to_update[i]);
-               spin_lock(&vc->lock);
-       }
-
-       /*
         * Make sure we are running on primary threads, and that secondary
         * threads are offline.  Also check if the number of threads in this
         * guest are greater than the current system threads per guest.
         */
        if ((threads_per_core > 1) &&
            ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
-               list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+               list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
                        vcpu->arch.ret = -EBUSY;
+                       kvmppc_remove_runnable(vc, vcpu);
+                       wake_up(&vcpu->arch.cpu_run);
+               }
                goto out;
        }
 
@@ -1797,8 +2027,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
                vcpu->cpu = -1;
        /* wait for secondary threads to finish writing their state to memory */
-       if (vc->nap_count < vc->n_woken)
-               kvmppc_wait_for_nap(vc);
+       kvmppc_wait_for_nap();
        for (i = 0; i < threads_per_subcore; ++i)
                kvmppc_release_hwthread(vc->pcpu + i);
        /* prevent other vcpu threads from doing kvmppc_start_thread() now */
@@ -1812,44 +2041,12 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
        kvm_guest_exit();
 
        preempt_enable();
-       cond_resched();
 
        spin_lock(&vc->lock);
-       now = get_tb();
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-               /* cancel pending dec exception if dec is positive */
-               if (now < vcpu->arch.dec_expires &&
-                   kvmppc_core_pending_dec(vcpu))
-                       kvmppc_core_dequeue_dec(vcpu);
-
-               trace_kvm_guest_exit(vcpu);
-
-               ret = RESUME_GUEST;
-               if (vcpu->arch.trap)
-                       ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
-                                                   vcpu->arch.run_task);
-
-               vcpu->arch.ret = ret;
-               vcpu->arch.trap = 0;
-
-               if (vcpu->arch.ceded) {
-                       if (!is_kvmppc_resume_guest(ret))
-                               kvmppc_end_cede(vcpu);
-                       else
-                               kvmppc_set_timer(vcpu);
-               }
-       }
+       post_guest_process(vc);
 
  out:
        vc->vcore_state = VCORE_INACTIVE;
-       list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-                                arch.run_list) {
-               if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
-                       kvmppc_remove_runnable(vc, vcpu);
-                       wake_up(&vcpu->arch.cpu_run);
-               }
-       }
-
        trace_kvmppc_run_core(vc, 1);
 }
 
@@ -1939,8 +2136,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
         * this thread straight away and have it join in.
         */
        if (!signal_pending(current)) {
-               if (vc->vcore_state == VCORE_RUNNING &&
-                   VCORE_EXIT_COUNT(vc) == 0) {
+               if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) {
                        kvmppc_create_dtl_entry(vcpu, vc);
                        kvmppc_start_thread(vcpu);
                        trace_kvm_guest_enter(vcpu);
@@ -1971,7 +2167,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                }
                if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
                        break;
-               vc->runner = vcpu;
                n_ceded = 0;
                list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
                        if (!v->arch.pending_exceptions)
@@ -1979,10 +2174,17 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                        else
                                v->arch.ceded = 0;
                }
-               if (n_ceded == vc->n_runnable)
+               vc->runner = vcpu;
+               if (n_ceded == vc->n_runnable) {
                        kvmppc_vcore_blocked(vc);
-               else
+               } else if (should_resched()) {
+                       vc->vcore_state = VCORE_PREEMPT;
+                       /* Let something else run */
+                       cond_resched_lock(&vc->lock);
+                       vc->vcore_state = VCORE_INACTIVE;
+               } else {
                        kvmppc_run_core(vc);
+               }
                vc->runner = NULL;
        }
 
@@ -2032,11 +2234,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
        }
 
        atomic_inc(&vcpu->kvm->arch.vcpus_running);
-       /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */
+       /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
        smp_mb();
 
        /* On the first time here, set up HTAB and VRMA */
-       if (!vcpu->kvm->arch.rma_setup_done) {
+       if (!vcpu->kvm->arch.hpte_setup_done) {
                r = kvmppc_hv_setup_htab_rma(vcpu);
                if (r)
                        goto out;
@@ -2238,7 +2440,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
        int srcu_idx;
 
        mutex_lock(&kvm->lock);
-       if (kvm->arch.rma_setup_done)
+       if (kvm->arch.hpte_setup_done)
                goto out;       /* another vcpu beat us to it */
 
        /* Allocate hashed page table (if not done already) and reset it */
@@ -2289,9 +2491,9 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 
        kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
 
-       /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
+       /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
        smp_wmb();
-       kvm->arch.rma_setup_done = 1;
+       kvm->arch.hpte_setup_done = 1;
        err = 0;
  out_srcu:
        srcu_read_unlock(&kvm->srcu, srcu_idx);
@@ -2307,6 +2509,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 {
        unsigned long lpcr, lpid;
+       char buf[32];
 
        /* Allocate the guest's logical partition ID */
 
@@ -2347,6 +2550,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
         */
        kvm_hv_vm_activated();
 
+       /*
+        * Create a debugfs directory for the VM
+        */
+       snprintf(buf, sizeof(buf), "vm%d", current->pid);
+       kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
+       if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
+               kvmppc_mmu_debugfs_init(kvm);
+
        return 0;
 }
 
@@ -2367,6 +2578,8 @@ static void kvmppc_free_vcores(struct kvm *kvm)
 
 static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 {
+       debugfs_remove_recursive(kvm->arch.debugfs_dir);
+
        kvm_hv_vm_deactivated();
 
        kvmppc_free_vcores(kvm);
arch/powerpc/kvm/book3s_hv_builtin.c
index 1f083ff..ed2589d 100644
 #include <asm/cputable.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
+#include <asm/archrandom.h>
+#include <asm/xics.h>
+#include <asm/dbell.h>
+#include <asm/cputhreads.h>
 
 #define KVM_CMA_CHUNK_ORDER    18
 
@@ -114,11 +118,11 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
        int rv = H_SUCCESS; /* => don't yield */
 
        set_bit(vcpu->arch.ptid, &vc->conferring_threads);
-       while ((get_tb() < stop) && (VCORE_EXIT_COUNT(vc) == 0)) {
-               threads_running = VCORE_ENTRY_COUNT(vc);
-               threads_ceded = hweight32(vc->napping_threads);
-               threads_conferring = hweight32(vc->conferring_threads);
-               if (threads_ceded + threads_conferring >= threads_running) {
+       while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
+               threads_running = VCORE_ENTRY_MAP(vc);
+               threads_ceded = vc->napping_threads;
+               threads_conferring = vc->conferring_threads;
+               if ((threads_ceded | threads_conferring) == threads_running) {
                        rv = H_TOO_HARD; /* => do yield */
                        break;
                }
@@ -169,3 +173,89 @@ int kvmppc_hcall_impl_hv_realmode(unsigned long cmd)
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);
+
+int kvmppc_hwrng_present(void)
+{
+       return powernv_hwrng_present();
+}
+EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
+
+long kvmppc_h_random(struct kvm_vcpu *vcpu)
+{
+       if (powernv_get_random_real_mode(&vcpu->arch.gpr[4]))
+               return H_SUCCESS;
+
+       return H_HARDWARE;
+}
+
+static inline void rm_writeb(unsigned long paddr, u8 val)
+{
+       __asm__ __volatile__("stbcix %0,0,%1"
+               : : "r" (val), "r" (paddr) : "memory");
+}
+
+/*
+ * Send an interrupt or message to another CPU.
+ * This can only be called in real mode.
+ * The caller needs to include any barrier needed to order writes
+ * to memory vs. the IPI/message.
+ */
+void kvmhv_rm_send_ipi(int cpu)
+{
+       unsigned long xics_phys;
+
+       /* On POWER8 for IPIs to threads in the same core, use msgsnd */
+       if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+           cpu_first_thread_sibling(cpu) ==
+           cpu_first_thread_sibling(raw_smp_processor_id())) {
+               unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+               msg |= cpu_thread_in_core(cpu);
+               __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+               return;
+       }
+
+       /* Else poke the target with an IPI */
+       xics_phys = paca[cpu].kvm_hstate.xics_phys;
+       rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+}
+
+/*
+ * The following functions are called from the assembly code
+ * in book3s_hv_rmhandlers.S.
+ */
+static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active)
+{
+       int cpu = vc->pcpu;
+
+       /* Order setting of exit map vs. msgsnd/IPI */
+       smp_mb();
+       for (; active; active >>= 1, ++cpu)
+               if (active & 1)
+                       kvmhv_rm_send_ipi(cpu);
+}
+
+void kvmhv_commence_exit(int trap)
+{
+       struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+       int ptid = local_paca->kvm_hstate.ptid;
+       int me, ee;
+
+       /* Set our bit in the threads-exiting-guest map in the 0xff00
+          bits of vcore->entry_exit_map */
+       me = 0x100 << ptid;
+       do {
+               ee = vc->entry_exit_map;
+       } while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee);
+
+       /* Are we the first here? */
+       if ((ee >> 8) != 0)
+               return;
+
+       /*
+        * Trigger the other threads in this vcore to exit the guest.
+        * If this is a hypervisor decrementer interrupt then they
+        * will be already on their way out of the guest.
+        */
+       if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
+               kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
+}
arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 625407e..f6bf0b1 100644
@@ -150,12 +150,6 @@ static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
        return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
 }
 
-static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
-{
-       asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
-       hpte[0] = cpu_to_be64(hpte_v);
-}
-
 long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
                       long pte_index, unsigned long pteh, unsigned long ptel,
                       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
@@ -271,10 +265,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
                                u64 pte;
                                while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                                        cpu_relax();
-                               pte = be64_to_cpu(*hpte);
+                               pte = be64_to_cpu(hpte[0]);
                                if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT)))
                                        break;
-                               *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
+                               __unlock_hpte(hpte, pte);
                                hpte += 2;
                        }
                        if (i == 8)
@@ -290,9 +284,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
                        while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                                cpu_relax();
-                       pte = be64_to_cpu(*hpte);
+                       pte = be64_to_cpu(hpte[0]);
                        if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
-                               *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
+                               __unlock_hpte(hpte, pte);
                                return H_PTEG_FULL;
                        }
                }
@@ -331,7 +325,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
        /* Write the first HPTE dword, unlocking the HPTE and making it valid */
        eieio();
-       hpte[0] = cpu_to_be64(pteh);
+       __unlock_hpte(hpte, pteh);
        asm volatile("ptesync" : : : "memory");
 
        *pte_idx_ret = pte_index;
@@ -412,7 +406,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
        if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
            ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
            ((flags & H_ANDCOND) && (pte & avpn) != 0)) {
-               hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+               __unlock_hpte(hpte, pte);
                return H_NOT_FOUND;
        }
 
@@ -548,7 +542,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
                                be64_to_cpu(hp[0]), be64_to_cpu(hp[1]));
                        rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
                        args[j] |= rcbits << (56 - 5);
-                       hp[0] = 0;
+                       __unlock_hpte(hp, 0);
                }
        }
 
@@ -574,7 +568,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
        pte = be64_to_cpu(hpte[0]);
        if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
            ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) {
-               hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+               __unlock_hpte(hpte, pte);
                return H_NOT_FOUND;
        }
 
@@ -755,8 +749,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
                                /* Return with the HPTE still locked */
                                return (hash << 3) + (i >> 1);
 
-                       /* Unlock and move on */
-                       hpte[i] = cpu_to_be64(v);
+                       __unlock_hpte(&hpte[i], v);
                }
 
                if (val & HPTE_V_SECONDARY)
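
The hunks above replace the open-coded HPTE unlock stores with calls to __unlock_hpte(), whose definition lives in kvm_book3s_64.h and is not part of this hunk. The removed inline above did the store behind a release barrier without touching the lock bit, while several call sites here pass a value that may still have HPTE_V_HVLOCK set (e.g. __unlock_hpte(hpte, pte)), which suggests the helper clears the lock bit itself. A minimal sketch under those assumptions (callers such as kvmppc_do_h_enter() issue eieio explicitly before unlocking, so barrier placement is left out here):

        /* Sketch only, not the header's actual definition. */
        static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
        {
                hpte_v &= ~HPTE_V_HVLOCK;       /* drop the software lock bit */
                hpte[0] = cpu_to_be64(hpte_v);  /* rewrite dword 0 of the HPTE */
        }
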
index 7c22997..00e45b6 100644 (file)
 
 #define DEBUG_PASSUP
 
-static inline void rm_writeb(unsigned long paddr, u8 val)
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                           u32 new_irq);
+
+/* -- ICS routines -- */
+static void ics_rm_check_resend(struct kvmppc_xics *xics,
+                               struct kvmppc_ics *ics, struct kvmppc_icp *icp)
 {
-       __asm__ __volatile__("sync; stbcix %0,0,%1"
-               : : "r" (val), "r" (paddr) : "memory");
+       int i;
+
+       arch_spin_lock(&ics->lock);
+
+       for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+               struct ics_irq_state *state = &ics->irq_state[i];
+
+               if (!state->resend)
+                       continue;
+
+               arch_spin_unlock(&ics->lock);
+               icp_rm_deliver_irq(xics, icp, state->number);
+               arch_spin_lock(&ics->lock);
+       }
+
+       arch_spin_unlock(&ics->lock);
 }
 
+/* -- ICP routines -- */
+
 static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
                                struct kvm_vcpu *this_vcpu)
 {
        struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
-       unsigned long xics_phys;
        int cpu;
 
        /* Mark the target VCPU as having an interrupt pending */
@@ -56,9 +76,8 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
        /* In SMT cpu will always point to thread 0, we adjust it */
        cpu += vcpu->arch.ptid;
 
-       /* Not too hard, then poke the target */
-       xics_phys = paca[cpu].kvm_hstate.xics_phys;
-       rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+       smp_mb();
+       kvmhv_rm_send_ipi(cpu);
 }
 
 static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
@@ -116,6 +135,180 @@ static inline int check_too_hard(struct kvmppc_xics *xics,
        return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS;
 }
 
+static void icp_rm_check_resend(struct kvmppc_xics *xics,
+                            struct kvmppc_icp *icp)
+{
+       u32 icsid;
+
+       /* Order this load with the test for need_resend in the caller */
+       smp_rmb();
+       for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+               struct kvmppc_ics *ics = xics->ics[icsid];
+
+               if (!test_and_clear_bit(icsid, icp->resend_map))
+                       continue;
+               if (!ics)
+                       continue;
+               ics_rm_check_resend(xics, ics, icp);
+       }
+}
+
+static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+                              u32 *reject)
+{
+       union kvmppc_icp_state old_state, new_state;
+       bool success;
+
+       do {
+               old_state = new_state = READ_ONCE(icp->state);
+
+               *reject = 0;
+
+               /* See if we can deliver */
+               success = new_state.cppr > priority &&
+                       new_state.mfrr > priority &&
+                       new_state.pending_pri > priority;
+
+               /*
+                * If we can, check for a rejection and perform the
+                * delivery
+                */
+               if (success) {
+                       *reject = new_state.xisr;
+                       new_state.xisr = irq;
+                       new_state.pending_pri = priority;
+               } else {
+                       /*
+                        * If we failed to deliver we set need_resend
+                        * so a subsequent CPPR state change causes us
+                        * to try a new delivery.
+                        */
+                       new_state.need_resend = true;
+               }
+
+       } while (!icp_rm_try_update(icp, old_state, new_state));
+
+       return success;
+}
+
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                           u32 new_irq)
+{
+       struct ics_irq_state *state;
+       struct kvmppc_ics *ics;
+       u32 reject;
+       u16 src;
+
+       /*
+        * This is used both for initial delivery of an interrupt and
+        * for subsequent rejection.
+        *
+        * Rejection can be racy vs. resends. We have evaluated the
+        * rejection in an atomic ICP transaction which is now complete,
+        * so potentially the ICP can already accept the interrupt again.
+        *
+        * So we need to retry the delivery. Essentially the reject path
+        * boils down to a failed delivery. Always.
+        *
+        * Now the interrupt could also have moved to a different target,
+        * thus we may need to re-do the ICP lookup as well
+        */
+
+ again:
+       /* Get the ICS state and lock it */
+       ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+       if (!ics) {
+               /* Unsafe increment, but this does not need to be accurate */
+               xics->err_noics++;
+               return;
+       }
+       state = &ics->irq_state[src];
+
+       /* Get a lock on the ICS */
+       arch_spin_lock(&ics->lock);
+
+       /* Get our server */
+       if (!icp || state->server != icp->server_num) {
+               icp = kvmppc_xics_find_server(xics->kvm, state->server);
+               if (!icp) {
+                       /* Unsafe increment again */
+                       xics->err_noicp++;
+                       goto out;
+               }
+       }
+
+       /* Clear the resend bit of that interrupt */
+       state->resend = 0;
+
+       /*
+        * If masked, bail out
+        *
+        * Note: PAPR doesn't mention anything about masked pending
+        * when doing a resend, only when doing a delivery.
+        *
+        * However that would have the effect of losing a masked
+        * interrupt that was rejected and isn't consistent with
+        * the whole masked_pending business which is about not
+        * losing interrupts that occur while masked.
+        *
+        * I don't differentiate normal deliveries and resends, this
+        * implementation will differ from PAPR and not lose such
+        * interrupts.
+        */
+       if (state->priority == MASKED) {
+               state->masked_pending = 1;
+               goto out;
+       }
+
+       /*
+        * Try the delivery, this will set the need_resend flag
+        * in the ICP as part of the atomic transaction if the
+        * delivery is not possible.
+        *
+        * Note that if successful, the new delivery might have itself
+        * rejected an interrupt that was "delivered" before we took the
+        * ics spin lock.
+        *
+        * In this case we do the whole sequence all over again for the
+        * new guy. We cannot assume that the rejected interrupt is less
+        * favored than the new one, and thus doesn't need to be delivered,
+        * because by the time we exit icp_rm_try_to_deliver() the target
+        * processor may well have already consumed & completed it, and thus
+        * the rejected interrupt might actually be already acceptable.
+        */
+       if (icp_rm_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+               /*
+                * Delivery was successful, did we reject somebody else ?
+                */
+               if (reject && reject != XICS_IPI) {
+                       arch_spin_unlock(&ics->lock);
+                       new_irq = reject;
+                       goto again;
+               }
+       } else {
+               /*
+                * We failed to deliver the interrupt we need to set the
+                * We failed to deliver the interrupt, so we need to set the
+                * resend map bit and mark the ICS state as needing a resend.
+               set_bit(ics->icsid, icp->resend_map);
+               state->resend = 1;
+
+               /*
+                * If the need_resend flag got cleared in the ICP some time
+                * between icp_rm_try_to_deliver() atomic update and now, then
+                * we know it might have missed the resend_map bit. So we
+                * retry
+                */
+               smp_mb();
+               if (!icp->state.need_resend) {
+                       arch_spin_unlock(&ics->lock);
+                       goto again;
+               }
+       }
+ out:
+       arch_spin_unlock(&ics->lock);
+}
+
 static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
                             u8 new_cppr)
 {
@@ -184,8 +377,8 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
         * separately here as well.
         */
        if (resend) {
-               icp->rm_action |= XICS_RM_CHECK_RESEND;
-               icp->rm_resend_icp = icp;
+               icp->n_check_resend++;
+               icp_rm_check_resend(xics, icp);
        }
 }
 
@@ -300,16 +493,16 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
                }
        } while (!icp_rm_try_update(icp, old_state, new_state));
 
-       /* Pass rejects to virtual mode */
+       /* Handle reject in real mode */
        if (reject && reject != XICS_IPI) {
-               this_icp->rm_action |= XICS_RM_REJECT;
-               this_icp->rm_reject = reject;
+               this_icp->n_reject++;
+               icp_rm_deliver_irq(xics, icp, reject);
        }
 
-       /* Pass resends to virtual mode */
+       /* Handle resends in real mode */
        if (resend) {
-               this_icp->rm_action |= XICS_RM_CHECK_RESEND;
-               this_icp->rm_resend_icp = icp;
+               this_icp->n_check_resend++;
+               icp_rm_check_resend(xics, icp);
        }
 
        return check_too_hard(xics, this_icp);
@@ -365,10 +558,13 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 
        } while (!icp_rm_try_update(icp, old_state, new_state));
 
-       /* Pass rejects to virtual mode */
+       /*
+        * Check for rejects. They are handled by doing a new delivery
+        * attempt (see comments in icp_rm_deliver_irq).
+        */
        if (reject && reject != XICS_IPI) {
-               icp->rm_action |= XICS_RM_REJECT;
-               icp->rm_reject = reject;
+               icp->n_reject++;
+               icp_rm_deliver_irq(xics, icp, reject);
        }
  bail:
        return check_too_hard(xics, icp);
@@ -416,10 +612,10 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
                goto bail;
        state = &ics->irq_state[src];
 
-       /* Still asserted, resend it, we make it look like a reject */
+       /* Still asserted, resend it */
        if (state->asserted) {
-               icp->rm_action |= XICS_RM_REJECT;
-               icp->rm_reject = irq;
+               icp->n_reject++;
+               icp_rm_deliver_irq(xics, icp, irq);
        }
 
        if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
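
The retry loops in icp_rm_try_to_deliver(), kvmppc_rm_h_ipi() and kvmppc_rm_h_cppr() above spin on icp_rm_try_update(), which is not shown in this hunk. Its virtual-mode counterpart in book3s_xics.c does a compare-and-swap on the packed ICP state word; a hedged sketch of that pattern (the real-mode version also defers vcpu kicks via icp->rm_action, which is omitted here):

        /*
         * Sketch of the lock-free ICP state update the loops above rely on.
         * union kvmppc_icp_state overlays cppr/mfrr/xisr/... on a single
         * 64-bit word 'raw', so the whole state can be swapped atomically.
         */
        static bool icp_rm_try_update(struct kvmppc_icp *icp,
                                      union kvmppc_icp_state old,
                                      union kvmppc_icp_state new)
        {
                /* Fails, and the caller retries, if the state changed under us. */
                return cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
        }
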
index 6cbf163..4d70df2 100644 (file)
@@ -172,6 +172,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 kvmppc_primary_no_guest:
        /* We handle this much like a ceded vcpu */
+       /* put the HDEC into the DEC, since HDEC interrupts don't wake us */
+       mfspr   r3, SPRN_HDEC
+       mtspr   SPRN_DEC, r3
+       /*
+        * Make sure the primary has finished the MMU switch.
+        * We should never get here on a secondary thread, but
+        * check it for robustness' sake.
+        */
+       ld      r5, HSTATE_KVM_VCORE(r13)
+65:    lbz     r0, VCORE_IN_GUEST(r5)
+       cmpwi   r0, 0
+       beq     65b
+       /* Set LPCR. */
+       ld      r8,VCORE_LPCR(r5)
+       mtspr   SPRN_LPCR,r8
+       isync
        /* set our bit in napping_threads */
        ld      r5, HSTATE_KVM_VCORE(r13)
        lbz     r7, HSTATE_PTID(r13)
@@ -182,7 +198,7 @@ kvmppc_primary_no_guest:
        or      r3, r3, r0
        stwcx.  r3, 0, r6
        bne     1b
-       /* order napping_threads update vs testing entry_exit_count */
+       /* order napping_threads update vs testing entry_exit_map */
        isync
        li      r12, 0
        lwz     r7, VCORE_ENTRY_EXIT(r5)
@@ -191,6 +207,7 @@ kvmppc_primary_no_guest:
        li      r3, NAPPING_NOVCPU
        stb     r3, HSTATE_NAPPING(r13)
 
+       li      r3, 0           /* Don't wake on privileged (OS) doorbell */
        b       kvm_do_nap
 
 kvm_novcpu_wakeup:
@@ -202,7 +219,7 @@ kvm_novcpu_wakeup:
 
        /* check the wake reason */
        bl      kvmppc_check_wake_reason
-       
+
        /* see if any other thread is already exiting */
        lwz     r0, VCORE_ENTRY_EXIT(r5)
        cmpwi   r0, 0x100
@@ -222,13 +239,37 @@ kvm_novcpu_wakeup:
        cmpdi   r3, 0
        bge     kvm_novcpu_exit
 
+       /* See if our timeslice has expired (HDEC is negative) */
+       mfspr   r0, SPRN_HDEC
+       li      r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+       cmpwi   r0, 0
+       blt     kvm_novcpu_exit
+
        /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
        ld      r4, HSTATE_KVM_VCPU(r13)
        cmpdi   r4, 0
-       bne     kvmppc_got_guest
+       beq     kvmppc_primary_no_guest
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       addi    r3, r4, VCPU_TB_RMENTRY
+       bl      kvmhv_start_timing
+#endif
+       b       kvmppc_got_guest
 
 kvm_novcpu_exit:
-       b       hdec_soon
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       cmpdi   r4, 0
+       beq     13f
+       addi    r3, r4, VCPU_TB_RMEXIT
+       bl      kvmhv_accumulate_time
+#endif
+13:    mr      r3, r12
+       stw     r12, 112-4(r1)
+       bl      kvmhv_commence_exit
+       nop
+       lwz     r12, 112-4(r1)
+       b       kvmhv_switch_to_host
 
 /*
  * We come in here when wakened from nap mode.
@@ -239,9 +280,9 @@ kvm_novcpu_exit:
 kvm_start_guest:
 
        /* Set runlatch bit the minute you wake up from nap */
-       mfspr   r1, SPRN_CTRLF
-       ori     r1, r1, 1
-       mtspr   SPRN_CTRLT, r1
+       mfspr   r0, SPRN_CTRLF
+       ori     r0, r0, 1
+       mtspr   SPRN_CTRLT, r0
 
        ld      r2,PACATOC(r13)
 
@@ -286,26 +327,21 @@ kvm_secondary_got_guest:
        ld      r6, PACA_DSCR(r13)
        std     r6, HSTATE_DSCR(r13)
 
+       /* Order load of vcore, ptid etc. after load of vcpu */
+       lwsync
        bl      kvmppc_hv_entry
 
        /* Back from the guest, go back to nap */
        /* Clear our vcpu pointer so we don't come back in early */
        li      r0, 0
-       std     r0, HSTATE_KVM_VCPU(r13)
        /*
-        * Make sure we clear HSTATE_KVM_VCPU(r13) before incrementing
-        * the nap_count, because once the increment to nap_count is
-        * visible we could be given another vcpu.
+        * Once we clear HSTATE_KVM_VCPU(r13), the code in
+        * kvmppc_run_core() is going to assume that all our vcpu
+        * state is visible in memory.  This lwsync makes sure
+        * that that is true.
         */
        lwsync
-
-       /* increment the nap count and then go to nap mode */
-       ld      r4, HSTATE_KVM_VCORE(r13)
-       addi    r4, r4, VCORE_NAP_COUNT
-51:    lwarx   r3, 0, r4
-       addi    r3, r3, 1
-       stwcx.  r3, 0, r4
-       bne     51b
+       std     r0, HSTATE_KVM_VCPU(r13)
 
 /*
  * At this point we have finished executing in the guest.
@@ -376,6 +412,14 @@ kvmppc_hv_entry:
        li      r6, KVM_GUEST_MODE_HOST_HV
        stb     r6, HSTATE_IN_GUEST(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       /* Store initial timestamp */
+       cmpdi   r4, 0
+       beq     1f
+       addi    r3, r4, VCPU_TB_RMENTRY
+       bl      kvmhv_start_timing
+1:
+#endif
        /* Clear out SLB */
        li      r6,0
        slbmte  r6,r6
@@ -387,21 +431,23 @@ kvmppc_hv_entry:
         * We don't have to lock against concurrent tlbies,
         * but we do have to coordinate across hardware threads.
         */
-       /* Increment entry count iff exit count is zero. */
-       ld      r5,HSTATE_KVM_VCORE(r13)
-       addi    r9,r5,VCORE_ENTRY_EXIT
-21:    lwarx   r3,0,r9
-       cmpwi   r3,0x100                /* any threads starting to exit? */
+       /* Set bit in entry map iff exit map is zero. */
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       li      r7, 1
+       lbz     r6, HSTATE_PTID(r13)
+       sld     r7, r7, r6
+       addi    r9, r5, VCORE_ENTRY_EXIT
+21:    lwarx   r3, 0, r9
+       cmpwi   r3, 0x100               /* any threads starting to exit? */
        bge     secondary_too_late      /* if so we're too late to the party */
-       addi    r3,r3,1
-       stwcx.  r3,0,r9
+       or      r3, r3, r7
+       stwcx.  r3, 0, r9
        bne     21b
 
        /* Primary thread switches to guest partition. */
        ld      r9,VCORE_KVM(r5)        /* pointer to struct kvm */
-       lbz     r6,HSTATE_PTID(r13)
        cmpwi   r6,0
-       bne     20f
+       bne     10f
        ld      r6,KVM_SDR1(r9)
        lwz     r7,KVM_LPID(r9)
        li      r0,LPID_RSVD            /* switch to reserved LPID */
@@ -472,28 +518,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
        li      r0,1
        stb     r0,VCORE_IN_GUEST(r5)   /* signal secondaries to continue */
-       b       10f
-
-       /* Secondary threads wait for primary to have done partition switch */
-20:    lbz     r0,VCORE_IN_GUEST(r5)
-       cmpwi   r0,0
-       beq     20b
-
-       /* Set LPCR and RMOR. */
-10:    ld      r8,VCORE_LPCR(r5)
-       mtspr   SPRN_LPCR,r8
-       ld      r8,KVM_RMOR(r9)
-       mtspr   SPRN_RMOR,r8
-       isync
-
-       /* Check if HDEC expires soon */
-       mfspr   r3,SPRN_HDEC
-       cmpwi   r3,512          /* 1 microsecond */
-       li      r12,BOOK3S_INTERRUPT_HV_DECREMENTER
-       blt     hdec_soon
 
        /* Do we have a guest vcpu to run? */
-       cmpdi   r4, 0
+10:    cmpdi   r4, 0
        beq     kvmppc_primary_no_guest
 kvmppc_got_guest:
 
@@ -818,6 +845,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        clrrdi  r6,r6,1
        mtspr   SPRN_CTRLT,r6
 4:
+       /* Secondary threads wait for primary to have done partition switch */
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       lbz     r6, HSTATE_PTID(r13)
+       cmpwi   r6, 0
+       beq     21f
+       lbz     r0, VCORE_IN_GUEST(r5)
+       cmpwi   r0, 0
+       bne     21f
+       HMT_LOW
+20:    lbz     r0, VCORE_IN_GUEST(r5)
+       cmpwi   r0, 0
+       beq     20b
+       HMT_MEDIUM
+21:
+       /* Set LPCR. */
+       ld      r8,VCORE_LPCR(r5)
+       mtspr   SPRN_LPCR,r8
+       isync
+
+       /* Check if HDEC expires soon */
+       mfspr   r3, SPRN_HDEC
+       cmpwi   r3, 512         /* 1 microsecond */
+       blt     hdec_soon
+
        ld      r6, VCPU_CTR(r4)
        lwz     r7, VCPU_XER(r4)
 
@@ -880,6 +931,12 @@ fast_guest_return:
        li      r9, KVM_GUEST_MODE_GUEST_HV
        stb     r9, HSTATE_IN_GUEST(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       /* Accumulate timing */
+       addi    r3, r4, VCPU_TB_GUEST
+       bl      kvmhv_accumulate_time
+#endif
+
        /* Enter guest */
 
 BEGIN_FTR_SECTION
@@ -917,6 +974,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        hrfid
        b       .
 
+secondary_too_late:
+       li      r12, 0
+       cmpdi   r4, 0
+       beq     11f
+       stw     r12, VCPU_TRAP(r4)
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       addi    r3, r4, VCPU_TB_RMEXIT
+       bl      kvmhv_accumulate_time
+#endif
+11:    b       kvmhv_switch_to_host
+
+hdec_soon:
+       li      r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+       stw     r12, VCPU_TRAP(r4)
+       mr      r9, r4
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       addi    r3, r4, VCPU_TB_RMEXIT
+       bl      kvmhv_accumulate_time
+#endif
+       b       guest_exit_cont
+
 /******************************************************************************
  *                                                                            *
  *                               Exit code                                    *
@@ -1002,6 +1080,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
        stw     r12,VCPU_TRAP(r9)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       addi    r3, r9, VCPU_TB_RMINTR
+       mr      r4, r9
+       bl      kvmhv_accumulate_time
+       ld      r5, VCPU_GPR(R5)(r9)
+       ld      r6, VCPU_GPR(R6)(r9)
+       ld      r7, VCPU_GPR(R7)(r9)
+       ld      r8, VCPU_GPR(R8)(r9)
+#endif
+
        /* Save HEIR (HV emulation assist reg) in emul_inst
           if this is an HEI (HV emulation interrupt, e40) */
        li      r3,KVM_INST_FETCH_FAILED
@@ -1028,34 +1116,37 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        bne     2f
        mfspr   r3,SPRN_HDEC
        cmpwi   r3,0
-       bge     ignore_hdec
+       mr      r4,r9
+       bge     fast_guest_return
 2:
        /* See if this is an hcall we can handle in real mode */
        cmpwi   r12,BOOK3S_INTERRUPT_SYSCALL
        beq     hcall_try_real_mode
 
+       /* Hypervisor doorbell - exit only if host IPI flag set */
+       cmpwi   r12, BOOK3S_INTERRUPT_H_DOORBELL
+       bne     3f
+       lbz     r0, HSTATE_HOST_IPI(r13)
+       beq     4f
+       b       guest_exit_cont
+3:
        /* External interrupt ? */
        cmpwi   r12, BOOK3S_INTERRUPT_EXTERNAL
-       bne+    ext_interrupt_to_host
+       bne+    guest_exit_cont
 
        /* External interrupt, first check for host_ipi. If this is
         * set, we know the host wants us out so let's do it now
         */
        bl      kvmppc_read_intr
        cmpdi   r3, 0
-       bgt     ext_interrupt_to_host
+       bgt     guest_exit_cont
 
        /* Check if any CPU is heading out to the host, if so head out too */
-       ld      r5, HSTATE_KVM_VCORE(r13)
+4:     ld      r5, HSTATE_KVM_VCORE(r13)
        lwz     r0, VCORE_ENTRY_EXIT(r5)
        cmpwi   r0, 0x100
-       bge     ext_interrupt_to_host
-
-       /* Return to guest after delivering any pending interrupt */
        mr      r4, r9
-       b       deliver_guest_interrupt
-
-ext_interrupt_to_host:
+       blt     deliver_guest_interrupt
 
 guest_exit_cont:               /* r9 = vcpu, r12 = trap, r13 = paca */
        /* Save more register state  */
@@ -1065,7 +1156,7 @@ guest_exit_cont:          /* r9 = vcpu, r12 = trap, r13 = paca */
        stw     r7, VCPU_DSISR(r9)
        /* don't overwrite fault_dar/fault_dsisr if HDSI */
        cmpwi   r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
-       beq     6f
+       beq     mc_cont
        std     r6, VCPU_FAULT_DAR(r9)
        stw     r7, VCPU_FAULT_DSISR(r9)
 
@@ -1073,9 +1164,20 @@ guest_exit_cont:         /* r9 = vcpu, r12 = trap, r13 = paca */
        cmpwi   r12, BOOK3S_INTERRUPT_MACHINE_CHECK
        beq     machine_check_realmode
 mc_cont:
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       addi    r3, r9, VCPU_TB_RMEXIT
+       mr      r4, r9
+       bl      kvmhv_accumulate_time
+#endif
+
+       /* Increment exit count, poke other threads to exit */
+       bl      kvmhv_commence_exit
+       nop
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       lwz     r12, VCPU_TRAP(r9)
 
        /* Save guest CTRL register, set runlatch to 1 */
-6:     mfspr   r6,SPRN_CTRLF
+       mfspr   r6,SPRN_CTRLF
        stw     r6,VCPU_CTRL(r9)
        andi.   r0,r6,1
        bne     4f
@@ -1417,68 +1519,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        slbia
        ptesync
 
-hdec_soon:                     /* r12 = trap, r13 = paca */
        /*
         * POWER7/POWER8 guest -> host partition switch code.
         * We don't have to lock against tlbies but we do
         * have to coordinate the hardware threads.
         */
-       /* Increment the threads-exiting-guest count in the 0xff00
-          bits of vcore->entry_exit_count */
-       ld      r5,HSTATE_KVM_VCORE(r13)
-       addi    r6,r5,VCORE_ENTRY_EXIT
-41:    lwarx   r3,0,r6
-       addi    r0,r3,0x100
-       stwcx.  r0,0,r6
-       bne     41b
-       isync           /* order stwcx. vs. reading napping_threads */
-
-       /*
-        * At this point we have an interrupt that we have to pass
-        * up to the kernel or qemu; we can't handle it in real mode.
-        * Thus we have to do a partition switch, so we have to
-        * collect the other threads, if we are the first thread
-        * to take an interrupt.  To do this, we set the HDEC to 0,
-        * which causes an HDEC interrupt in all threads within 2ns
-        * because the HDEC register is shared between all 4 threads.
-        * However, we don't need to bother if this is an HDEC
-        * interrupt, since the other threads will already be on their
-        * way here in that case.
-        */
-       cmpwi   r3,0x100        /* Are we the first here? */
-       bge     43f
-       cmpwi   r12,BOOK3S_INTERRUPT_HV_DECREMENTER
-       beq     40f
-       li      r0,0
-       mtspr   SPRN_HDEC,r0
-40:
-       /*
-        * Send an IPI to any napping threads, since an HDEC interrupt
-        * doesn't wake CPUs up from nap.
-        */
-       lwz     r3,VCORE_NAPPING_THREADS(r5)
-       lbz     r4,HSTATE_PTID(r13)
-       li      r0,1
-       sld     r0,r0,r4
-       andc.   r3,r3,r0                /* no sense IPI'ing ourselves */
-       beq     43f
-       /* Order entry/exit update vs. IPIs */
-       sync
-       mulli   r4,r4,PACA_SIZE         /* get paca for thread 0 */
-       subf    r6,r4,r13
-42:    andi.   r0,r3,1
-       beq     44f
-       ld      r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */
-       li      r0,IPI_PRIORITY
-       li      r7,XICS_MFRR
-       stbcix  r0,r7,r8                /* trigger the IPI */
-44:    srdi.   r3,r3,1
-       addi    r6,r6,PACA_SIZE
-       bne     42b
-
-secondary_too_late:
+kvmhv_switch_to_host:
        /* Secondary threads wait for primary to do partition switch */
-43:    ld      r5,HSTATE_KVM_VCORE(r13)
+       ld      r5,HSTATE_KVM_VCORE(r13)
        ld      r4,VCORE_KVM(r5)        /* pointer to struct kvm */
        lbz     r3,HSTATE_PTID(r13)
        cmpwi   r3,0
@@ -1562,6 +1610,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 1:     addi    r8,r8,16
        .endr
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       /* Finish timing, if we have a vcpu */
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       cmpdi   r4, 0
+       li      r3, 0
+       beq     2f
+       bl      kvmhv_accumulate_time
+2:
+#endif
        /* Unset guest mode */
        li      r0, KVM_GUEST_MODE_NONE
        stb     r0, HSTATE_IN_GUEST(r13)
@@ -1696,8 +1753,10 @@ kvmppc_hisi:
  * Returns to the guest if we handle it, or continues on up to
  * the kernel if we can't (i.e. if we don't have a handler for
  * it, or if the handler returns H_TOO_HARD).
+ *
+ * r5 - r8 contain hcall args,
+ * r9 = vcpu, r10 = pc, r11 = msr, r12 = trap, r13 = paca
  */
-       .globl  hcall_try_real_mode
 hcall_try_real_mode:
        ld      r3,VCPU_GPR(R3)(r9)
        andi.   r0,r11,MSR_PR
@@ -1839,13 +1898,124 @@ hcall_real_table:
        .long   0               /* 0x12c */
        .long   0               /* 0x130 */
        .long   DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
+       .long   0               /* 0x138 */
+       .long   0               /* 0x13c */
+       .long   0               /* 0x140 */
+       .long   0               /* 0x144 */
+       .long   0               /* 0x148 */
+       .long   0               /* 0x14c */
+       .long   0               /* 0x150 */
+       .long   0               /* 0x154 */
+       .long   0               /* 0x158 */
+       .long   0               /* 0x15c */
+       .long   0               /* 0x160 */
+       .long   0               /* 0x164 */
+       .long   0               /* 0x168 */
+       .long   0               /* 0x16c */
+       .long   0               /* 0x170 */
+       .long   0               /* 0x174 */
+       .long   0               /* 0x178 */
+       .long   0               /* 0x17c */
+       .long   0               /* 0x180 */
+       .long   0               /* 0x184 */
+       .long   0               /* 0x188 */
+       .long   0               /* 0x18c */
+       .long   0               /* 0x190 */
+       .long   0               /* 0x194 */
+       .long   0               /* 0x198 */
+       .long   0               /* 0x19c */
+       .long   0               /* 0x1a0 */
+       .long   0               /* 0x1a4 */
+       .long   0               /* 0x1a8 */
+       .long   0               /* 0x1ac */
+       .long   0               /* 0x1b0 */
+       .long   0               /* 0x1b4 */
+       .long   0               /* 0x1b8 */
+       .long   0               /* 0x1bc */
+       .long   0               /* 0x1c0 */
+       .long   0               /* 0x1c4 */
+       .long   0               /* 0x1c8 */
+       .long   0               /* 0x1cc */
+       .long   0               /* 0x1d0 */
+       .long   0               /* 0x1d4 */
+       .long   0               /* 0x1d8 */
+       .long   0               /* 0x1dc */
+       .long   0               /* 0x1e0 */
+       .long   0               /* 0x1e4 */
+       .long   0               /* 0x1e8 */
+       .long   0               /* 0x1ec */
+       .long   0               /* 0x1f0 */
+       .long   0               /* 0x1f4 */
+       .long   0               /* 0x1f8 */
+       .long   0               /* 0x1fc */
+       .long   0               /* 0x200 */
+       .long   0               /* 0x204 */
+       .long   0               /* 0x208 */
+       .long   0               /* 0x20c */
+       .long   0               /* 0x210 */
+       .long   0               /* 0x214 */
+       .long   0               /* 0x218 */
+       .long   0               /* 0x21c */
+       .long   0               /* 0x220 */
+       .long   0               /* 0x224 */
+       .long   0               /* 0x228 */
+       .long   0               /* 0x22c */
+       .long   0               /* 0x230 */
+       .long   0               /* 0x234 */
+       .long   0               /* 0x238 */
+       .long   0               /* 0x23c */
+       .long   0               /* 0x240 */
+       .long   0               /* 0x244 */
+       .long   0               /* 0x248 */
+       .long   0               /* 0x24c */
+       .long   0               /* 0x250 */
+       .long   0               /* 0x254 */
+       .long   0               /* 0x258 */
+       .long   0               /* 0x25c */
+       .long   0               /* 0x260 */
+       .long   0               /* 0x264 */
+       .long   0               /* 0x268 */
+       .long   0               /* 0x26c */
+       .long   0               /* 0x270 */
+       .long   0               /* 0x274 */
+       .long   0               /* 0x278 */
+       .long   0               /* 0x27c */
+       .long   0               /* 0x280 */
+       .long   0               /* 0x284 */
+       .long   0               /* 0x288 */
+       .long   0               /* 0x28c */
+       .long   0               /* 0x290 */
+       .long   0               /* 0x294 */
+       .long   0               /* 0x298 */
+       .long   0               /* 0x29c */
+       .long   0               /* 0x2a0 */
+       .long   0               /* 0x2a4 */
+       .long   0               /* 0x2a8 */
+       .long   0               /* 0x2ac */
+       .long   0               /* 0x2b0 */
+       .long   0               /* 0x2b4 */
+       .long   0               /* 0x2b8 */
+       .long   0               /* 0x2bc */
+       .long   0               /* 0x2c0 */
+       .long   0               /* 0x2c4 */
+       .long   0               /* 0x2c8 */
+       .long   0               /* 0x2cc */
+       .long   0               /* 0x2d0 */
+       .long   0               /* 0x2d4 */
+       .long   0               /* 0x2d8 */
+       .long   0               /* 0x2dc */
+       .long   0               /* 0x2e0 */
+       .long   0               /* 0x2e4 */
+       .long   0               /* 0x2e8 */
+       .long   0               /* 0x2ec */
+       .long   0               /* 0x2f0 */
+       .long   0               /* 0x2f4 */
+       .long   0               /* 0x2f8 */
+       .long   0               /* 0x2fc */
+       .long   DOTSYM(kvmppc_h_random) - hcall_real_table
        .globl  hcall_real_table_end
 hcall_real_table_end:
 
-ignore_hdec:
-       mr      r4,r9
-       b       fast_guest_return
-
 _GLOBAL(kvmppc_h_set_xdabr)
        andi.   r0, r5, DABRX_USER | DABRX_KERNEL
        beq     6f
@@ -1884,7 +2054,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        li      r3, 0
        blr
 
-_GLOBAL(kvmppc_h_cede)
+_GLOBAL(kvmppc_h_cede)         /* r3 = vcpu pointer, r11 = msr, r13 = paca */
        ori     r11,r11,MSR_EE
        std     r11,VCPU_MSR(r3)
        li      r0,1
@@ -1893,8 +2063,8 @@ _GLOBAL(kvmppc_h_cede)
        lbz     r5,VCPU_PRODDED(r3)
        cmpwi   r5,0
        bne     kvm_cede_prodded
-       li      r0,0            /* set trap to 0 to say hcall is handled */
-       stw     r0,VCPU_TRAP(r3)
+       li      r12,0           /* set trap to 0 to say hcall is handled */
+       stw     r12,VCPU_TRAP(r3)
        li      r0,H_SUCCESS
        std     r0,VCPU_GPR(R3)(r3)
 
@@ -1912,12 +2082,11 @@ _GLOBAL(kvmppc_h_cede)
        addi    r6,r5,VCORE_NAPPING_THREADS
 31:    lwarx   r4,0,r6
        or      r4,r4,r0
-       PPC_POPCNTW(R7,R4)
-       cmpw    r7,r8
-       bge     kvm_cede_exit
+       cmpw    r4,r8
+       beq     kvm_cede_exit
        stwcx.  r4,0,r6
        bne     31b
-       /* order napping_threads update vs testing entry_exit_count */
+       /* order napping_threads update vs testing entry_exit_map */
        isync
        li      r0,NAPPING_CEDE
        stb     r0,HSTATE_NAPPING(r13)
@@ -1955,21 +2124,52 @@ _GLOBAL(kvmppc_h_cede)
        bl      kvmppc_save_fp
 
        /*
+        * Set DEC to the smaller of DEC and HDEC, so that we wake
+        * no later than the end of our timeslice (HDEC interrupts
+        * don't wake us from nap).
+        */
+       mfspr   r3, SPRN_DEC
+       mfspr   r4, SPRN_HDEC
+       mftb    r5
+       cmpw    r3, r4
+       ble     67f
+       mtspr   SPRN_DEC, r4
+67:
+       /* save expiry time of guest decrementer */
+       extsw   r3, r3
+       add     r3, r3, r5
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       ld      r6, VCORE_TB_OFFSET(r5)
+       subf    r3, r6, r3      /* convert to host TB value */
+       std     r3, VCPU_DEC_EXPIRES(r4)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       addi    r3, r4, VCPU_TB_CEDE
+       bl      kvmhv_accumulate_time
+#endif
+
+       lis     r3, LPCR_PECEDP@h       /* Do wake on privileged doorbell */
+
+       /*
         * Take a nap until a decrementer or external or doorbell interrupt
-        * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the
-        * runlatch bit before napping.
+        * occurs, with PECE1 and PECE0 set in LPCR.
+        * On POWER8, set PECEDH, and if we are ceding, also set PECEDP.
+        * Also clear the runlatch bit before napping.
         */
 kvm_do_nap:
-       mfspr   r2, SPRN_CTRLF
-       clrrdi  r2, r2, 1
-       mtspr   SPRN_CTRLT, r2
+       mfspr   r0, SPRN_CTRLF
+       clrrdi  r0, r0, 1
+       mtspr   SPRN_CTRLT, r0
 
        li      r0,1
        stb     r0,HSTATE_HWTHREAD_REQ(r13)
        mfspr   r5,SPRN_LPCR
        ori     r5,r5,LPCR_PECE0 | LPCR_PECE1
 BEGIN_FTR_SECTION
-       oris    r5,r5,LPCR_PECEDP@h
+       ori     r5, r5, LPCR_PECEDH
+       rlwimi  r5, r3, 0, LPCR_PECEDP
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        mtspr   SPRN_LPCR,r5
        isync
@@ -1994,9 +2194,23 @@ kvm_end_cede:
        /* Woken by external or decrementer interrupt */
        ld      r1, HSTATE_HOST_R1(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       addi    r3, r4, VCPU_TB_RMINTR
+       bl      kvmhv_accumulate_time
+#endif
+
        /* load up FP state */
        bl      kvmppc_load_fp
 
+       /* Restore guest decrementer */
+       ld      r3, VCPU_DEC_EXPIRES(r4)
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       ld      r6, VCORE_TB_OFFSET(r5)
+       add     r3, r3, r6      /* convert host TB to guest TB value */
+       mftb    r7
+       subf    r3, r7, r3
+       mtspr   SPRN_DEC, r3
+
        /* Load NV GPRS */
        ld      r14, VCPU_GPR(R14)(r4)
        ld      r15, VCPU_GPR(R15)(r4)
@@ -2057,7 +2271,8 @@ kvm_cede_prodded:
 
        /* we've ceded but we want to give control to the host */
 kvm_cede_exit:
-       b       hcall_real_fallback
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       b       guest_exit_cont
 
        /* Try to handle a machine check in real mode */
 machine_check_realmode:
@@ -2089,13 +2304,14 @@ machine_check_realmode:
 
 /*
  * Check the reason we woke from nap, and take appropriate action.
- * Returns:
+ * Returns (in r3):
  *     0 if nothing needs to be done
  *     1 if something happened that needs to be handled by the host
- *     -1 if there was a guest wakeup (IPI)
+ *     -1 if there was a guest wakeup (IPI or msgsnd)
  *
  * Also sets r12 to the interrupt vector for any interrupt that needs
  * to be handled now by the host (0x500 for external interrupt), or zero.
+ * Modifies r0, r6, r7, r8.
  */
 kvmppc_check_wake_reason:
        mfspr   r6, SPRN_SRR1
@@ -2122,7 +2338,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
        /* hypervisor doorbell */
 3:     li      r12, BOOK3S_INTERRUPT_H_DOORBELL
+       /* see if it's a host IPI */
        li      r3, 1
+       lbz     r0, HSTATE_HOST_IPI(r13)
+       cmpwi   r0, 0
+       bnelr
+       /* if not, clear it and return -1 */
+       lis     r6, (PPC_DBELL_SERVER << (63-36))@h
+       PPC_MSGCLR(6)
+       li      r3, -1
        blr
 
 /*
@@ -2131,6 +2355,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  *     0 if no interrupt is pending
  *     1 if an interrupt is pending that needs to be handled by the host
  *     -1 if there was a guest wakeup IPI (which has now been cleared)
+ * Modifies r0, r6, r7, r8, returns value in r3.
  */
 kvmppc_read_intr:
        /* see if a host IPI is pending */
@@ -2185,6 +2410,7 @@ kvmppc_read_intr:
        bne-    43f
 
        /* OK, it's an IPI for us */
+       li      r12, 0
        li      r3, -1
 1:     blr
 
@@ -2314,3 +2540,62 @@ kvmppc_fix_pmao:
        mtspr   SPRN_PMC6, r3
        isync
        blr
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+/*
+ * Start timing an activity
+ * r3 = pointer to time accumulation struct, r4 = vcpu
+ */
+kvmhv_start_timing:
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       lbz     r6, VCORE_IN_GUEST(r5)
+       cmpwi   r6, 0
+       beq     5f                              /* if in guest, need to */
+       ld      r6, VCORE_TB_OFFSET(r5)         /* subtract timebase offset */
+5:     mftb    r5
+       subf    r5, r6, r5
+       std     r3, VCPU_CUR_ACTIVITY(r4)
+       std     r5, VCPU_ACTIVITY_START(r4)
+       blr
+
+/*
+ * Accumulate time to one activity and start another.
+ * r3 = pointer to new time accumulation struct, r4 = vcpu
+ */
+kvmhv_accumulate_time:
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       lbz     r8, VCORE_IN_GUEST(r5)
+       cmpwi   r8, 0
+       beq     4f                              /* if in guest, need to */
+       ld      r8, VCORE_TB_OFFSET(r5)         /* subtract timebase offset */
+4:     ld      r5, VCPU_CUR_ACTIVITY(r4)
+       ld      r6, VCPU_ACTIVITY_START(r4)
+       std     r3, VCPU_CUR_ACTIVITY(r4)
+       mftb    r7
+       subf    r7, r8, r7
+       std     r7, VCPU_ACTIVITY_START(r4)
+       cmpdi   r5, 0
+       beqlr
+       subf    r3, r6, r7
+       ld      r8, TAS_SEQCOUNT(r5)
+       cmpdi   r8, 0
+       addi    r8, r8, 1
+       std     r8, TAS_SEQCOUNT(r5)
+       lwsync
+       ld      r7, TAS_TOTAL(r5)
+       add     r7, r7, r3
+       std     r7, TAS_TOTAL(r5)
+       ld      r6, TAS_MIN(r5)
+       ld      r7, TAS_MAX(r5)
+       beq     3f
+       cmpd    r3, r6
+       bge     1f
+3:     std     r3, TAS_MIN(r5)
+1:     cmpd    r3, r7
+       ble     2f
+       std     r3, TAS_MAX(r5)
+2:     lwsync
+       addi    r8, r8, 1
+       std     r8, TAS_SEQCOUNT(r5)
+       blr
+#endif
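
kvmhv_accumulate_time above protects each accumulator with a hand-rolled seqcount: TAS_SEQCOUNT is bumped (making it odd), the total/min/max are updated between lwsync barriers, and the count is bumped again. A host-side consumer would read the totals roughly like this (a sketch; the struct layout mirrors the TAS_* offsets used in the asm and is otherwise an assumption):

        /* Assumed layout behind the TAS_* asm offsets. */
        struct kvmhv_tb_accumulator {
                u64     seqcount;       /* odd while an update is in progress */
                u64     tb_total;
                u64     tb_min;
                u64     tb_max;
        };

        static u64 read_activity_total(struct kvmhv_tb_accumulator *acc)
        {
                u64 seq, total;

                do {
                        seq = READ_ONCE(acc->seqcount);
                        smp_rmb();              /* pairs with the lwsync in the asm */
                        total = acc->tb_total;
                        smp_rmb();
                } while ((seq & 1) || READ_ONCE(acc->seqcount) != seq);

                return total;
        }
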
index ce3c893..f2c75a1 100644 (file)
@@ -258,6 +258,28 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
        return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+       long rc;
+
+       rc = kvmppc_h_logical_ci_load(vcpu);
+       if (rc == H_TOO_HARD)
+               return EMULATE_FAIL;
+       kvmppc_set_gpr(vcpu, 3, rc);
+       return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+       long rc;
+
+       rc = kvmppc_h_logical_ci_store(vcpu);
+       if (rc == H_TOO_HARD)
+               return EMULATE_FAIL;
+       kvmppc_set_gpr(vcpu, 3, rc);
+       return EMULATE_DONE;
+}
+
 static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 {
        long rc = kvmppc_xics_hcall(vcpu, cmd);
@@ -290,6 +312,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
                vcpu->stat.halt_wakeup++;
                return EMULATE_DONE;
+       case H_LOGICAL_CI_LOAD:
+               return kvmppc_h_pr_logical_ci_load(vcpu);
+       case H_LOGICAL_CI_STORE:
+               return kvmppc_h_pr_logical_ci_store(vcpu);
        case H_XIRR:
        case H_CPPR:
        case H_EOI:
@@ -323,6 +349,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd)
        case H_BULK_REMOVE:
        case H_PUT_TCE:
        case H_CEDE:
+       case H_LOGICAL_CI_LOAD:
+       case H_LOGICAL_CI_STORE:
 #ifdef CONFIG_KVM_XICS
        case H_XIRR:
        case H_CPPR:
index a4a8d9f..8f3e6cc 100644 (file)
@@ -20,6 +20,7 @@
 #include <asm/xics.h>
 #include <asm/debug.h>
 #include <asm/time.h>
+#include <asm/spinlock.h>
 
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
@@ -39,7 +40,7 @@
  * LOCKING
  * =======
  *
- * Each ICS has a mutex protecting the information about the IRQ
+ * Each ICS has a spin lock protecting the information about the IRQ
  * sources and avoiding simultaneous deliveries of the same interrupt.
  *
  * ICP operations are done via a single compare & swap transaction
@@ -109,7 +110,10 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 {
        int i;
 
-       mutex_lock(&ics->lock);
+       unsigned long flags;
+
+       local_irq_save(flags);
+       arch_spin_lock(&ics->lock);
 
        for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
                struct ics_irq_state *state = &ics->irq_state[i];
@@ -120,12 +124,15 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
                XICS_DBG("resend %#x prio %#x\n", state->number,
                              state->priority);
 
-               mutex_unlock(&ics->lock);
+               arch_spin_unlock(&ics->lock);
+               local_irq_restore(flags);
                icp_deliver_irq(xics, icp, state->number);
-               mutex_lock(&ics->lock);
+               local_irq_save(flags);
+               arch_spin_lock(&ics->lock);
        }
 
-       mutex_unlock(&ics->lock);
+       arch_spin_unlock(&ics->lock);
+       local_irq_restore(flags);
 }
 
 static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
@@ -133,8 +140,10 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
                       u32 server, u32 priority, u32 saved_priority)
 {
        bool deliver;
+       unsigned long flags;
 
-       mutex_lock(&ics->lock);
+       local_irq_save(flags);
+       arch_spin_lock(&ics->lock);
 
        state->server = server;
        state->priority = priority;
@@ -145,7 +154,8 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
                deliver = true;
        }
 
-       mutex_unlock(&ics->lock);
+       arch_spin_unlock(&ics->lock);
+       local_irq_restore(flags);
 
        return deliver;
 }
@@ -186,6 +196,7 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
        struct kvmppc_ics *ics;
        struct ics_irq_state *state;
        u16 src;
+       unsigned long flags;
 
        if (!xics)
                return -ENODEV;
@@ -195,10 +206,12 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
                return -EINVAL;
        state = &ics->irq_state[src];
 
-       mutex_lock(&ics->lock);
+       local_irq_save(flags);
+       arch_spin_lock(&ics->lock);
        *server = state->server;
        *priority = state->priority;
-       mutex_unlock(&ics->lock);
+       arch_spin_unlock(&ics->lock);
+       local_irq_restore(flags);
 
        return 0;
 }
@@ -365,6 +378,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
        struct kvmppc_ics *ics;
        u32 reject;
        u16 src;
+       unsigned long flags;
 
        /*
         * This is used both for initial delivery of an interrupt and
@@ -391,7 +405,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
        state = &ics->irq_state[src];
 
        /* Get a lock on the ICS */
-       mutex_lock(&ics->lock);
+       local_irq_save(flags);
+       arch_spin_lock(&ics->lock);
 
        /* Get our server */
        if (!icp || state->server != icp->server_num) {
@@ -434,7 +449,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
         *
         * Note that if successful, the new delivery might have itself
         * rejected an interrupt that was "delivered" before we took the
-        * icp mutex.
+        * ics spin lock.
         *
         * In this case we do the whole sequence all over again for the
         * new guy. We cannot assume that the rejected interrupt is less
@@ -448,7 +463,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
                 * Delivery was successful, did we reject somebody else ?
                 */
                if (reject && reject != XICS_IPI) {
-                       mutex_unlock(&ics->lock);
+                       arch_spin_unlock(&ics->lock);
+                       local_irq_restore(flags);
                        new_irq = reject;
                        goto again;
                }
@@ -468,12 +484,14 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
                 */
                smp_mb();
                if (!icp->state.need_resend) {
-                       mutex_unlock(&ics->lock);
+                       arch_spin_unlock(&ics->lock);
+                       local_irq_restore(flags);
                        goto again;
                }
        }
  out:
-       mutex_unlock(&ics->lock);
+       arch_spin_unlock(&ics->lock);
+       local_irq_restore(flags);
 }
 
 static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
@@ -802,14 +820,22 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
        XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
                 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);
 
-       if (icp->rm_action & XICS_RM_KICK_VCPU)
+       if (icp->rm_action & XICS_RM_KICK_VCPU) {
+               icp->n_rm_kick_vcpu++;
                kvmppc_fast_vcpu_kick(icp->rm_kick_target);
-       if (icp->rm_action & XICS_RM_CHECK_RESEND)
+       }
+       if (icp->rm_action & XICS_RM_CHECK_RESEND) {
+               icp->n_rm_check_resend++;
                icp_check_resend(xics, icp->rm_resend_icp);
-       if (icp->rm_action & XICS_RM_REJECT)
+       }
+       if (icp->rm_action & XICS_RM_REJECT) {
+               icp->n_rm_reject++;
                icp_deliver_irq(xics, icp, icp->rm_reject);
-       if (icp->rm_action & XICS_RM_NOTIFY_EOI)
+       }
+       if (icp->rm_action & XICS_RM_NOTIFY_EOI) {
+               icp->n_rm_notify_eoi++;
                kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq);
+       }
 
        icp->rm_action = 0;
 
@@ -872,10 +898,21 @@ static int xics_debug_show(struct seq_file *m, void *private)
        struct kvm *kvm = xics->kvm;
        struct kvm_vcpu *vcpu;
        int icsid, i;
+       unsigned long flags;
+       unsigned long t_rm_kick_vcpu, t_rm_check_resend;
+       unsigned long t_rm_reject, t_rm_notify_eoi;
+       unsigned long t_reject, t_check_resend;
 
        if (!kvm)
                return 0;
 
+       t_rm_kick_vcpu = 0;
+       t_rm_notify_eoi = 0;
+       t_rm_check_resend = 0;
+       t_rm_reject = 0;
+       t_check_resend = 0;
+       t_reject = 0;
+
        seq_printf(m, "=========\nICP state\n=========\n");
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -890,8 +927,19 @@ static int xics_debug_show(struct seq_file *m, void *private)
                           icp->server_num, state.xisr,
                           state.pending_pri, state.cppr, state.mfrr,
                           state.out_ee, state.need_resend);
+               t_rm_kick_vcpu += icp->n_rm_kick_vcpu;
+               t_rm_notify_eoi += icp->n_rm_notify_eoi;
+               t_rm_check_resend += icp->n_rm_check_resend;
+               t_rm_reject += icp->n_rm_reject;
+               t_check_resend += icp->n_check_resend;
+               t_reject += icp->n_reject;
        }
 
+       seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n",
+                       t_rm_kick_vcpu, t_rm_check_resend,
+                       t_rm_reject, t_rm_notify_eoi);
+       seq_printf(m, "ICP Real Mode totals: check_resend=%lu reject=%lu\n",
+                       t_check_resend, t_reject);
        for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
                struct kvmppc_ics *ics = xics->ics[icsid];
 
@@ -901,7 +949,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
                seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
                           icsid);
 
-               mutex_lock(&ics->lock);
+               local_irq_save(flags);
+               arch_spin_lock(&ics->lock);
 
                for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
                        struct ics_irq_state *irq = &ics->irq_state[i];
@@ -912,7 +961,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
                                   irq->resend, irq->masked_pending);
 
                }
-               mutex_unlock(&ics->lock);
+               arch_spin_unlock(&ics->lock);
+               local_irq_restore(flags);
        }
        return 0;
 }
@@ -965,7 +1015,6 @@ static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
        if (!ics)
                goto out;
 
-       mutex_init(&ics->lock);
        ics->icsid = icsid;
 
        for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
@@ -1107,13 +1156,15 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
        u64 __user *ubufp = (u64 __user *) addr;
        u16 idx;
        u64 val, prio;
+       unsigned long flags;
 
        ics = kvmppc_xics_find_ics(xics, irq, &idx);
        if (!ics)
                return -ENOENT;
 
        irqp = &ics->irq_state[idx];
-       mutex_lock(&ics->lock);
+       local_irq_save(flags);
+       arch_spin_lock(&ics->lock);
        ret = -ENOENT;
        if (irqp->exists) {
                val = irqp->server;
@@ -1129,7 +1180,8 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
                        val |= KVM_XICS_PENDING;
                ret = 0;
        }
-       mutex_unlock(&ics->lock);
+       arch_spin_unlock(&ics->lock);
+       local_irq_restore(flags);
 
        if (!ret && put_user(val, ubufp))
                ret = -EFAULT;
@@ -1146,6 +1198,7 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
        u64 val;
        u8 prio;
        u32 server;
+       unsigned long flags;
 
        if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
                return -ENOENT;
@@ -1166,7 +1219,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
            kvmppc_xics_find_server(xics->kvm, server) == NULL)
                return -EINVAL;
 
-       mutex_lock(&ics->lock);
+       local_irq_save(flags);
+       arch_spin_lock(&ics->lock);
        irqp->server = server;
        irqp->saved_priority = prio;
        if (val & KVM_XICS_MASKED)
@@ -1178,7 +1232,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
        if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE))
                irqp->asserted = 1;
        irqp->exists = 1;
-       mutex_unlock(&ics->lock);
+       arch_spin_unlock(&ics->lock);
+       local_irq_restore(flags);
 
        if (val & KVM_XICS_PENDING)
                icp_deliver_irq(xics, NULL, irqp->number);
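
The recurring change in this file is replacing the per-ICS mutex with an arch_spinlock_t taken with interrupts disabled, so that the real-mode XICS code can take the same lock. Pulled out as helpers purely for illustration (the patch open-codes the pair at each site):

        /* Illustrative only: the lock/unlock pattern used throughout this file. */
        static inline void ics_lock(struct kvmppc_ics *ics, unsigned long *flags)
        {
                local_irq_save(*flags);         /* don't spin with interrupts on */
                arch_spin_lock(&ics->lock);     /* raw lock, also usable from real mode */
        }

        static inline void ics_unlock(struct kvmppc_ics *ics, unsigned long flags)
        {
                arch_spin_unlock(&ics->lock);
                local_irq_restore(flags);
        }
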
index 73f0f27..56ea44f 100644 (file)
@@ -78,13 +78,22 @@ struct kvmppc_icp {
        u32  rm_reject;
        u32  rm_eoied_irq;
 
+       /* Counters for each reason we exited real mode */
+       unsigned long n_rm_kick_vcpu;
+       unsigned long n_rm_check_resend;
+       unsigned long n_rm_reject;
+       unsigned long n_rm_notify_eoi;
+       /* Counters for handling ICP processing in real mode */
+       unsigned long n_check_resend;
+       unsigned long n_reject;
+
        /* Debug stuff for real mode */
        union kvmppc_icp_state rm_dbgstate;
        struct kvm_vcpu *rm_dbgtgt;
 };
 
 struct kvmppc_ics {
-       struct mutex lock;
+       arch_spinlock_t lock;
        u16 icsid;
        struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
 };
@@ -96,6 +105,8 @@ struct kvmppc_xics {
        u32 max_icsid;
        bool real_mode;
        bool real_mode_dbg;
+       u32 err_noics;
+       u32 err_noicp;
        struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
 };
 
index 24bfe40..55a4763 100644 (file)
@@ -529,6 +529,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_PPC_RMA:
                r = 0;
                break;
+       case KVM_CAP_PPC_HWRNG:
+               r = kvmppc_hwrng_present();
+               break;
 #endif
        case KVM_CAP_SYNC_MMU:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
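
With the hunk above, userspace can ask whether the kernel's H_RANDOM handler is backed by a hardware RNG before enabling it for a guest. A minimal sketch of the check (error handling omitted):

        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        /* Returns non-zero if KVM_CAP_PPC_HWRNG is available on this host. */
        static int ppc_hwrng_available(void)
        {
                int kvm_fd = open("/dev/kvm", O_RDWR);
                int ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_HWRNG);

                close(kvm_fd);
                return ret > 0;
        }
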
index 170a034..f7deebd 100644 (file)
@@ -41,6 +41,7 @@ void __spin_yield(arch_spinlock_t *lock)
        plpar_hcall_norets(H_CONFER,
                get_hard_smp_processor_id(holder_cpu), yield_count);
 }
+EXPORT_SYMBOL_GPL(__spin_yield);
 
 /*
  * Waiting for a read lock or a write lock on a rwlock...
index 80db439..6eb808f 100644 (file)
 
 struct powernv_rng {
        void __iomem *regs;
+       void __iomem *regs_real;
        unsigned long mask;
 };
 
 static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
 
 
+int powernv_hwrng_present(void)
+{
+       struct powernv_rng *rng;
+
+       rng = get_cpu_var(powernv_rng);
+       put_cpu_var(rng);
+       return rng != NULL;
+}
+
 static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
 {
        unsigned long parity;
@@ -46,6 +56,17 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
        return val;
 }
 
+int powernv_get_random_real_mode(unsigned long *v)
+{
+       struct powernv_rng *rng;
+
+       rng = raw_cpu_read(powernv_rng);
+
+       *v = rng_whiten(rng, in_rm64(rng->regs_real));
+
+       return 1;
+}
+
 int powernv_get_random_long(unsigned long *v)
 {
        struct powernv_rng *rng;
@@ -80,12 +101,20 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng,
 static __init int rng_create(struct device_node *dn)
 {
        struct powernv_rng *rng;
+       struct resource res;
        unsigned long val;
 
        rng = kzalloc(sizeof(*rng), GFP_KERNEL);
        if (!rng)
                return -ENOMEM;
 
+       if (of_address_to_resource(dn, 0, &res)) {
+               kfree(rng);
+               return -ENXIO;
+       }
+
+       rng->regs_real = (void __iomem *)res.start;
+
        rng->regs = of_iomap(dn, 0);
        if (!rng->regs) {
                kfree(rng);
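
powernv_get_random_real_mode() reads the RNG through regs_real, the raw physical address taken from the device tree, because the ioremap'd rng->regs mapping is unusable with the MMU off. in_rm64() is not shown in this hunk; presumably it is a cache-inhibited doubleword load in the same style as the rm_writeb() removed from the XICS code earlier in this series (an assumption, not the actual helper):

        /* Assumed shape of the real-mode MMIO load used above. */
        static unsigned long in_rm64(void __iomem *paddr)
        {
                unsigned long val;

                /* ldcix: cache-inhibited indexed load, safe in real mode */
                __asm__ __volatile__("sync; ldcix %0,0,%1"
                                     : "=r" (val) : "r" (paddr) : "memory");
                return val;
        }
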
index afa2bd7..8cd8e7b 100644 (file)
@@ -110,7 +110,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 /* upper facilities limit for kvm */
 unsigned long kvm_s390_fac_list_mask[] = {
        0xffe6fffbfcfdfc40UL,
-       0x205c800000000000UL,
+       0x005c800000000000UL,
 };
 
 unsigned long kvm_s390_fac_list_mask_size(void)
index d67206a..629af0f 100644 (file)
@@ -683,8 +683,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
        unsigned long bitmap = 1;
        struct kvm_lapic **dst;
        int i;
-       bool ret = false;
-       bool x2apic_ipi = src && apic_x2apic_mode(src);
+       bool ret, x2apic_ipi;
 
        *r = -1;
 
@@ -696,16 +695,18 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
        if (irq->shorthand)
                return false;
 
+       x2apic_ipi = src && apic_x2apic_mode(src);
        if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
                return false;
 
+       ret = true;
        rcu_read_lock();
        map = rcu_dereference(kvm->arch.apic_map);
 
-       if (!map)
+       if (!map) {
+               ret = false;
                goto out;
-
-       ret = true;
+       }
 
        if (irq->dest_mode == APIC_DEST_PHYSICAL) {
                if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
index 146f295..d43867c 100644 (file)
@@ -4481,9 +4481,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
                pfn = spte_to_pfn(*sptep);
 
                /*
-                * Only EPT supported for now; otherwise, one would need to
-                * find out efficiently whether the guest page tables are
-                * also using huge pages.
+                * We cannot do huge page mapping for indirect shadow pages,
+                * which are found on the last rmap (level = 1) when not using
+                * tdp; such shadow pages are synced with the page table in
+                * the guest, and the guest page table is using 4K page size
+                * mapping if the indirect sp has level = 1.
                 */
                if (sp->role.direct &&
                        !kvm_is_reserved_pfn(pfn) &&
@@ -4504,19 +4506,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
        bool flush = false;
        unsigned long *rmapp;
        unsigned long last_index, index;
-       gfn_t gfn_start, gfn_end;
 
        spin_lock(&kvm->mmu_lock);
 
-       gfn_start = memslot->base_gfn;
-       gfn_end = memslot->base_gfn + memslot->npages - 1;
-
-       if (gfn_start >= gfn_end)
-               goto out;
-
        rmapp = memslot->arch.rmap[0];
-       last_index = gfn_to_index(gfn_end, memslot->base_gfn,
-                                       PT_PAGE_TABLE_LEVEL);
+       last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1,
+                               memslot->base_gfn, PT_PAGE_TABLE_LEVEL);
 
        for (index = 0; index <= last_index; ++index, ++rmapp) {
                if (*rmapp)
@@ -4534,7 +4529,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
        if (flush)
                kvm_flush_remote_tlbs(kvm);
 
-out:
        spin_unlock(&kvm->mmu_lock);
 }
 
index f5e8dce..f7b6168 100644 (file)
@@ -3622,8 +3622,16 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-       unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
-                   KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+       /*
+        * Pass through host's Machine Check Enable value to hw_cr4, which
+        * is in force while we are in guest mode.  Do not let guests control
+        * this bit, even if host CR4.MCE == 0.
+        */
+       unsigned long hw_cr4 =
+               (cr4_read_shadow() & X86_CR4_MCE) |
+               (cr4 & ~X86_CR4_MCE) |
+               (to_vmx(vcpu)->rmode.vm86_active ?
+                KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
        if (cr4 & X86_CR4_VMXE) {
                /*
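
The comment above is the key point: the MCE bit of the CR4 value loaded into
hardware follows the host, while everything else follows what the guest
wrote. A small standalone illustration of the bit arithmetic, using made-up
CR4 values and a placeholder for the KVM_*MODE_VM_CR4_ALWAYS_ON constants:

    #include <stdio.h>

    #define X86_CR4_MCE     (1UL << 6)
    #define ALWAYS_ON_BITS  0x2000UL        /* stand-in for the ALWAYS_ON mask */

    int main(void)
    {
            unsigned long host_cr4  = 0x3406f0UL;   /* host has CR4.MCE set  */
            unsigned long guest_cr4 = 0x002620UL;   /* guest cleared CR4.MCE */
            unsigned long hw_cr4;

            hw_cr4 = (host_cr4 & X86_CR4_MCE) |
                     (guest_cr4 & ~X86_CR4_MCE) |
                     ALWAYS_ON_BITS;

            /* hardware keeps MCE enabled even though the guest turned it off */
            printf("hw_cr4 = %#lx, MCE %s\n", hw_cr4,
                   (hw_cr4 & X86_CR4_MCE) ? "set" : "clear");
            return 0;
    }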
index e1a8126..ed31c31 100644 (file)
@@ -5799,7 +5799,6 @@ int kvm_arch_init(void *opaque)
        kvm_set_mmio_spte_mask();
 
        kvm_x86_ops = ops;
-       kvm_init_msr_list();
 
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
@@ -7253,7 +7252,14 @@ void kvm_arch_hardware_disable(void)
 
 int kvm_arch_hardware_setup(void)
 {
-       return kvm_x86_ops->hardware_setup();
+       int r;
+
+       r = kvm_x86_ops->hardware_setup();
+       if (r != 0)
+               return r;
+
+       kvm_init_msr_list();
+       return 0;
 }
 
 void kvm_arch_hardware_unsetup(void)
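
Deferring kvm_init_msr_list() until ->hardware_setup() has succeeded means
the saved-MSR list is only built once hardware support is known, and is
skipped entirely when setup fails. Userspace sees the result through
KVM_GET_MSR_INDEX_LIST; a hedged sketch of reading it, with error handling
trimmed:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            struct kvm_msr_list probe = { .nmsrs = 0 };
            struct kvm_msr_list *list;
            unsigned int i;

            if (kvm < 0)
                    return 1;

            /* first call fails with E2BIG but reports the required count */
            ioctl(kvm, KVM_GET_MSR_INDEX_LIST, &probe);

            list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
            list->nmsrs = probe.nmsrs;
            if (ioctl(kvm, KVM_GET_MSR_INDEX_LIST, list) == 0)
                    for (i = 0; i < list->nmsrs; i++)
                            printf("MSR %#x\n", list->indices[i]);
            return 0;
    }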
index f574d7b..4b60056 100644 (file)
@@ -813,6 +813,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_MIPS_MSA 112
 #define KVM_CAP_S390_INJECT_IRQ 113
 #define KVM_CAP_S390_IRQ_STATE 114
+#define KVM_CAP_PPC_HWRNG 115
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index d3fc939..9097741 100644 (file)
@@ -89,6 +89,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
 static __read_mostly struct preempt_ops kvm_preempt_ops;
 
 struct dentry *kvm_debugfs_dir;
+EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
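
Exporting kvm_debugfs_dir lets architecture KVM modules (kvm-hv on powerpc,
for instance) hang their own entries off the existing "kvm" debugfs directory
instead of creating a parallel one. A hedged sketch of a module doing so; the
file name and counter are made up:

    #include <linux/debugfs.h>
    #include <linux/kvm_host.h>
    #include <linux/module.h>

    static u32 example_stat;                /* hypothetical statistic */
    static struct dentry *example_dentry;

    static int __init example_init(void)
    {
            /* place "example_stat" under the existing "kvm" debugfs dir */
            example_dentry = debugfs_create_u32("example_stat", 0444,
                                                kvm_debugfs_dir, &example_stat);
            return example_dentry ? 0 : -ENOMEM;
    }

    static void __exit example_exit(void)
    {
            debugfs_remove(example_dentry);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");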