Merge tag 'kvm-arm-for-4.1-take2' of git://git.kernel.org/pub/scm/linux/kernel/git...
author	Paolo Bonzini <pbonzini@redhat.com>
Wed, 22 Apr 2015 15:08:12 +0000 (17:08 +0200)
committer	Paolo Bonzini <pbonzini@redhat.com>
Wed, 22 Apr 2015 15:08:12 +0000 (17:08 +0200)
KVM/ARM changes for v4.1, take #2:

Rather small this time:

- a fix for a nasty bug with virtual IRQ injection
- a fix for irqfd

30 files changed:
Documentation/virtual/kvm/api.txt
arch/powerpc/include/asm/archrandom.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/time.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/time.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/kvm/book3s_hv_rm_xics.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_pr_papr.c
arch/powerpc/kvm/book3s_xics.c
arch/powerpc/kvm/book3s_xics.h
arch/powerpc/kvm/powerpc.c
arch/powerpc/lib/locks.c
arch/powerpc/platforms/powernv/rng.c
arch/s390/kvm/kvm-s390.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
include/uapi/linux/kvm.h
virt/kvm/kvm_main.c

Documentation/virtual/kvm/api.txt
index bc9f6fe..9fa2bf8 100644
@@ -3573,3 +3573,20 @@ struct {
 @ar   - access register number
 
 KVM handlers should exit to userspace with rc = -EREMOTE.
+
+
+8. Other capabilities.
+----------------------
+
+This section lists capabilities that give information about other
+features of the KVM implementation.
+
+8.1 KVM_CAP_PPC_HWRNG
+
+Architectures: ppc
+
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that the kernel has an implementation of the
+H_RANDOM hypercall backed by a hardware random-number generator.
+If present, the kernel H_RANDOM handler can be enabled for guest use
+with the KVM_CAP_PPC_ENABLE_HCALL capability.
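
For illustration only (not part of the patch): a minimal userspace sketch that checks this capability on a VM file descriptor and then enables the in-kernel H_RANDOM handler as described above. It assumes headers carrying the KVM_CAP_PPC_HWRNG definition added by this series; the H_RANDOM hypercall number (0x300) is taken from PAPR (asm/hvcall.h).

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int enable_h_random(int vm_fd)
    {
            struct kvm_enable_cap cap;

            /* KVM_CHECK_EXTENSION returns a positive value if supported */
            if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_HWRNG) <= 0)
                    return -1;              /* no hardware RNG backing the hcall */

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_PPC_ENABLE_HCALL;
            cap.args[0] = 0x300;            /* H_RANDOM (assumed from hvcall.h) */
            cap.args[1] = 1;                /* 1 = enable, 0 = disable */
            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }
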
arch/powerpc/include/asm/archrandom.h
index bde5311..0cc6eed 100644
@@ -30,8 +30,6 @@ static inline int arch_has_random(void)
        return !!ppc_md.get_random_long;
 }
 
-int powernv_get_random_long(unsigned long *v);
-
 static inline int arch_get_random_seed_long(unsigned long *v)
 {
        return 0;
@@ -47,4 +45,13 @@ static inline int arch_has_random_seed(void)
 
 #endif /* CONFIG_ARCH_RANDOM */
 
+#ifdef CONFIG_PPC_POWERNV
+int powernv_hwrng_present(void);
+int powernv_get_random_long(unsigned long *v);
+int powernv_get_random_real_mode(unsigned long *v);
+#else
+static inline int powernv_hwrng_present(void) { return 0; }
+static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; }
+#endif
+
 #endif /* _ASM_POWERPC_ARCHRANDOM_H */
arch/powerpc/include/asm/kvm_book3s.h
index 942c7b1..578e550 100644
@@ -292,6 +292,9 @@ static inline bool kvmppc_supports_magic_page(struct kvm_vcpu *vcpu)
        return !is_kvmppc_hv_enabled(vcpu->kvm);
 }
 
+extern int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu);
+extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3                        0x113724FA
arch/powerpc/include/asm/kvm_book3s_64.h
index 2d81e20..2b84e48 100644
@@ -85,6 +85,20 @@ static inline long try_lock_hpte(__be64 *hpte, unsigned long bits)
        return old == 0;
 }
 
+static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
+{
+       hpte_v &= ~HPTE_V_HVLOCK;
+       asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
+       hpte[0] = cpu_to_be64(hpte_v);
+}
+
+/* Without barrier */
+static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
+{
+       hpte_v &= ~HPTE_V_HVLOCK;
+       hpte[0] = cpu_to_be64(hpte_v);
+}
+
 static inline int __hpte_actual_psize(unsigned int lp, int psize)
 {
        int i, shift;
@@ -422,6 +436,10 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
        return rcu_dereference_raw_notrace(kvm->memslots);
 }
 
+extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
+
+extern void kvmhv_rm_send_ipi(int cpu);
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
arch/powerpc/include/asm/kvm_host.h
index 8ef0512..d67a838 100644
@@ -227,10 +227,8 @@ struct kvm_arch {
        unsigned long host_sdr1;
        int tlbie_lock;
        unsigned long lpcr;
-       unsigned long rmor;
-       struct kvm_rma_info *rma;
        unsigned long vrma_slb_v;
-       int rma_setup_done;
+       int hpte_setup_done;
        u32 hpt_order;
        atomic_t vcpus_running;
        u32 online_vcores;
@@ -239,6 +237,8 @@ struct kvm_arch {
        atomic_t hpte_mod_interest;
        cpumask_t need_tlb_flush;
        int hpt_cma_alloc;
+       struct dentry *debugfs_dir;
+       struct dentry *htab_dentry;
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
        struct mutex hpt_mutex;
@@ -263,18 +263,15 @@ struct kvm_arch {
 
 /*
  * Struct for a virtual core.
- * Note: entry_exit_count combines an entry count in the bottom 8 bits
- * and an exit count in the next 8 bits.  This is so that we can
- * atomically increment the entry count iff the exit count is 0
- * without taking the lock.
+ * Note: entry_exit_map combines a bitmap of threads that have entered
+ * in the bottom 8 bits and a bitmap of threads that have exited in the
+ * next 8 bits.  This is so that we can atomically set the entry bit
+ * iff the exit map is 0 without taking a lock.
  */
 struct kvmppc_vcore {
        int n_runnable;
-       int n_busy;
        int num_threads;
-       int entry_exit_count;
-       int n_woken;
-       int nap_count;
+       int entry_exit_map;
        int napping_threads;
        int first_vcpuid;
        u16 pcpu;
@@ -299,13 +296,14 @@ struct kvmppc_vcore {
        ulong conferring_threads;
 };
 
-#define VCORE_ENTRY_COUNT(vc)  ((vc)->entry_exit_count & 0xff)
-#define VCORE_EXIT_COUNT(vc)   ((vc)->entry_exit_count >> 8)
+#define VCORE_ENTRY_MAP(vc)    ((vc)->entry_exit_map & 0xff)
+#define VCORE_EXIT_MAP(vc)     ((vc)->entry_exit_map >> 8)
+#define VCORE_IS_EXITING(vc)   (VCORE_EXIT_MAP(vc) != 0)
 
 /* Values for vcore_state */
 #define VCORE_INACTIVE 0
 #define VCORE_SLEEPING 1
-#define VCORE_STARTING 2
+#define VCORE_PREEMPT  2
 #define VCORE_RUNNING  3
 #define VCORE_EXITING  4
 
@@ -368,6 +366,14 @@ struct kvmppc_slb {
        u8 base_page_size;      /* MMU_PAGE_xxx */
 };
 
+/* Struct used to accumulate timing information in HV real mode code */
+struct kvmhv_tb_accumulator {
+       u64     seqcount;       /* used to synchronize access, also count * 2 */
+       u64     tb_total;       /* total time in timebase ticks */
+       u64     tb_min;         /* min time */
+       u64     tb_max;         /* max time */
+};
+
 # ifdef CONFIG_PPC_FSL_BOOK3E
 #define KVMPPC_BOOKE_IAC_NUM   2
 #define KVMPPC_BOOKE_DAC_NUM   2
@@ -656,6 +662,19 @@ struct kvm_vcpu_arch {
 
        u32 emul_inst;
 #endif
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       struct kvmhv_tb_accumulator *cur_activity;      /* What we're timing */
+       u64     cur_tb_start;                   /* when it started */
+       struct kvmhv_tb_accumulator rm_entry;   /* real-mode entry code */
+       struct kvmhv_tb_accumulator rm_intr;    /* real-mode intr handling */
+       struct kvmhv_tb_accumulator rm_exit;    /* real-mode exit code */
+       struct kvmhv_tb_accumulator guest_time; /* guest execution */
+       struct kvmhv_tb_accumulator cede_time;  /* time napping inside guest */
+
+       struct dentry *debugfs_dir;
+       struct dentry *debugfs_timings;
+#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
 };
 
 #define VCPU_FPR(vcpu, i)      (vcpu)->arch.fp.fpr[i][TS_FPROFFSET]
arch/powerpc/include/asm/kvm_ppc.h
index 46bf652..b8475da 100644
@@ -302,6 +302,8 @@ static inline bool is_kvmppc_hv_enabled(struct kvm *kvm)
        return kvm->arch.kvm_ops == kvmppc_hv_ops;
 }
 
+extern int kvmppc_hwrng_present(void);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
arch/powerpc/include/asm/time.h
index 03cbada..10fc784 100644
@@ -211,5 +211,8 @@ extern void secondary_cpu_time_init(void);
 
 DECLARE_PER_CPU(u64, decrementers_next_tb);
 
+/* Convert timebase ticks to nanoseconds */
+unsigned long long tb_to_ns(unsigned long long tb_ticks);
+
 #endif /* __KERNEL__ */
 #endif /* __POWERPC_TIME_H */
arch/powerpc/kernel/asm-offsets.c
index 4717859..0034b6b 100644
@@ -37,6 +37,7 @@
 #include <asm/thread_info.h>
 #include <asm/rtas.h>
 #include <asm/vdso_datapage.h>
+#include <asm/dbell.h>
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
 #include <asm/lppaca.h>
@@ -459,6 +460,19 @@ int main(void)
        DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
        DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
 #endif
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       DEFINE(VCPU_TB_RMENTRY, offsetof(struct kvm_vcpu, arch.rm_entry));
+       DEFINE(VCPU_TB_RMINTR, offsetof(struct kvm_vcpu, arch.rm_intr));
+       DEFINE(VCPU_TB_RMEXIT, offsetof(struct kvm_vcpu, arch.rm_exit));
+       DEFINE(VCPU_TB_GUEST, offsetof(struct kvm_vcpu, arch.guest_time));
+       DEFINE(VCPU_TB_CEDE, offsetof(struct kvm_vcpu, arch.cede_time));
+       DEFINE(VCPU_CUR_ACTIVITY, offsetof(struct kvm_vcpu, arch.cur_activity));
+       DEFINE(VCPU_ACTIVITY_START, offsetof(struct kvm_vcpu, arch.cur_tb_start));
+       DEFINE(TAS_SEQCOUNT, offsetof(struct kvmhv_tb_accumulator, seqcount));
+       DEFINE(TAS_TOTAL, offsetof(struct kvmhv_tb_accumulator, tb_total));
+       DEFINE(TAS_MIN, offsetof(struct kvmhv_tb_accumulator, tb_min));
+       DEFINE(TAS_MAX, offsetof(struct kvmhv_tb_accumulator, tb_max));
+#endif
        DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
        DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
        DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
@@ -492,7 +506,6 @@ int main(void)
        DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
        DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls));
        DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
-       DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
        DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
        DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
        DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
@@ -550,8 +563,7 @@ int main(void)
        DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop));
        DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort));
        DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
-       DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
-       DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
+       DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_map));
        DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
        DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
        DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm));
@@ -748,5 +760,7 @@ int main(void)
                        offsetof(struct paca_struct, subcore_sibling_mask));
 #endif
 
+       DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
+
        return 0;
 }
arch/powerpc/kernel/time.c
index 2d7b33f..56f4484 100644
@@ -608,6 +608,12 @@ void arch_suspend_enable_irqs(void)
 }
 #endif
 
+unsigned long long tb_to_ns(unsigned long long ticks)
+{
+       return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift;
+}
+EXPORT_SYMBOL_GPL(tb_to_ns);
+
 /*
  * Scheduler clock - returns current time in nanosec units.
  *
arch/powerpc/kvm/Kconfig
index 11850f3..2963e4d 100644
@@ -110,6 +110,20 @@ config KVM_BOOK3S_64_PR
          processor, including emulating 32-bit processors on a 64-bit
          host.
 
+config KVM_BOOK3S_HV_EXIT_TIMING
+       bool "Detailed timing for hypervisor real-mode code"
+       depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS
+       ---help---
+         Calculate time taken for each vcpu in the real-mode guest entry,
+         exit, and interrupt handling code, plus time spent in the guest
+         and in nap mode due to idle (cede) while other threads are still
+         in the guest.  The total, minimum and maximum times in nanoseconds
+         together with the number of executions are reported in debugfs in
+         kvm/vm#/vcpu#/timings.  The overhead is of the order of 30 - 40
+         ns per exit on POWER8.
+
+         If unsure, say N.
+
 config KVM_BOOKE_HV
        bool
 
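
For illustration only (not part of this merge): each line of the kvm/vm#/vcpu#/timings file described in the help text above follows the "%s: %llu %llu %llu %llu" format emitted by debugfs_timings_read() further down in this series, that is, the timer name, the number of occurrences, then the total, minimum and maximum times in nanoseconds. A hypothetical userspace helper that parses one such line could look like:

    #include <stdio.h>

    /*
     * Hypothetical parser for one line of kvm/vm#/vcpu#/timings, e.g.
     * "rm_entry: <count> <total_ns> <min_ns> <max_ns>".  The field order
     * mirrors the format string used by debugfs_timings_read().
     * 'name' must point to a buffer of at least 16 bytes.
     */
    static int parse_timing_line(const char *line, char *name,
                                 unsigned long long *count,
                                 unsigned long long *total_ns,
                                 unsigned long long *min_ns,
                                 unsigned long long *max_ns)
    {
            return sscanf(line, "%15[^:]: %llu %llu %llu %llu",
                          name, count, total_ns, min_ns, max_ns) == 5 ? 0 : -1;
    }
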
arch/powerpc/kvm/book3s.c
index cfbcdc6..453a8a4 100644
@@ -821,6 +821,82 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 #endif
 }
 
+int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+       unsigned long size = kvmppc_get_gpr(vcpu, 4);
+       unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+       u64 buf;
+       int ret;
+
+       if (!is_power_of_2(size) || (size > sizeof(buf)))
+               return H_TOO_HARD;
+
+       ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+       if (ret != 0)
+               return H_TOO_HARD;
+
+       switch (size) {
+       case 1:
+               kvmppc_set_gpr(vcpu, 4, *(u8 *)&buf);
+               break;
+
+       case 2:
+               kvmppc_set_gpr(vcpu, 4, be16_to_cpu(*(__be16 *)&buf));
+               break;
+
+       case 4:
+               kvmppc_set_gpr(vcpu, 4, be32_to_cpu(*(__be32 *)&buf));
+               break;
+
+       case 8:
+               kvmppc_set_gpr(vcpu, 4, be64_to_cpu(*(__be64 *)&buf));
+               break;
+
+       default:
+               BUG();
+       }
+
+       return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_load);
+
+int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+       unsigned long size = kvmppc_get_gpr(vcpu, 4);
+       unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+       unsigned long val = kvmppc_get_gpr(vcpu, 6);
+       u64 buf;
+       int ret;
+
+       switch (size) {
+       case 1:
+               *(u8 *)&buf = val;
+               break;
+
+       case 2:
+               *(__be16 *)&buf = cpu_to_be16(val);
+               break;
+
+       case 4:
+               *(__be32 *)&buf = cpu_to_be32(val);
+               break;
+
+       case 8:
+               *(__be64 *)&buf = cpu_to_be64(val);
+               break;
+
+       default:
+               return H_TOO_HARD;
+       }
+
+       ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+       if (ret != 0)
+               return H_TOO_HARD;
+
+       return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store);
+
 int kvmppc_core_check_processor_compat(void)
 {
        /*
arch/powerpc/kvm/book3s_64_mmu_hv.c
index 534acb3..d6fe308 100644
@@ -27,6 +27,7 @@
 #include <linux/srcu.h>
 #include <linux/anon_inodes.h>
 #include <linux/file.h>
+#include <linux/debugfs.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -116,12 +117,12 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
        long order;
 
        mutex_lock(&kvm->lock);
-       if (kvm->arch.rma_setup_done) {
-               kvm->arch.rma_setup_done = 0;
-               /* order rma_setup_done vs. vcpus_running */
+       if (kvm->arch.hpte_setup_done) {
+               kvm->arch.hpte_setup_done = 0;
+               /* order hpte_setup_done vs. vcpus_running */
                smp_mb();
                if (atomic_read(&kvm->arch.vcpus_running)) {
-                       kvm->arch.rma_setup_done = 1;
+                       kvm->arch.hpte_setup_done = 1;
                        goto out;
                }
        }
@@ -338,9 +339,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
        v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
        gr = kvm->arch.revmap[index].guest_rpte;
 
-       /* Unlock the HPTE */
-       asm volatile("lwsync" : : : "memory");
-       hptep[0] = cpu_to_be64(v);
+       unlock_hpte(hptep, v);
        preempt_enable();
 
        gpte->eaddr = eaddr;
@@ -469,8 +468,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
        hpte[1] = be64_to_cpu(hptep[1]);
        hpte[2] = r = rev->guest_rpte;
-       asm volatile("lwsync" : : : "memory");
-       hptep[0] = cpu_to_be64(hpte[0]);
+       unlock_hpte(hptep, hpte[0]);
        preempt_enable();
 
        if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
@@ -621,7 +619,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
        hptep[1] = cpu_to_be64(r);
        eieio();
-       hptep[0] = cpu_to_be64(hpte[0]);
+       __unlock_hpte(hptep, hpte[0]);
        asm volatile("ptesync" : : : "memory");
        preempt_enable();
        if (page && hpte_is_writable(r))
@@ -642,7 +640,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        return ret;
 
  out_unlock:
-       hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+       __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
        preempt_enable();
        goto out_put;
 }
@@ -771,7 +769,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
                        }
                }
                unlock_rmap(rmapp);
-               hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+               __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
        }
        return 0;
 }
@@ -857,7 +855,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
                        }
                        ret = 1;
                }
-               hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+               __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
        } while ((i = j) != head);
 
        unlock_rmap(rmapp);
@@ -974,8 +972,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 
                /* Now check and modify the HPTE */
                if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
-                       /* unlock and continue */
-                       hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+                       __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
                        continue;
                }
 
@@ -996,9 +993,9 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
                                npages_dirty = n;
                        eieio();
                }
-               v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
+               v &= ~HPTE_V_ABSENT;
                v |= HPTE_V_VALID;
-               hptep[0] = cpu_to_be64(v);
+               __unlock_hpte(hptep, v);
        } while ((i = j) != head);
 
        unlock_rmap(rmapp);
@@ -1218,8 +1215,7 @@ static long record_hpte(unsigned long flags, __be64 *hptp,
                        r &= ~HPTE_GR_MODIFIED;
                        revp->guest_rpte = r;
                }
-               asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
-               hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+               unlock_hpte(hptp, be64_to_cpu(hptp[0]));
                preempt_enable();
                if (!(valid == want_valid && (first_pass || dirty)))
                        ok = 0;
@@ -1339,20 +1335,20 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
        unsigned long tmp[2];
        ssize_t nb;
        long int err, ret;
-       int rma_setup;
+       int hpte_setup;
 
        if (!access_ok(VERIFY_READ, buf, count))
                return -EFAULT;
 
        /* lock out vcpus from running while we're doing this */
        mutex_lock(&kvm->lock);
-       rma_setup = kvm->arch.rma_setup_done;
-       if (rma_setup) {
-               kvm->arch.rma_setup_done = 0;   /* temporarily */
-               /* order rma_setup_done vs. vcpus_running */
+       hpte_setup = kvm->arch.hpte_setup_done;
+       if (hpte_setup) {
+               kvm->arch.hpte_setup_done = 0;  /* temporarily */
+               /* order hpte_setup_done vs. vcpus_running */
                smp_mb();
                if (atomic_read(&kvm->arch.vcpus_running)) {
-                       kvm->arch.rma_setup_done = 1;
+                       kvm->arch.hpte_setup_done = 1;
                        mutex_unlock(&kvm->lock);
                        return -EBUSY;
                }
@@ -1405,7 +1401,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
                                       "r=%lx\n", ret, i, v, r);
                                goto out;
                        }
-                       if (!rma_setup && is_vrma_hpte(v)) {
+                       if (!hpte_setup && is_vrma_hpte(v)) {
                                unsigned long psize = hpte_base_page_size(v, r);
                                unsigned long senc = slb_pgsize_encoding(psize);
                                unsigned long lpcr;
@@ -1414,7 +1410,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
                                        (VRMA_VSID << SLB_VSID_SHIFT_1T);
                                lpcr = senc << (LPCR_VRMASD_SH - 4);
                                kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
-                               rma_setup = 1;
+                               hpte_setup = 1;
                        }
                        ++i;
                        hptp += 2;
@@ -1430,9 +1426,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
        }
 
  out:
-       /* Order HPTE updates vs. rma_setup_done */
+       /* Order HPTE updates vs. hpte_setup_done */
        smp_wmb();
-       kvm->arch.rma_setup_done = rma_setup;
+       kvm->arch.hpte_setup_done = hpte_setup;
        mutex_unlock(&kvm->lock);
 
        if (err)
@@ -1495,6 +1491,141 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
        return ret;
 }
 
+struct debugfs_htab_state {
+       struct kvm      *kvm;
+       struct mutex    mutex;
+       unsigned long   hpt_index;
+       int             chars_left;
+       int             buf_index;
+       char            buf[64];
+};
+
+static int debugfs_htab_open(struct inode *inode, struct file *file)
+{
+       struct kvm *kvm = inode->i_private;
+       struct debugfs_htab_state *p;
+
+       p = kzalloc(sizeof(*p), GFP_KERNEL);
+       if (!p)
+               return -ENOMEM;
+
+       kvm_get_kvm(kvm);
+       p->kvm = kvm;
+       mutex_init(&p->mutex);
+       file->private_data = p;
+
+       return nonseekable_open(inode, file);
+}
+
+static int debugfs_htab_release(struct inode *inode, struct file *file)
+{
+       struct debugfs_htab_state *p = file->private_data;
+
+       kvm_put_kvm(p->kvm);
+       kfree(p);
+       return 0;
+}
+
+static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
+                                size_t len, loff_t *ppos)
+{
+       struct debugfs_htab_state *p = file->private_data;
+       ssize_t ret, r;
+       unsigned long i, n;
+       unsigned long v, hr, gr;
+       struct kvm *kvm;
+       __be64 *hptp;
+
+       ret = mutex_lock_interruptible(&p->mutex);
+       if (ret)
+               return ret;
+
+       if (p->chars_left) {
+               n = p->chars_left;
+               if (n > len)
+                       n = len;
+               r = copy_to_user(buf, p->buf + p->buf_index, n);
+               n -= r;
+               p->chars_left -= n;
+               p->buf_index += n;
+               buf += n;
+               len -= n;
+               ret = n;
+               if (r) {
+                       if (!n)
+                               ret = -EFAULT;
+                       goto out;
+               }
+       }
+
+       kvm = p->kvm;
+       i = p->hpt_index;
+       hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+       for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
+               if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
+                       continue;
+
+               /* lock the HPTE so it's stable and read it */
+               preempt_disable();
+               while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+                       cpu_relax();
+               v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
+               hr = be64_to_cpu(hptp[1]);
+               gr = kvm->arch.revmap[i].guest_rpte;
+               unlock_hpte(hptp, v);
+               preempt_enable();
+
+               if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+                       continue;
+
+               n = scnprintf(p->buf, sizeof(p->buf),
+                             "%6lx %.16lx %.16lx %.16lx\n",
+                             i, v, hr, gr);
+               p->chars_left = n;
+               if (n > len)
+                       n = len;
+               r = copy_to_user(buf, p->buf, n);
+               n -= r;
+               p->chars_left -= n;
+               p->buf_index = n;
+               buf += n;
+               len -= n;
+               ret += n;
+               if (r) {
+                       if (!ret)
+                               ret = -EFAULT;
+                       goto out;
+               }
+       }
+       p->hpt_index = i;
+
+ out:
+       mutex_unlock(&p->mutex);
+       return ret;
+}
+
+ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
+                          size_t len, loff_t *ppos)
+{
+       return -EACCES;
+}
+
+static const struct file_operations debugfs_htab_fops = {
+       .owner   = THIS_MODULE,
+       .open    = debugfs_htab_open,
+       .release = debugfs_htab_release,
+       .read    = debugfs_htab_read,
+       .write   = debugfs_htab_write,
+       .llseek  = generic_file_llseek,
+};
+
+void kvmppc_mmu_debugfs_init(struct kvm *kvm)
+{
+       kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
+                                                   kvm->arch.debugfs_dir, kvm,
+                                                   &debugfs_htab_fops);
+}
+
 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
arch/powerpc/kvm/book3s_hv.c
index de74756..48d3c5d 100644
@@ -32,6 +32,7 @@
 #include <linux/page-flags.h>
 #include <linux/srcu.h>
 #include <linux/miscdevice.h>
+#include <linux/debugfs.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -50,6 +51,7 @@
 #include <asm/hvcall.h>
 #include <asm/switch_to.h>
 #include <asm/smp.h>
+#include <asm/dbell.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -83,9 +85,35 @@ static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
+static bool kvmppc_ipi_thread(int cpu)
+{
+       /* On POWER8 for IPIs to threads in the same core, use msgsnd */
+       if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+               preempt_disable();
+               if (cpu_first_thread_sibling(cpu) ==
+                   cpu_first_thread_sibling(smp_processor_id())) {
+                       unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+                       msg |= cpu_thread_in_core(cpu);
+                       smp_mb();
+                       __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+                       preempt_enable();
+                       return true;
+               }
+               preempt_enable();
+       }
+
+#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
+       if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) {
+               xics_wake_cpu(cpu);
+               return true;
+       }
+#endif
+
+       return false;
+}
+
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
-       int me;
        int cpu = vcpu->cpu;
        wait_queue_head_t *wqp;
 
@@ -95,20 +123,12 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
                ++vcpu->stat.halt_wakeup;
        }
 
-       me = get_cpu();
+       if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid))
+               return;
 
        /* CPU points to the first thread of the core */
-       if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
-#ifdef CONFIG_PPC_ICP_NATIVE
-               int real_cpu = cpu + vcpu->arch.ptid;
-               if (paca[real_cpu].kvm_hstate.xics_phys)
-                       xics_wake_cpu(real_cpu);
-               else
-#endif
-               if (cpu_online(cpu))
-                       smp_send_reschedule(cpu);
-       }
-       put_cpu();
+       if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
+               smp_send_reschedule(cpu);
 }
 
 /*
@@ -706,6 +726,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
                /* Send the error out to userspace via KVM_RUN */
                return rc;
+       case H_LOGICAL_CI_LOAD:
+               ret = kvmppc_h_logical_ci_load(vcpu);
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_LOGICAL_CI_STORE:
+               ret = kvmppc_h_logical_ci_store(vcpu);
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
        case H_SET_MODE:
                ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
                                        kvmppc_get_gpr(vcpu, 5),
@@ -740,6 +770,8 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
        case H_CONFER:
        case H_REGISTER_VPA:
        case H_SET_MODE:
+       case H_LOGICAL_CI_LOAD:
+       case H_LOGICAL_CI_STORE:
 #ifdef CONFIG_KVM_XICS
        case H_XIRR:
        case H_CPPR:
@@ -1410,6 +1442,154 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
        return vcore;
 }
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+static struct debugfs_timings_element {
+       const char *name;
+       size_t offset;
+} timings[] = {
+       {"rm_entry",    offsetof(struct kvm_vcpu, arch.rm_entry)},
+       {"rm_intr",     offsetof(struct kvm_vcpu, arch.rm_intr)},
+       {"rm_exit",     offsetof(struct kvm_vcpu, arch.rm_exit)},
+       {"guest",       offsetof(struct kvm_vcpu, arch.guest_time)},
+       {"cede",        offsetof(struct kvm_vcpu, arch.cede_time)},
+};
+
+#define N_TIMINGS      (sizeof(timings) / sizeof(timings[0]))
+
+struct debugfs_timings_state {
+       struct kvm_vcpu *vcpu;
+       unsigned int    buflen;
+       char            buf[N_TIMINGS * 100];
+};
+
+static int debugfs_timings_open(struct inode *inode, struct file *file)
+{
+       struct kvm_vcpu *vcpu = inode->i_private;
+       struct debugfs_timings_state *p;
+
+       p = kzalloc(sizeof(*p), GFP_KERNEL);
+       if (!p)
+               return -ENOMEM;
+
+       kvm_get_kvm(vcpu->kvm);
+       p->vcpu = vcpu;
+       file->private_data = p;
+
+       return nonseekable_open(inode, file);
+}
+
+static int debugfs_timings_release(struct inode *inode, struct file *file)
+{
+       struct debugfs_timings_state *p = file->private_data;
+
+       kvm_put_kvm(p->vcpu->kvm);
+       kfree(p);
+       return 0;
+}
+
+static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
+                                   size_t len, loff_t *ppos)
+{
+       struct debugfs_timings_state *p = file->private_data;
+       struct kvm_vcpu *vcpu = p->vcpu;
+       char *s, *buf_end;
+       struct kvmhv_tb_accumulator tb;
+       u64 count;
+       loff_t pos;
+       ssize_t n;
+       int i, loops;
+       bool ok;
+
+       if (!p->buflen) {
+               s = p->buf;
+               buf_end = s + sizeof(p->buf);
+               for (i = 0; i < N_TIMINGS; ++i) {
+                       struct kvmhv_tb_accumulator *acc;
+
+                       acc = (struct kvmhv_tb_accumulator *)
+                               ((unsigned long)vcpu + timings[i].offset);
+                       ok = false;
+                       for (loops = 0; loops < 1000; ++loops) {
+                               count = acc->seqcount;
+                               if (!(count & 1)) {
+                                       smp_rmb();
+                                       tb = *acc;
+                                       smp_rmb();
+                                       if (count == acc->seqcount) {
+                                               ok = true;
+                                               break;
+                                       }
+                               }
+                               udelay(1);
+                       }
+                       if (!ok)
+                               snprintf(s, buf_end - s, "%s: stuck\n",
+                                       timings[i].name);
+                       else
+                               snprintf(s, buf_end - s,
+                                       "%s: %llu %llu %llu %llu\n",
+                                       timings[i].name, count / 2,
+                                       tb_to_ns(tb.tb_total),
+                                       tb_to_ns(tb.tb_min),
+                                       tb_to_ns(tb.tb_max));
+                       s += strlen(s);
+               }
+               p->buflen = s - p->buf;
+       }
+
+       pos = *ppos;
+       if (pos >= p->buflen)
+               return 0;
+       if (len > p->buflen - pos)
+               len = p->buflen - pos;
+       n = copy_to_user(buf, p->buf + pos, len);
+       if (n) {
+               if (n == len)
+                       return -EFAULT;
+               len -= n;
+       }
+       *ppos = pos + len;
+       return len;
+}
+
+static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
+                                    size_t len, loff_t *ppos)
+{
+       return -EACCES;
+}
+
+static const struct file_operations debugfs_timings_ops = {
+       .owner   = THIS_MODULE,
+       .open    = debugfs_timings_open,
+       .release = debugfs_timings_release,
+       .read    = debugfs_timings_read,
+       .write   = debugfs_timings_write,
+       .llseek  = generic_file_llseek,
+};
+
+/* Create a debugfs directory for the vcpu */
+static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
+{
+       char buf[16];
+       struct kvm *kvm = vcpu->kvm;
+
+       snprintf(buf, sizeof(buf), "vcpu%u", id);
+       if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
+               return;
+       vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
+       if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir))
+               return;
+       vcpu->arch.debugfs_timings =
+               debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir,
+                                   vcpu, &debugfs_timings_ops);
+}
+
+#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
+{
+}
+#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+
 static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
                                                   unsigned int id)
 {
@@ -1479,6 +1659,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
        vcpu->arch.cpu_type = KVM_CPU_3S_64;
        kvmppc_sanity_check(vcpu);
 
+       debugfs_vcpu_init(vcpu, id);
+
        return vcpu;
 
 free_vcpu:
@@ -1566,8 +1748,10 @@ static int kvmppc_grab_hwthread(int cpu)
        tpaca = &paca[cpu];
 
        /* Ensure the thread won't go into the kernel if it wakes */
-       tpaca->kvm_hstate.hwthread_req = 1;
        tpaca->kvm_hstate.kvm_vcpu = NULL;
+       tpaca->kvm_hstate.napping = 0;
+       smp_wmb();
+       tpaca->kvm_hstate.hwthread_req = 1;
 
        /*
         * If the thread is already executing in the kernel (e.g. handling
@@ -1610,35 +1794,41 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
        }
        cpu = vc->pcpu + vcpu->arch.ptid;
        tpaca = &paca[cpu];
-       tpaca->kvm_hstate.kvm_vcpu = vcpu;
        tpaca->kvm_hstate.kvm_vcore = vc;
        tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
        vcpu->cpu = vc->pcpu;
+       /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */
        smp_wmb();
-#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
-       if (cpu != smp_processor_id()) {
-               xics_wake_cpu(cpu);
-               if (vcpu->arch.ptid)
-                       ++vc->n_woken;
-       }
-#endif
+       tpaca->kvm_hstate.kvm_vcpu = vcpu;
+       if (cpu != smp_processor_id())
+               kvmppc_ipi_thread(cpu);
 }
 
-static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
+static void kvmppc_wait_for_nap(void)
 {
-       int i;
+       int cpu = smp_processor_id();
+       int i, loops;
 
-       HMT_low();
-       i = 0;
-       while (vc->nap_count < vc->n_woken) {
-               if (++i >= 1000000) {
-                       pr_err("kvmppc_wait_for_nap timeout %d %d\n",
-                              vc->nap_count, vc->n_woken);
-                       break;
+       for (loops = 0; loops < 1000000; ++loops) {
+               /*
+                * Check if all threads are finished.
+                * We set the vcpu pointer when starting a thread
+                * and the thread clears it when finished, so we look
+                * for any threads that still have a non-NULL vcpu ptr.
+                */
+               for (i = 1; i < threads_per_subcore; ++i)
+                       if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+                               break;
+               if (i == threads_per_subcore) {
+                       HMT_medium();
+                       return;
                }
-               cpu_relax();
+               HMT_low();
        }
        HMT_medium();
+       for (i = 1; i < threads_per_subcore; ++i)
+               if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+                       pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
 }
 
 /*
@@ -1700,63 +1890,103 @@ static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc)
        mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE);
 }
 
+static void prepare_threads(struct kvmppc_vcore *vc)
+{
+       struct kvm_vcpu *vcpu, *vnext;
+
+       list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+                                arch.run_list) {
+               if (signal_pending(vcpu->arch.run_task))
+                       vcpu->arch.ret = -EINTR;
+               else if (vcpu->arch.vpa.update_pending ||
+                        vcpu->arch.slb_shadow.update_pending ||
+                        vcpu->arch.dtl.update_pending)
+                       vcpu->arch.ret = RESUME_GUEST;
+               else
+                       continue;
+               kvmppc_remove_runnable(vc, vcpu);
+               wake_up(&vcpu->arch.cpu_run);
+       }
+}
+
+static void post_guest_process(struct kvmppc_vcore *vc)
+{
+       u64 now;
+       long ret;
+       struct kvm_vcpu *vcpu, *vnext;
+
+       now = get_tb();
+       list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+                                arch.run_list) {
+               /* cancel pending dec exception if dec is positive */
+               if (now < vcpu->arch.dec_expires &&
+                   kvmppc_core_pending_dec(vcpu))
+                       kvmppc_core_dequeue_dec(vcpu);
+
+               trace_kvm_guest_exit(vcpu);
+
+               ret = RESUME_GUEST;
+               if (vcpu->arch.trap)
+                       ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
+                                                   vcpu->arch.run_task);
+
+               vcpu->arch.ret = ret;
+               vcpu->arch.trap = 0;
+
+               if (vcpu->arch.ceded) {
+                       if (!is_kvmppc_resume_guest(ret))
+                               kvmppc_end_cede(vcpu);
+                       else
+                               kvmppc_set_timer(vcpu);
+               }
+               if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
+                       kvmppc_remove_runnable(vc, vcpu);
+                       wake_up(&vcpu->arch.cpu_run);
+               }
+       }
+}
+
 /*
  * Run a set of guest threads on a physical core.
  * Called with vc->lock held.
  */
-static void kvmppc_run_core(struct kvmppc_vcore *vc)
+static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-       struct kvm_vcpu *vcpu, *vnext;
-       long ret;
-       u64 now;
-       int i, need_vpa_update;
+       struct kvm_vcpu *vcpu;
+       int i;
        int srcu_idx;
-       struct kvm_vcpu *vcpus_to_update[threads_per_core];
 
-       /* don't start if any threads have a signal pending */
-       need_vpa_update = 0;
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-               if (signal_pending(vcpu->arch.run_task))
-                       return;
-               if (vcpu->arch.vpa.update_pending ||
-                   vcpu->arch.slb_shadow.update_pending ||
-                   vcpu->arch.dtl.update_pending)
-                       vcpus_to_update[need_vpa_update++] = vcpu;
-       }
+       /*
+        * Remove from the list any threads that have a signal pending
+        * or need a VPA update done
+        */
+       prepare_threads(vc);
+
+       /* if the runner is no longer runnable, let the caller pick a new one */
+       if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
+               return;
 
        /*
-        * Initialize *vc, in particular vc->vcore_state, so we can
-        * drop the vcore lock if necessary.
+        * Initialize *vc.
         */
-       vc->n_woken = 0;
-       vc->nap_count = 0;
-       vc->entry_exit_count = 0;
+       vc->entry_exit_map = 0;
        vc->preempt_tb = TB_NIL;
-       vc->vcore_state = VCORE_STARTING;
        vc->in_guest = 0;
        vc->napping_threads = 0;
        vc->conferring_threads = 0;
 
        /*
-        * Updating any of the vpas requires calling kvmppc_pin_guest_page,
-        * which can't be called with any spinlocks held.
-        */
-       if (need_vpa_update) {
-               spin_unlock(&vc->lock);
-               for (i = 0; i < need_vpa_update; ++i)
-                       kvmppc_update_vpas(vcpus_to_update[i]);
-               spin_lock(&vc->lock);
-       }
-
-       /*
         * Make sure we are running on primary threads, and that secondary
         * threads are offline.  Also check if the number of threads in this
         * guest are greater than the current system threads per guest.
         */
        if ((threads_per_core > 1) &&
            ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
-               list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+               list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
                        vcpu->arch.ret = -EBUSY;
+                       kvmppc_remove_runnable(vc, vcpu);
+                       wake_up(&vcpu->arch.cpu_run);
+               }
                goto out;
        }
 
@@ -1797,8 +2027,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
                vcpu->cpu = -1;
        /* wait for secondary threads to finish writing their state to memory */
-       if (vc->nap_count < vc->n_woken)
-               kvmppc_wait_for_nap(vc);
+       kvmppc_wait_for_nap();
        for (i = 0; i < threads_per_subcore; ++i)
                kvmppc_release_hwthread(vc->pcpu + i);
        /* prevent other vcpu threads from doing kvmppc_start_thread() now */
@@ -1812,44 +2041,12 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
        kvm_guest_exit();
 
        preempt_enable();
-       cond_resched();
 
        spin_lock(&vc->lock);
-       now = get_tb();
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-               /* cancel pending dec exception if dec is positive */
-               if (now < vcpu->arch.dec_expires &&
-                   kvmppc_core_pending_dec(vcpu))
-                       kvmppc_core_dequeue_dec(vcpu);
-
-               trace_kvm_guest_exit(vcpu);
-
-               ret = RESUME_GUEST;
-               if (vcpu->arch.trap)
-                       ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
-                                                   vcpu->arch.run_task);
-
-               vcpu->arch.ret = ret;
-               vcpu->arch.trap = 0;
-
-               if (vcpu->arch.ceded) {
-                       if (!is_kvmppc_resume_guest(ret))
-                               kvmppc_end_cede(vcpu);
-                       else
-                               kvmppc_set_timer(vcpu);
-               }
-       }
+       post_guest_process(vc);
 
  out:
        vc->vcore_state = VCORE_INACTIVE;
-       list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-                                arch.run_list) {
-               if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
-                       kvmppc_remove_runnable(vc, vcpu);
-                       wake_up(&vcpu->arch.cpu_run);
-               }
-       }
-
        trace_kvmppc_run_core(vc, 1);
 }
 
@@ -1939,8 +2136,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
         * this thread straight away and have it join in.
         */
        if (!signal_pending(current)) {
-               if (vc->vcore_state == VCORE_RUNNING &&
-                   VCORE_EXIT_COUNT(vc) == 0) {
+               if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) {
                        kvmppc_create_dtl_entry(vcpu, vc);
                        kvmppc_start_thread(vcpu);
                        trace_kvm_guest_enter(vcpu);
@@ -1971,7 +2167,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                }
                if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
                        break;
-               vc->runner = vcpu;
                n_ceded = 0;
                list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
                        if (!v->arch.pending_exceptions)
@@ -1979,10 +2174,17 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                        else
                                v->arch.ceded = 0;
                }
-               if (n_ceded == vc->n_runnable)
+               vc->runner = vcpu;
+               if (n_ceded == vc->n_runnable) {
                        kvmppc_vcore_blocked(vc);
-               else
+               } else if (should_resched()) {
+                       vc->vcore_state = VCORE_PREEMPT;
+                       /* Let something else run */
+                       cond_resched_lock(&vc->lock);
+                       vc->vcore_state = VCORE_INACTIVE;
+               } else {
                        kvmppc_run_core(vc);
+               }
                vc->runner = NULL;
        }
 
@@ -2032,11 +2234,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
        }
 
        atomic_inc(&vcpu->kvm->arch.vcpus_running);
-       /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */
+       /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
        smp_mb();
 
        /* On the first time here, set up HTAB and VRMA */
-       if (!vcpu->kvm->arch.rma_setup_done) {
+       if (!vcpu->kvm->arch.hpte_setup_done) {
                r = kvmppc_hv_setup_htab_rma(vcpu);
                if (r)
                        goto out;
@@ -2238,7 +2440,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
        int srcu_idx;
 
        mutex_lock(&kvm->lock);
-       if (kvm->arch.rma_setup_done)
+       if (kvm->arch.hpte_setup_done)
                goto out;       /* another vcpu beat us to it */
 
        /* Allocate hashed page table (if not done already) and reset it */
@@ -2289,9 +2491,9 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 
        kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
 
-       /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
+       /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
        smp_wmb();
-       kvm->arch.rma_setup_done = 1;
+       kvm->arch.hpte_setup_done = 1;
        err = 0;
  out_srcu:
        srcu_read_unlock(&kvm->srcu, srcu_idx);
@@ -2307,6 +2509,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 {
        unsigned long lpcr, lpid;
+       char buf[32];
 
        /* Allocate the guest's logical partition ID */
 
@@ -2347,6 +2550,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
         */
        kvm_hv_vm_activated();
 
+       /*
+        * Create a debugfs directory for the VM
+        */
+       snprintf(buf, sizeof(buf), "vm%d", current->pid);
+       kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
+       if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
+               kvmppc_mmu_debugfs_init(kvm);
+
        return 0;
 }
 
@@ -2367,6 +2578,8 @@ static void kvmppc_free_vcores(struct kvm *kvm)
 
 static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 {
+       debugfs_remove_recursive(kvm->arch.debugfs_dir);
+
        kvm_hv_vm_deactivated();
 
        kvmppc_free_vcores(kvm);
arch/powerpc/kvm/book3s_hv_builtin.c
index 1f083ff..ed2589d 100644
 #include <asm/cputable.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
+#include <asm/archrandom.h>
+#include <asm/xics.h>
+#include <asm/dbell.h>
+#include <asm/cputhreads.h>
 
 #define KVM_CMA_CHUNK_ORDER    18
 
@@ -114,11 +118,11 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
        int rv = H_SUCCESS; /* => don't yield */
 
        set_bit(vcpu->arch.ptid, &vc->conferring_threads);
-       while ((get_tb() < stop) && (VCORE_EXIT_COUNT(vc) == 0)) {
-               threads_running = VCORE_ENTRY_COUNT(vc);
-               threads_ceded = hweight32(vc->napping_threads);
-               threads_conferring = hweight32(vc->conferring_threads);
-               if (threads_ceded + threads_conferring >= threads_running) {
+       while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
+               threads_running = VCORE_ENTRY_MAP(vc);
+               threads_ceded = vc->napping_threads;
+               threads_conferring = vc->conferring_threads;
+               if ((threads_ceded | threads_conferring) == threads_running) {
                        rv = H_TOO_HARD; /* => do yield */
                        break;
                }
@@ -169,3 +173,89 @@ int kvmppc_hcall_impl_hv_realmode(unsigned long cmd)
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);
+
+int kvmppc_hwrng_present(void)
+{
+       return powernv_hwrng_present();
+}
+EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
+
+long kvmppc_h_random(struct kvm_vcpu *vcpu)
+{
+       if (powernv_get_random_real_mode(&vcpu->arch.gpr[4]))
+               return H_SUCCESS;
+
+       return H_HARDWARE;
+}
+
+static inline void rm_writeb(unsigned long paddr, u8 val)
+{
+       __asm__ __volatile__("stbcix %0,0,%1"
+               : : "r" (val), "r" (paddr) : "memory");
+}
+
+/*
+ * Send an interrupt or message to another CPU.
+ * This can only be called in real mode.
+ * The caller needs to include any barrier needed to order writes
+ * to memory vs. the IPI/message.
+ */
+void kvmhv_rm_send_ipi(int cpu)
+{
+       unsigned long xics_phys;
+
+       /* On POWER8 for IPIs to threads in the same core, use msgsnd */
+       if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+           cpu_first_thread_sibling(cpu) ==
+           cpu_first_thread_sibling(raw_smp_processor_id())) {
+               unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+               msg |= cpu_thread_in_core(cpu);
+               __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+               return;
+       }
+
+       /* Else poke the target with an IPI */
+       xics_phys = paca[cpu].kvm_hstate.xics_phys;
+       rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+}
+
+/*
+ * The following functions are called from the assembly code
+ * in book3s_hv_rmhandlers.S.
+ */
+static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active)
+{
+       int cpu = vc->pcpu;
+
+       /* Order setting of exit map vs. msgsnd/IPI */
+       smp_mb();
+       for (; active; active >>= 1, ++cpu)
+               if (active & 1)
+                       kvmhv_rm_send_ipi(cpu);
+}
+
+void kvmhv_commence_exit(int trap)
+{
+       struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+       int ptid = local_paca->kvm_hstate.ptid;
+       int me, ee;
+
+       /* Set our bit in the threads-exiting-guest map in the 0xff00
+          bits of vcore->entry_exit_map */
+       me = 0x100 << ptid;
+       do {
+               ee = vc->entry_exit_map;
+       } while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee);
+
+       /* Are we the first here? */
+       if ((ee >> 8) != 0)
+               return;
+
+       /*
+        * Trigger the other threads in this vcore to exit the guest.
+        * If this is a hypervisor decrementer interrupt then they
+        * will be already on their way out of the guest.
+        */
+       if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
+               kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
+}
arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 625407e..f6bf0b1 100644
@@ -150,12 +150,6 @@ static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
        return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
 }
 
-static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
-{
-       asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
-       hpte[0] = cpu_to_be64(hpte_v);
-}
-
 long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
                       long pte_index, unsigned long pteh, unsigned long ptel,
                       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
@@ -271,10 +265,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
                                u64 pte;
                                while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                                        cpu_relax();
-                               pte = be64_to_cpu(*hpte);
+                               pte = be64_to_cpu(hpte[0]);
                                if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT)))
                                        break;
-                               *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
+                               __unlock_hpte(hpte, pte);
                                hpte += 2;
                        }
                        if (i == 8)
@@ -290,9 +284,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
                        while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                                cpu_relax();
-                       pte = be64_to_cpu(*hpte);
+                       pte = be64_to_cpu(hpte[0]);
                        if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
-                               *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
+                               __unlock_hpte(hpte, pte);
                                return H_PTEG_FULL;
                        }
                }
@@ -331,7 +325,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
        /* Write the first HPTE dword, unlocking the HPTE and making it valid */
        eieio();
-       hpte[0] = cpu_to_be64(pteh);
+       __unlock_hpte(hpte, pteh);
        asm volatile("ptesync" : : : "memory");
 
        *pte_idx_ret = pte_index;
@@ -412,7 +406,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
        if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
            ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
            ((flags & H_ANDCOND) && (pte & avpn) != 0)) {
-               hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+               __unlock_hpte(hpte, pte);
                return H_NOT_FOUND;
        }
 
@@ -548,7 +542,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
                                be64_to_cpu(hp[0]), be64_to_cpu(hp[1]));
                        rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
                        args[j] |= rcbits << (56 - 5);
-                       hp[0] = 0;
+                       __unlock_hpte(hp, 0);
                }
        }
 
@@ -574,7 +568,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
        pte = be64_to_cpu(hpte[0]);
        if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
            ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) {
-               hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+               __unlock_hpte(hpte, pte);
                return H_NOT_FOUND;
        }
 
@@ -755,8 +749,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
                                /* Return with the HPTE still locked */
                                return (hash << 3) + (i >> 1);
 
-                       /* Unlock and move on */
-                       hpte[i] = cpu_to_be64(v);
+                       __unlock_hpte(&hpte[i], v);
                }
 
                if (val & HPTE_V_SECONDARY)
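
The hunks above replace the open-coded HPTE unlock stores with calls to __unlock_hpte(), whose definition lives in kvm_book3s_64.h and is not part of this hunk. The removed inline above did the store behind a release barrier without touching the lock bit, while several call sites here pass a value that may still have HPTE_V_HVLOCK set (e.g. __unlock_hpte(hpte, pte)), which suggests the helper clears the lock bit itself. A minimal sketch under those assumptions (callers such as kvmppc_do_h_enter() issue eieio explicitly before unlocking, so barrier placement is left out here):

        /* Sketch only, not the header's actual definition. */
        static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
        {
                hpte_v &= ~HPTE_V_HVLOCK;       /* drop the software lock bit */
                hpte[0] = cpu_to_be64(hpte_v);  /* rewrite dword 0 of the HPTE */
        }
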
index 7c22997..00e45b6 100644 (file)
 
 #define DEBUG_PASSUP
 
-static inline void rm_writeb(unsigned long paddr, u8 val)
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                           u32 new_irq);
+
+/* -- ICS routines -- */
+static void ics_rm_check_resend(struct kvmppc_xics *xics,
+                               struct kvmppc_ics *ics, struct kvmppc_icp *icp)
 {
-       __asm__ __volatile__("sync; stbcix %0,0,%1"
-               : : "r" (val), "r" (paddr) : "memory");
+       int i;
+
+       arch_spin_lock(&ics->lock);
+
+       for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+               struct ics_irq_state *state = &ics->irq_state[i];
+
+               if (!state->resend)
+                       continue;
+
+               arch_spin_unlock(&ics->lock);
+               icp_rm_deliver_irq(xics, icp, state->number);
+               arch_spin_lock(&ics->lock);
+       }
+
+       arch_spin_unlock(&ics->lock);
 }
 
+/* -- ICP routines -- */
+
 static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
                                struct kvm_vcpu *this_vcpu)
 {
        struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
-       unsigned long xics_phys;
        int cpu;
 
        /* Mark the target VCPU as having an interrupt pending */
@@ -56,9 +76,8 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
        /* In SMT cpu will always point to thread 0, we adjust it */
        cpu += vcpu->arch.ptid;
 
-       /* Not too hard, then poke the target */
-       xics_phys = paca[cpu].kvm_hstate.xics_phys;
-       rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+       smp_mb();
+       kvmhv_rm_send_ipi(cpu);
 }
 
 static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
@@ -116,6 +135,180 @@ static inline int check_too_hard(struct kvmppc_xics *xics,
        return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS;
 }
 
+static void icp_rm_check_resend(struct kvmppc_xics *xics,
+                            struct kvmppc_icp *icp)
+{
+       u32 icsid;
+
+       /* Order this load with the test for need_resend in the caller */
+       smp_rmb();
+       for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+               struct kvmppc_ics *ics = xics->ics[icsid];
+
+               if (!test_and_clear_bit(icsid, icp->resend_map))
+                       continue;
+               if (!ics)
+                       continue;
+               ics_rm_check_resend(xics, ics, icp);
+       }
+}
+
+static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+                              u32 *reject)
+{
+       union kvmppc_icp_state old_state, new_state;
+       bool success;
+
+       do {
+               old_state = new_state = READ_ONCE(icp->state);
+
+               *reject = 0;
+
+               /* See if we can deliver */
+               success = new_state.cppr > priority &&
+                       new_state.mfrr > priority &&
+                       new_state.pending_pri > priority;
+
+               /*
+                * If we can, check for a rejection and perform the
+                * delivery
+                */
+               if (success) {
+                       *reject = new_state.xisr;
+                       new_state.xisr = irq;
+                       new_state.pending_pri = priority;
+               } else {
+                       /*
+                        * If we failed to deliver we set need_resend
+                        * so a subsequent CPPR state change causes us
+                        * to try a new delivery.
+                        */
+                       new_state.need_resend = true;
+               }
+
+       } while (!icp_rm_try_update(icp, old_state, new_state));
+
+       return success;
+}
+
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                           u32 new_irq)
+{
+       struct ics_irq_state *state;
+       struct kvmppc_ics *ics;
+       u32 reject;
+       u16 src;
+
+       /*
+        * This is used both for initial delivery of an interrupt and
+        * for subsequent rejection.
+        *
+        * Rejection can be racy vs. resends. We have evaluated the
+        * rejection in an atomic ICP transaction which is now complete,
+        * so potentially the ICP can already accept the interrupt again.
+        *
+        * So we need to retry the delivery. Essentially the reject path
+        * boils down to a failed delivery. Always.
+        *
+        * Now the interrupt could also have moved to a different target,
+        * thus we may need to re-do the ICP lookup as well
+        */
+
+ again:
+       /* Get the ICS state and lock it */
+       ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+       if (!ics) {
+               /* Unsafe increment, but this does not need to be accurate */
+               xics->err_noics++;
+               return;
+       }
+       state = &ics->irq_state[src];
+
+       /* Get a lock on the ICS */
+       arch_spin_lock(&ics->lock);
+
+       /* Get our server */
+       if (!icp || state->server != icp->server_num) {
+               icp = kvmppc_xics_find_server(xics->kvm, state->server);
+               if (!icp) {
+                       /* Unsafe increment again */
+                       xics->err_noicp++;
+                       goto out;
+               }
+       }
+
+       /* Clear the resend bit of that interrupt */
+       state->resend = 0;
+
+       /*
+        * If masked, bail out
+        *
+        * Note: PAPR doesn't mention anything about masked pending
+        * when doing a resend, only when doing a delivery.
+        *
+        * However that would have the effect of losing a masked
+        * interrupt that was rejected and isn't consistent with
+        * the whole masked_pending business which is about not
+        * losing interrupts that occur while masked.
+        *
+        * I don't differentiate normal deliveries and resends, this
+        * implementation will differ from PAPR and not lose such
+        * interrupts.
+        */
+       if (state->priority == MASKED) {
+               state->masked_pending = 1;
+               goto out;
+       }
+
+       /*
+        * Try the delivery, this will set the need_resend flag
+        * in the ICP as part of the atomic transaction if the
+        * delivery is not possible.
+        *
+        * Note that if successful, the new delivery might have itself
+        * rejected an interrupt that was "delivered" before we took the
+        * ics spin lock.
+        *
+        * In this case we do the whole sequence all over again for the
+        * new guy. We cannot assume that the rejected interrupt is less
+        * favored than the new one, and thus doesn't need to be delivered,
+        * because by the time we exit icp_rm_try_to_deliver() the target
+        * processor may well have already consumed & completed it, and thus
+        * the rejected interrupt might actually be already acceptable.
+        */
+       if (icp_rm_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+               /*
+                * Delivery was successful, did we reject somebody else ?
+                */
+               if (reject && reject != XICS_IPI) {
+                       arch_spin_unlock(&ics->lock);
+                       new_irq = reject;
+                       goto again;
+               }
+       } else {
+               /*
+                * We failed to deliver the interrupt we need to set the
+                * We failed to deliver the interrupt, so we need to set the
+                * resend map bit and mark the ICS state as needing a resend.
+               set_bit(ics->icsid, icp->resend_map);
+               state->resend = 1;
+
+               /*
+                * If the need_resend flag got cleared in the ICP some time
+                * between icp_rm_try_to_deliver() atomic update and now, then
+                * we know it might have missed the resend_map bit. So we
+                * retry
+                */
+               smp_mb();
+               if (!icp->state.need_resend) {
+                       arch_spin_unlock(&ics->lock);
+                       goto again;
+               }
+       }
+ out:
+       arch_spin_unlock(&ics->lock);
+}
+
 static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
                             u8 new_cppr)
 {
@@ -184,8 +377,8 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
         * separately here as well.
         */
        if (resend) {
-               icp->rm_action |= XICS_RM_CHECK_RESEND;
-               icp->rm_resend_icp = icp;
+               icp->n_check_resend++;
+               icp_rm_check_resend(xics, icp);
        }
 }
 
@@ -300,16 +493,16 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
                }
        } while (!icp_rm_try_update(icp, old_state, new_state));
 
-       /* Pass rejects to virtual mode */
+       /* Handle reject in real mode */
        if (reject && reject != XICS_IPI) {
-               this_icp->rm_action |= XICS_RM_REJECT;
-               this_icp->rm_reject = reject;
+               this_icp->n_reject++;
+               icp_rm_deliver_irq(xics, icp, reject);
        }
 
-       /* Pass resends to virtual mode */
+       /* Handle resends in real mode */
        if (resend) {
-               this_icp->rm_action |= XICS_RM_CHECK_RESEND;
-               this_icp->rm_resend_icp = icp;
+               this_icp->n_check_resend++;
+               icp_rm_check_resend(xics, icp);
        }
 
        return check_too_hard(xics, this_icp);
@@ -365,10 +558,13 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 
        } while (!icp_rm_try_update(icp, old_state, new_state));
 
-       /* Pass rejects to virtual mode */
+       /*
+        * Check for rejects. They are handled by doing a new delivery
+        * attempt (see comments in icp_rm_deliver_irq).
+        */
        if (reject && reject != XICS_IPI) {
-               icp->rm_action |= XICS_RM_REJECT;
-               icp->rm_reject = reject;
+               icp->n_reject++;
+               icp_rm_deliver_irq(xics, icp, reject);
        }
  bail:
        return check_too_hard(xics, icp);
@@ -416,10 +612,10 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
                goto bail;
        state = &ics->irq_state[src];
 
-       /* Still asserted, resend it, we make it look like a reject */
+       /* Still asserted, resend it */
        if (state->asserted) {
-               icp->rm_action |= XICS_RM_REJECT;
-               icp->rm_reject = irq;
+               icp->n_reject++;
+               icp_rm_deliver_irq(xics, icp, irq);
        }
 
        if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
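
The retry loops in icp_rm_try_to_deliver(), kvmppc_rm_h_ipi() and kvmppc_rm_h_cppr() above spin on icp_rm_try_update(), which is not shown in this hunk. Its virtual-mode counterpart in book3s_xics.c does a compare-and-swap on the packed ICP state word; a hedged sketch of that pattern (the real-mode version also defers vcpu kicks via icp->rm_action, which is omitted here):

        /*
         * Sketch of the lock-free ICP state update the loops above rely on.
         * union kvmppc_icp_state overlays cppr/mfrr/xisr/... on a single
         * 64-bit word 'raw', so the whole state can be swapped atomically.
         */
        static bool icp_rm_try_update(struct kvmppc_icp *icp,
                                      union kvmppc_icp_state old,
                                      union kvmppc_icp_state new)
        {
                /* Fails, and the caller retries, if the state changed under us. */
                return cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
        }
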
index 6cbf163..4d70df2 100644 (file)
@@ -172,6 +172,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 kvmppc_primary_no_guest:
        /* We handle this much like a ceded vcpu */
+       /* put the HDEC into the DEC, since HDEC interrupts don't wake us */
+       mfspr   r3, SPRN_HDEC
+       mtspr   SPRN_DEC, r3
+       /*
+        * Make sure the primary has finished the MMU switch.
+        * We should never get here on a secondary thread, but
+        * check it for robustness' sake.
+        */
+       ld      r5, HSTATE_KVM_VCORE(r13)
+65:    lbz     r0, VCORE_IN_GUEST(r5)
+       cmpwi   r0, 0
+       beq     65b
+       /* Set LPCR. */
+       ld      r8,VCORE_LPCR(r5)
+       mtspr   SPRN_LPCR,r8
+       isync
        /* set our bit in napping_threads */
        ld      r5, HSTATE_KVM_VCORE(r13)
        lbz     r7, HSTATE_PTID(r13)
@@ -182,7 +198,7 @@ kvmppc_primary_no_guest:
        or      r3, r3, r0
        stwcx.  r3, 0, r6
        bne     1b
-       /* order napping_threads update vs testing entry_exit_count */
+       /* order napping_threads update vs testing entry_exit_map */
        isync
        li      r12, 0
        lwz     r7, VCORE_ENTRY_EXIT(r5)
@@ -191,6 +207,7 @@ kvmppc_primary_no_guest:
        li      r3, NAPPING_NOVCPU
        stb     r3, HSTATE_NAPPING(r13)
 
+       li      r3, 0           /* Don't wake on privileged (OS) doorbell */
        b       kvm_do_nap
 
 kvm_novcpu_wakeup:
@@ -202,7 +219,7 @@ kvm_novcpu_wakeup:
 
        /* check the wake reason */
        bl      kvmppc_check_wake_reason
-       
+
        /* see if any other thread is already exiting */
        lwz     r0, VCORE_ENTRY_EXIT(r5)
        cmpwi   r0, 0x100
@@ -222,13 +239,37 @@ kvm_novcpu_wakeup:
        cmpdi   r3, 0
        bge     kvm_novcpu_exit
 
+       /* See if our timeslice has expired (HDEC is negative) */
+       mfspr   r0, SPRN_HDEC
+       li      r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+       cmpwi   r0, 0
+       blt     kvm_novcpu_exit
+
        /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
        ld      r4, HSTATE_KVM_VCPU(r13)
        cmpdi   r4, 0
-       bne     kvmppc_got_guest
+       beq     kvmppc_primary_no_guest
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       addi    r3, r4, VCPU_TB_RMENTRY
+       bl      kvmhv_start_timing
+#endif
+       b       kvmppc_got_guest
 
 kvm_novcpu_exit:
-       b       hdec_soon
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       cmpdi   r4, 0
+       beq     13f
+       addi    r3, r4, VCPU_TB_RMEXIT
+       bl      kvmhv_accumulate_time
+#endif
+13:    mr      r3, r12
+       stw     r12, 112-4(r1)
+       bl      kvmhv_commence_exit
+       nop
+       lwz     r12, 112-4(r1)
+       b       kvmhv_switch_to_host
 
 /*
  * We come in here when wakened from nap mode.
@@ -239,9 +280,9 @@ kvm_novcpu_exit:
 kvm_start_guest:
 
        /* Set runlatch bit the minute you wake up from nap */
-       mfspr   r1, SPRN_CTRLF
-       ori     r1, r1, 1
-       mtspr   SPRN_CTRLT, r1
+       mfspr   r0, SPRN_CTRLF
+       ori     r0, r0, 1
+       mtspr   SPRN_CTRLT, r0
 
        ld      r2,PACATOC(r13)
 
@@ -286,26 +327,21 @@ kvm_secondary_got_guest:
        ld      r6, PACA_DSCR(r13)
        std     r6, HSTATE_DSCR(r13)
 
+       /* Order load of vcore, ptid etc. after load of vcpu */
+       lwsync
        bl      kvmppc_hv_entry
 
        /* Back from the guest, go back to nap */
        /* Clear our vcpu pointer so we don't come back in early */
        li      r0, 0
-       std     r0, HSTATE_KVM_VCPU(r13)
        /*
-        * Make sure we clear HSTATE_KVM_VCPU(r13) before incrementing
-        * the nap_count, because once the increment to nap_count is
-        * visible we could be given another vcpu.
+        * Once we clear HSTATE_KVM_VCPU(r13), the code in
+        * kvmppc_run_core() is going to assume that all our vcpu
+        * state is visible in memory.  This lwsync makes sure
+        * that that is true.
         */
        lwsync
-
-       /* increment the nap count and then go to nap mode */
-       ld      r4, HSTATE_KVM_VCORE(r13)
-       addi    r4, r4, VCORE_NAP_COUNT
-51:    lwarx   r3, 0, r4
-       addi    r3, r3, 1
-       stwcx.  r3, 0, r4
-       bne     51b
+       std     r0, HSTATE_KVM_VCPU(r13)
 
 /*
  * At this point we have finished executing in the guest.
@@ -376,6 +412,14 @@ kvmppc_hv_entry:
        li      r6, KVM_GUEST_MODE_HOST_HV
        stb     r6, HSTATE_IN_GUEST(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       /* Store initial timestamp */
+       cmpdi   r4, 0
+       beq     1f
+       addi    r3, r4, VCPU_TB_RMENTRY
+       bl      kvmhv_start_timing
+1:
+#endif
        /* Clear out SLB */
        li      r6,0
        slbmte  r6,r6
@@ -387,21 +431,23 @@ kvmppc_hv_entry:
         * We don't have to lock against concurrent tlbies,
         * but we do have to coordinate across hardware threads.
         */
-       /* Increment entry count iff exit count is zero. */
-       ld      r5,HSTATE_KVM_VCORE(r13)
-       addi    r9,r5,VCORE_ENTRY_EXIT
-21:    lwarx   r3,0,r9
-       cmpwi   r3,0x100                /* any threads starting to exit? */
+       /* Set bit in entry map iff exit map is zero. */
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       li      r7, 1
+       lbz     r6, HSTATE_PTID(r13)
+       sld     r7, r7, r6
+       addi    r9, r5, VCORE_ENTRY_EXIT
+21:    lwarx   r3, 0, r9
+       cmpwi   r3, 0x100               /* any threads starting to exit? */
        bge     secondary_too_late      /* if so we're too late to the party */
-       addi    r3,r3,1
-       stwcx.  r3,0,r9
+       or      r3, r3, r7
+       stwcx.  r3, 0, r9
        bne     21b
 
        /* Primary thread switches to guest partition. */
        ld      r9,VCORE_KVM(r5)        /* pointer to struct kvm */
-       lbz     r6,HSTATE_PTID(r13)
        cmpwi   r6,0
-       bne     20f
+       bne     10f
        ld      r6,KVM_SDR1(r9)
        lwz     r7,KVM_LPID(r9)
        li      r0,LPID_RSVD            /* switch to reserved LPID */
@@ -472,28 +518,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
        li      r0,1
        stb     r0,VCORE_IN_GUEST(r5)   /* signal secondaries to continue */
-       b       10f
-
-       /* Secondary threads wait for primary to have done partition switch */
-20:    lbz     r0,VCORE_IN_GUEST(r5)
-       cmpwi   r0,0
-       beq     20b
-
-       /* Set LPCR and RMOR. */
-10:    ld      r8,VCORE_LPCR(r5)
-       mtspr   SPRN_LPCR,r8
-       ld      r8,KVM_RMOR(r9)
-       mtspr   SPRN_RMOR,r8
-       isync
-
-       /* Check if HDEC expires soon */
-       mfspr   r3,SPRN_HDEC
-       cmpwi   r3,512          /* 1 microsecond */
-       li      r12,BOOK3S_INTERRUPT_HV_DECREMENTER
-       blt     hdec_soon
 
        /* Do we have a guest vcpu to run? */
-       cmpdi   r4, 0
+10:    cmpdi   r4, 0
        beq     kvmppc_primary_no_guest
 kvmppc_got_guest:
 
@@ -818,6 +845,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        clrrdi  r6,r6,1
        mtspr   SPRN_CTRLT,r6
 4:
+       /* Secondary threads wait for primary to have done partition switch */
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       lbz     r6, HSTATE_PTID(r13)
+       cmpwi   r6, 0
+       beq     21f
+       lbz     r0, VCORE_IN_GUEST(r5)
+       cmpwi   r0, 0
+       bne     21f
+       HMT_LOW
+20:    lbz     r0, VCORE_IN_GUEST(r5)
+       cmpwi   r0, 0
+       beq     20b
+       HMT_MEDIUM
+21:
+       /* Set LPCR. */
+       ld      r8,VCORE_LPCR(r5)
+       mtspr   SPRN_LPCR,r8
+       isync
+
+       /* Check if HDEC expires soon */
+       mfspr   r3, SPRN_HDEC
+       cmpwi   r3, 512         /* 1 microsecond */
+       blt     hdec_soon
+
        ld      r6, VCPU_CTR(r4)
        lwz     r7, VCPU_XER(r4)
 
@@ -880,6 +931,12 @@ fast_guest_return:
        li      r9, KVM_GUEST_MODE_GUEST_HV
        stb     r9, HSTATE_IN_GUEST(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       /* Accumulate timing */
+       addi    r3, r4, VCPU_TB_GUEST
+       bl      kvmhv_accumulate_time
+#endif
+
        /* Enter guest */
 
 BEGIN_FTR_SECTION
@@ -917,6 +974,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        hrfid
        b       .
 
+secondary_too_late:
+       li      r12, 0
+       cmpdi   r4, 0
+       beq     11f
+       stw     r12, VCPU_TRAP(r4)
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       addi    r3, r4, VCPU_TB_RMEXIT
+       bl      kvmhv_accumulate_time
+#endif
+11:    b       kvmhv_switch_to_host
+
+hdec_soon:
+       li      r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+       stw     r12, VCPU_TRAP(r4)
+       mr      r9, r4
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       addi    r3, r4, VCPU_TB_RMEXIT
+       bl      kvmhv_accumulate_time
+#endif
+       b       guest_exit_cont
+
 /******************************************************************************
  *                                                                            *
  *                               Exit code                                    *
@@ -1002,6 +1080,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
        stw     r12,VCPU_TRAP(r9)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       addi    r3, r9, VCPU_TB_RMINTR
+       mr      r4, r9
+       bl      kvmhv_accumulate_time
+       ld      r5, VCPU_GPR(R5)(r9)
+       ld      r6, VCPU_GPR(R6)(r9)
+       ld      r7, VCPU_GPR(R7)(r9)
+       ld      r8, VCPU_GPR(R8)(r9)
+#endif
+
        /* Save HEIR (HV emulation assist reg) in emul_inst
           if this is an HEI (HV emulation interrupt, e40) */
        li      r3,KVM_INST_FETCH_FAILED
@@ -1028,34 +1116,37 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        bne     2f
        mfspr   r3,SPRN_HDEC
        cmpwi   r3,0
-       bge     ignore_hdec
+       mr      r4,r9
+       bge     fast_guest_return
 2:
        /* See if this is an hcall we can handle in real mode */
        cmpwi   r12,BOOK3S_INTERRUPT_SYSCALL
        beq     hcall_try_real_mode
 
+       /* Hypervisor doorbell - exit only if host IPI flag set */
+       cmpwi   r12, BOOK3S_INTERRUPT_H_DOORBELL
+       bne     3f
+       lbz     r0, HSTATE_HOST_IPI(r13)
+       beq     4f
+       b       guest_exit_cont
+3:
        /* External interrupt ? */
        cmpwi   r12, BOOK3S_INTERRUPT_EXTERNAL
-       bne+    ext_interrupt_to_host
+       bne+    guest_exit_cont
 
        /* External interrupt, first check for host_ipi. If this is
         * set, we know the host wants us out so let's do it now
         */
        bl      kvmppc_read_intr
        cmpdi   r3, 0
-       bgt     ext_interrupt_to_host
+       bgt     guest_exit_cont
 
        /* Check if any CPU is heading out to the host, if so head out too */
-       ld      r5, HSTATE_KVM_VCORE(r13)
+4:     ld      r5, HSTATE_KVM_VCORE(r13)
        lwz     r0, VCORE_ENTRY_EXIT(r5)
        cmpwi   r0, 0x100
-       bge     ext_interrupt_to_host
-
-       /* Return to guest after delivering any pending interrupt */
        mr      r4, r9
-       b       deliver_guest_interrupt
-
-ext_interrupt_to_host:
+       blt     deliver_guest_interrupt
 
 guest_exit_cont:               /* r9 = vcpu, r12 = trap, r13 = paca */
        /* Save more register state  */
@@ -1065,7 +1156,7 @@ guest_exit_cont:          /* r9 = vcpu, r12 = trap, r13 = paca */
        stw     r7, VCPU_DSISR(r9)
        /* don't overwrite fault_dar/fault_dsisr if HDSI */
        cmpwi   r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
-       beq     6f
+       beq     mc_cont
        std     r6, VCPU_FAULT_DAR(r9)
        stw     r7, VCPU_FAULT_DSISR(r9)
 
@@ -1073,9 +1164,20 @@ guest_exit_cont:         /* r9 = vcpu, r12 = trap, r13 = paca */
        cmpwi   r12, BOOK3S_INTERRUPT_MACHINE_CHECK
        beq     machine_check_realmode
 mc_cont:
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       addi    r3, r9, VCPU_TB_RMEXIT
+       mr      r4, r9
+       bl      kvmhv_accumulate_time
+#endif
+
+       /* Increment exit count, poke other threads to exit */
+       bl      kvmhv_commence_exit
+       nop
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       lwz     r12, VCPU_TRAP(r9)
 
        /* Save guest CTRL register, set runlatch to 1 */
-6:     mfspr   r6,SPRN_CTRLF
+       mfspr   r6,SPRN_CTRLF
        stw     r6,VCPU_CTRL(r9)
        andi.   r0,r6,1
        bne     4f
@@ -1417,68 +1519,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        slbia
        ptesync
 
-hdec_soon:                     /* r12 = trap, r13 = paca */
        /*
         * POWER7/POWER8 guest -> host partition switch code.
         * We don't have to lock against tlbies but we do
         * have to coordinate the hardware threads.
         */
-       /* Increment the threads-exiting-guest count in the 0xff00
-          bits of vcore->entry_exit_count */
-       ld      r5,HSTATE_KVM_VCORE(r13)
-       addi    r6,r5,VCORE_ENTRY_EXIT
-41:    lwarx   r3,0,r6
-       addi    r0,r3,0x100
-       stwcx.  r0,0,r6
-       bne     41b
-       isync           /* order stwcx. vs. reading napping_threads */
-
-       /*
-        * At this point we have an interrupt that we have to pass
-        * up to the kernel or qemu; we can't handle it in real mode.
-        * Thus we have to do a partition switch, so we have to
-        * collect the other threads, if we are the first thread
-        * to take an interrupt.  To do this, we set the HDEC to 0,
-        * which causes an HDEC interrupt in all threads within 2ns
-        * because the HDEC register is shared between all 4 threads.
-        * However, we don't need to bother if this is an HDEC
-        * interrupt, since the other threads will already be on their
-        * way here in that case.
-        */
-       cmpwi   r3,0x100        /* Are we the first here? */
-       bge     43f
-       cmpwi   r12,BOOK3S_INTERRUPT_HV_DECREMENTER
-       beq     40f
-       li      r0,0
-       mtspr   SPRN_HDEC,r0
-40:
-       /*
-        * Send an IPI to any napping threads, since an HDEC interrupt
-        * doesn't wake CPUs up from nap.
-        */
-       lwz     r3,VCORE_NAPPING_THREADS(r5)
-       lbz     r4,HSTATE_PTID(r13)
-       li      r0,1
-       sld     r0,r0,r4
-       andc.   r3,r3,r0                /* no sense IPI'ing ourselves */
-       beq     43f
-       /* Order entry/exit update vs. IPIs */
-       sync
-       mulli   r4,r4,PACA_SIZE         /* get paca for thread 0 */
-       subf    r6,r4,r13
-42:    andi.   r0,r3,1
-       beq     44f
-       ld      r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */
-       li      r0,IPI_PRIORITY
-       li      r7,XICS_MFRR
-       stbcix  r0,r7,r8                /* trigger the IPI */
-44:    srdi.   r3,r3,1
-       addi    r6,r6,PACA_SIZE
-       bne     42b
-
-secondary_too_late:
+kvmhv_switch_to_host:
        /* Secondary threads wait for primary to do partition switch */
-43:    ld      r5,HSTATE_KVM_VCORE(r13)
+       ld      r5,HSTATE_KVM_VCORE(r13)
        ld      r4,VCORE_KVM(r5)        /* pointer to struct kvm */
        lbz     r3,HSTATE_PTID(r13)
        cmpwi   r3,0
@@ -1562,6 +1610,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 1:     addi    r8,r8,16
        .endr
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       /* Finish timing, if we have a vcpu */
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       cmpdi   r4, 0
+       li      r3, 0
+       beq     2f
+       bl      kvmhv_accumulate_time
+2:
+#endif
        /* Unset guest mode */
        li      r0, KVM_GUEST_MODE_NONE
        stb     r0, HSTATE_IN_GUEST(r13)
@@ -1696,8 +1753,10 @@ kvmppc_hisi:
  * Returns to the guest if we handle it, or continues on up to
  * the kernel if we can't (i.e. if we don't have a handler for
  * it, or if the handler returns H_TOO_HARD).
+ *
+ * r5 - r8 contain hcall args,
+ * r9 = vcpu, r10 = pc, r11 = msr, r12 = trap, r13 = paca
  */
-       .globl  hcall_try_real_mode
 hcall_try_real_mode:
        ld      r3,VCPU_GPR(R3)(r9)
        andi.   r0,r11,MSR_PR
@@ -1839,13 +1898,124 @@ hcall_real_table:
        .long   0               /* 0x12c */
        .long   0               /* 0x130 */
        .long   DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
+       .long   0               /* 0x138 */
+       .long   0               /* 0x13c */
+       .long   0               /* 0x140 */
+       .long   0               /* 0x144 */
+       .long   0               /* 0x148 */
+       .long   0               /* 0x14c */
+       .long   0               /* 0x150 */
+       .long   0               /* 0x154 */
+       .long   0               /* 0x158 */
+       .long   0               /* 0x15c */
+       .long   0               /* 0x160 */
+       .long   0               /* 0x164 */
+       .long   0               /* 0x168 */
+       .long   0               /* 0x16c */
+       .long   0               /* 0x170 */
+       .long   0               /* 0x174 */
+       .long   0               /* 0x178 */
+       .long   0               /* 0x17c */
+       .long   0               /* 0x180 */
+       .long   0               /* 0x184 */
+       .long   0               /* 0x188 */
+       .long   0               /* 0x18c */
+       .long   0               /* 0x190 */
+       .long   0               /* 0x194 */
+       .long   0               /* 0x198 */
+       .long   0               /* 0x19c */
+       .long   0               /* 0x1a0 */
+       .long   0               /* 0x1a4 */
+       .long   0               /* 0x1a8 */
+       .long   0               /* 0x1ac */
+       .long   0               /* 0x1b0 */
+       .long   0               /* 0x1b4 */
+       .long   0               /* 0x1b8 */
+       .long   0               /* 0x1bc */
+       .long   0               /* 0x1c0 */
+       .long   0               /* 0x1c4 */
+       .long   0               /* 0x1c8 */
+       .long   0               /* 0x1cc */
+       .long   0               /* 0x1d0 */
+       .long   0               /* 0x1d4 */
+       .long   0               /* 0x1d8 */
+       .long   0               /* 0x1dc */
+       .long   0               /* 0x1e0 */
+       .long   0               /* 0x1e4 */
+       .long   0               /* 0x1e8 */
+       .long   0               /* 0x1ec */
+       .long   0               /* 0x1f0 */
+       .long   0               /* 0x1f4 */
+       .long   0               /* 0x1f8 */
+       .long   0               /* 0x1fc */
+       .long   0               /* 0x200 */
+       .long   0               /* 0x204 */
+       .long   0               /* 0x208 */
+       .long   0               /* 0x20c */
+       .long   0               /* 0x210 */
+       .long   0               /* 0x214 */
+       .long   0               /* 0x218 */
+       .long   0               /* 0x21c */
+       .long   0               /* 0x220 */
+       .long   0               /* 0x224 */
+       .long   0               /* 0x228 */
+       .long   0               /* 0x22c */
+       .long   0               /* 0x230 */
+       .long   0               /* 0x234 */
+       .long   0               /* 0x238 */
+       .long   0               /* 0x23c */
+       .long   0               /* 0x240 */
+       .long   0               /* 0x244 */
+       .long   0               /* 0x248 */
+       .long   0               /* 0x24c */
+       .long   0               /* 0x250 */
+       .long   0               /* 0x254 */
+       .long   0               /* 0x258 */
+       .long   0               /* 0x25c */
+       .long   0               /* 0x260 */
+       .long   0               /* 0x264 */
+       .long   0               /* 0x268 */
+       .long   0               /* 0x26c */
+       .long   0               /* 0x270 */
+       .long   0               /* 0x274 */
+       .long   0               /* 0x278 */
+       .long   0               /* 0x27c */
+       .long   0               /* 0x280 */
+       .long   0               /* 0x284 */
+       .long   0               /* 0x288 */
+       .long   0               /* 0x28c */
+       .long   0               /* 0x290 */
+       .long   0               /* 0x294 */
+       .long   0               /* 0x298 */
+       .long   0               /* 0x29c */
+       .long   0               /* 0x2a0 */
+       .long   0               /* 0x2a4 */
+       .long   0               /* 0x2a8 */
+       .long   0               /* 0x2ac */
+       .long   0               /* 0x2b0 */
+       .long   0               /* 0x2b4 */
+       .long   0               /* 0x2b8 */
+       .long   0               /* 0x2bc */
+       .long   0               /* 0x2c0 */
+       .long   0               /* 0x2c4 */
+       .long   0               /* 0x2c8 */
+       .long   0               /* 0x2cc */
+       .long   0               /* 0x2d0 */
+       .long   0               /* 0x2d4 */
+       .long   0               /* 0x2d8 */
+       .long   0               /* 0x2dc */
+       .long   0               /* 0x2e0 */
+       .long   0               /* 0x2e4 */
+       .long   0               /* 0x2e8 */
+       .long   0               /* 0x2ec */
+       .long   0               /* 0x2f0 */
+       .long   0               /* 0x2f4 */
+       .long   0               /* 0x2f8 */
+       .long   0               /* 0x2fc */
+       .long   DOTSYM(kvmppc_h_random) - hcall_real_table
        .globl  hcall_real_table_end
 hcall_real_table_end:
 
-ignore_hdec:
-       mr      r4,r9
-       b       fast_guest_return
-
 _GLOBAL(kvmppc_h_set_xdabr)
        andi.   r0, r5, DABRX_USER | DABRX_KERNEL
        beq     6f
@@ -1884,7 +2054,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        li      r3, 0
        blr
 
-_GLOBAL(kvmppc_h_cede)
+_GLOBAL(kvmppc_h_cede)         /* r3 = vcpu pointer, r11 = msr, r13 = paca */
        ori     r11,r11,MSR_EE
        std     r11,VCPU_MSR(r3)
        li      r0,1
@@ -1893,8 +2063,8 @@ _GLOBAL(kvmppc_h_cede)
        lbz     r5,VCPU_PRODDED(r3)
        cmpwi   r5,0
        bne     kvm_cede_prodded
-       li      r0,0            /* set trap to 0 to say hcall is handled */
-       stw     r0,VCPU_TRAP(r3)
+       li      r12,0           /* set trap to 0 to say hcall is handled */
+       stw     r12,VCPU_TRAP(r3)
        li      r0,H_SUCCESS
        std     r0,VCPU_GPR(R3)(r3)
 
@@ -1912,12 +2082,11 @@ _GLOBAL(kvmppc_h_cede)
        addi    r6,r5,VCORE_NAPPING_THREADS
 31:    lwarx   r4,0,r6
        or      r4,r4,r0
-       PPC_POPCNTW(R7,R4)
-       cmpw    r7,r8
-       bge     kvm_cede_exit
+       cmpw    r4,r8
+       beq     kvm_cede_exit
        stwcx.  r4,0,r6
        bne     31b
-       /* order napping_threads update vs testing entry_exit_count */
+       /* order napping_threads update vs testing entry_exit_map */
        isync
        li      r0,NAPPING_CEDE
        stb     r0,HSTATE_NAPPING(r13)
@@ -1955,21 +2124,52 @@ _GLOBAL(kvmppc_h_cede)
        bl      kvmppc_save_fp
 
        /*
+        * Set DEC to the smaller of DEC and HDEC, so that we wake
+        * no later than the end of our timeslice (HDEC interrupts
+        * don't wake us from nap).
+        */
+       mfspr   r3, SPRN_DEC
+       mfspr   r4, SPRN_HDEC
+       mftb    r5
+       cmpw    r3, r4
+       ble     67f
+       mtspr   SPRN_DEC, r4
+67:
+       /* save expiry time of guest decrementer */
+       extsw   r3, r3
+       add     r3, r3, r5
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       ld      r6, VCORE_TB_OFFSET(r5)
+       subf    r3, r6, r3      /* convert to host TB value */
+       std     r3, VCPU_DEC_EXPIRES(r4)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       addi    r3, r4, VCPU_TB_CEDE
+       bl      kvmhv_accumulate_time
+#endif
+
+       lis     r3, LPCR_PECEDP@h       /* Do wake on privileged doorbell */
+
+       /*
         * Take a nap until a decrementer or external or doorbell interrupt
-        * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the
-        * runlatch bit before napping.
+        * occurs, with PECE1 and PECE0 set in LPCR.
+        * On POWER8, set PECEDH, and if we are ceding, also set PECEDP.
+        * Also clear the runlatch bit before napping.
         */
 kvm_do_nap:
-       mfspr   r2, SPRN_CTRLF
-       clrrdi  r2, r2, 1
-       mtspr   SPRN_CTRLT, r2
+       mfspr   r0, SPRN_CTRLF
+       clrrdi  r0, r0, 1
+       mtspr   SPRN_CTRLT, r0
 
        li      r0,1
        stb     r0,HSTATE_HWTHREAD_REQ(r13)
        mfspr   r5,SPRN_LPCR
        ori     r5,r5,LPCR_PECE0 | LPCR_PECE1
 BEGIN_FTR_SECTION
-       oris    r5,r5,LPCR_PECEDP@h
+       ori     r5, r5, LPCR_PECEDH
+       rlwimi  r5, r3, 0, LPCR_PECEDP
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        mtspr   SPRN_LPCR,r5
        isync
@@ -1994,9 +2194,23 @@ kvm_end_cede:
        /* Woken by external or decrementer interrupt */
        ld      r1, HSTATE_HOST_R1(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+       addi    r3, r4, VCPU_TB_RMINTR
+       bl      kvmhv_accumulate_time
+#endif
+
        /* load up FP state */
        bl      kvmppc_load_fp
 
+       /* Restore guest decrementer */
+       ld      r3, VCPU_DEC_EXPIRES(r4)
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       ld      r6, VCORE_TB_OFFSET(r5)
+       add     r3, r3, r6      /* convert host TB to guest TB value */
+       mftb    r7
+       subf    r3, r7, r3
+       mtspr   SPRN_DEC, r3
+
        /* Load NV GPRS */
        ld      r14, VCPU_GPR(R14)(r4)
        ld      r15, VCPU_GPR(R15)(r4)
@@ -2057,7 +2271,8 @@ kvm_cede_prodded:
 
        /* we've ceded but we want to give control to the host */
 kvm_cede_exit:
-       b       hcall_real_fallback
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       b       guest_exit_cont
 
        /* Try to handle a machine check in real mode */
 machine_check_realmode:
@@ -2089,13 +2304,14 @@ machine_check_realmode:
 
 /*
  * Check the reason we woke from nap, and take appropriate action.
- * Returns:
+ * Returns (in r3):
  *     0 if nothing needs to be done
  *     1 if something happened that needs to be handled by the host
- *     -1 if there was a guest wakeup (IPI)
+ *     -1 if there was a guest wakeup (IPI or msgsnd)
  *
  * Also sets r12 to the interrupt vector for any interrupt that needs
  * to be handled now by the host (0x500 for external interrupt), or zero.
+ * Modifies r0, r6, r7, r8.
  */
 kvmppc_check_wake_reason:
        mfspr   r6, SPRN_SRR1
@@ -2122,7 +2338,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
        /* hypervisor doorbell */
 3:     li      r12, BOOK3S_INTERRUPT_H_DOORBELL
+       /* see if it's a host IPI */
        li      r3, 1
+       lbz     r0, HSTATE_HOST_IPI(r13)
+       cmpwi   r0, 0
+       bnelr
+       /* if not, clear it and return -1 */
+       lis     r6, (PPC_DBELL_SERVER << (63-36))@h
+       PPC_MSGCLR(6)
+       li      r3, -1
        blr
 
 /*
@@ -2131,6 +2355,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  *     0 if no interrupt is pending
  *     1 if an interrupt is pending that needs to be handled by the host
  *     -1 if there was a guest wakeup IPI (which has now been cleared)
+ * Modifies r0, r6, r7, r8, returns value in r3.
  */
 kvmppc_read_intr:
        /* see if a host IPI is pending */
@@ -2185,6 +2410,7 @@ kvmppc_read_intr:
        bne-    43f
 
        /* OK, it's an IPI for us */
+       li      r12, 0
        li      r3, -1
 1:     blr
 
@@ -2314,3 +2540,62 @@ kvmppc_fix_pmao:
        mtspr   SPRN_PMC6, r3
        isync
        blr
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+/*
+ * Start timing an activity
+ * r3 = pointer to time accumulation struct, r4 = vcpu
+ */
+kvmhv_start_timing:
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       lbz     r6, VCORE_IN_GUEST(r5)
+       cmpwi   r6, 0
+       beq     5f                              /* if in guest, need to */
+       ld      r6, VCORE_TB_OFFSET(r5)         /* subtract timebase offset */
+5:     mftb    r5
+       subf    r5, r6, r5
+       std     r3, VCPU_CUR_ACTIVITY(r4)
+       std     r5, VCPU_ACTIVITY_START(r4)
+       blr
+
+/*
+ * Accumulate time to one activity and start another.
+ * r3 = pointer to new time accumulation struct, r4 = vcpu
+ */
+kvmhv_accumulate_time:
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       lbz     r8, VCORE_IN_GUEST(r5)
+       cmpwi   r8, 0
+       beq     4f                              /* if in guest, need to */
+       ld      r8, VCORE_TB_OFFSET(r5)         /* subtract timebase offset */
+4:     ld      r5, VCPU_CUR_ACTIVITY(r4)
+       ld      r6, VCPU_ACTIVITY_START(r4)
+       std     r3, VCPU_CUR_ACTIVITY(r4)
+       mftb    r7
+       subf    r7, r8, r7
+       std     r7, VCPU_ACTIVITY_START(r4)
+       cmpdi   r5, 0
+       beqlr
+       subf    r3, r6, r7
+       ld      r8, TAS_SEQCOUNT(r5)
+       cmpdi   r8, 0
+       addi    r8, r8, 1
+       std     r8, TAS_SEQCOUNT(r5)
+       lwsync
+       ld      r7, TAS_TOTAL(r5)
+       add     r7, r7, r3
+       std     r7, TAS_TOTAL(r5)
+       ld      r6, TAS_MIN(r5)
+       ld      r7, TAS_MAX(r5)
+       beq     3f
+       cmpd    r3, r6
+       bge     1f
+3:     std     r3, TAS_MIN(r5)
+1:     cmpd    r3, r7
+       ble     2f
+       std     r3, TAS_MAX(r5)
+2:     lwsync
+       addi    r8, r8, 1
+       std     r8, TAS_SEQCOUNT(r5)
+       blr
+#endif
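
kvmhv_accumulate_time above protects each accumulator with a hand-rolled seqcount: TAS_SEQCOUNT is bumped (making it odd), the total/min/max are updated between lwsync barriers, and the count is bumped again. A host-side consumer would read the totals roughly like this (a sketch; the struct layout mirrors the TAS_* offsets used in the asm and is otherwise an assumption):

        /* Assumed layout behind the TAS_* asm offsets. */
        struct kvmhv_tb_accumulator {
                u64     seqcount;       /* odd while an update is in progress */
                u64     tb_total;
                u64     tb_min;
                u64     tb_max;
        };

        static u64 read_activity_total(struct kvmhv_tb_accumulator *acc)
        {
                u64 seq, total;

                do {
                        seq = READ_ONCE(acc->seqcount);
                        smp_rmb();              /* pairs with the lwsync in the asm */
                        total = acc->tb_total;
                        smp_rmb();
                } while ((seq & 1) || READ_ONCE(acc->seqcount) != seq);

                return total;
        }
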
index ce3c893..f2c75a1 100644 (file)
@@ -258,6 +258,28 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
        return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+       long rc;
+
+       rc = kvmppc_h_logical_ci_load(vcpu);
+       if (rc == H_TOO_HARD)
+               return EMULATE_FAIL;
+       kvmppc_set_gpr(vcpu, 3, rc);
+       return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+       long rc;
+
+       rc = kvmppc_h_logical_ci_store(vcpu);
+       if (rc == H_TOO_HARD)
+               return EMULATE_FAIL;
+       kvmppc_set_gpr(vcpu, 3, rc);
+       return EMULATE_DONE;
+}
+
 static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 {
        long rc = kvmppc_xics_hcall(vcpu, cmd);
@@ -290,6 +312,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
                vcpu->stat.halt_wakeup++;
                return EMULATE_DONE;
+       case H_LOGICAL_CI_LOAD:
+               return kvmppc_h_pr_logical_ci_load(vcpu);
+       case H_LOGICAL_CI_STORE:
+               return kvmppc_h_pr_logical_ci_store(vcpu);
        case H_XIRR:
        case H_CPPR:
        case H_EOI:
@@ -323,6 +349,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd)
        case H_BULK_REMOVE:
        case H_PUT_TCE:
        case H_CEDE:
+       case H_LOGICAL_CI_LOAD:
+       case H_LOGICAL_CI_STORE:
 #ifdef CONFIG_KVM_XICS
        case H_XIRR:
        case H_CPPR:
index a4a8d9f..8f3e6cc 100644 (file)
@@ -20,6 +20,7 @@
 #include <asm/xics.h>
 #include <asm/debug.h>
 #include <asm/time.h>
+#include <asm/spinlock.h>
 
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
@@ -39,7 +40,7 @@
  * LOCKING
  * =======
  *
- * Each ICS has a mutex protecting the information about the IRQ
+ * Each ICS has a spin lock protecting the information about the IRQ
  * sources and avoiding simultaneous deliveries of the same interrupt.
  *
  * ICP operations are done via a single compare & swap transaction
@@ -109,7 +110,10 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 {
        int i;
 
-       mutex_lock(&ics->lock);
+       unsigned long flags;
+
+       local_irq_save(flags);
+       arch_spin_lock(&ics->lock);
 
        for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
                struct ics_irq_state *state = &ics->irq_state[i];
@@ -120,12 +124,15 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
                XICS_DBG("resend %#x prio %#x\n", state->number,
                              state->priority);
 
-               mutex_unlock(&ics->lock);
+               arch_spin_unlock(&ics->lock);
+               local_irq_restore(flags);
                icp_deliver_irq(xics, icp, state->number);
-               mutex_lock(&ics->lock);
+               local_irq_save(flags);
+               arch_spin_lock(&ics->lock);
        }
 
-       mutex_unlock(&ics->lock);
+       arch_spin_unlock(&ics->lock);
+       local_irq_restore(flags);
 }
 
 static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
@@ -133,8 +140,10 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
                       u32 server, u32 priority, u32 saved_priority)
 {
        bool deliver;
+       unsigned long flags;
 
-       mutex_lock(&ics->lock);
+       local_irq_save(flags);
+       arch_spin_lock(&ics->lock);
 
        state->server = server;
        state->priority = priority;
@@ -145,7 +154,8 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
                deliver = true;
        }
 
-       mutex_unlock(&ics->lock);
+       arch_spin_unlock(&ics->lock);
+       local_irq_restore(flags);
 
        return deliver;
 }
@@ -186,6 +196,7 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
        struct kvmppc_ics *ics;
        struct ics_irq_state *state;
        u16 src;
+       unsigned long flags;
 
        if (!xics)
                return -ENODEV;
@@ -195,10 +206,12 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
                return -EINVAL;
        state = &ics->irq_state[src];
 
-       mutex_lock(&ics->lock);
+       local_irq_save(flags);
+       arch_spin_lock(&ics->lock);
        *server = state->server;
        *priority = state->priority;
-       mutex_unlock(&ics->lock);
+       arch_spin_unlock(&ics->lock);
+       local_irq_restore(flags);
 
        return 0;
 }
@@ -365,6 +378,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
        struct kvmppc_ics *ics;
        u32 reject;
        u16 src;
+       unsigned long flags;
 
        /*
         * This is used both for initial delivery of an interrupt and
@@ -391,7 +405,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
        state = &ics->irq_state[src];
 
        /* Get a lock on the ICS */
-       mutex_lock(&ics->lock);
+       local_irq_save(flags);
+       arch_spin_lock(&ics->lock);
 
        /* Get our server */
        if (!icp || state->server != icp->server_num) {
@@ -434,7 +449,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
         *
         * Note that if successful, the new delivery might have itself
         * rejected an interrupt that was "delivered" before we took the
-        * icp mutex.
+        * ics spin lock.
         *
         * In this case we do the whole sequence all over again for the
         * new guy. We cannot assume that the rejected interrupt is less
@@ -448,7 +463,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
                 * Delivery was successful, did we reject somebody else ?
                 */
                if (reject && reject != XICS_IPI) {
-                       mutex_unlock(&ics->lock);
+                       arch_spin_unlock(&ics->lock);
+                       local_irq_restore(flags);
                        new_irq = reject;
                        goto again;
                }
@@ -468,12 +484,14 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
                 */
                smp_mb();
                if (!icp->state.need_resend) {
-                       mutex_unlock(&ics->lock);
+                       arch_spin_unlock(&ics->lock);
+                       local_irq_restore(flags);
                        goto again;
                }
        }
  out:
-       mutex_unlock(&ics->lock);
+       arch_spin_unlock(&ics->lock);
+       local_irq_restore(flags);
 }
 
 static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
@@ -802,14 +820,22 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
        XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
                 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);
 
-       if (icp->rm_action & XICS_RM_KICK_VCPU)
+       if (icp->rm_action & XICS_RM_KICK_VCPU) {
+               icp->n_rm_kick_vcpu++;
                kvmppc_fast_vcpu_kick(icp->rm_kick_target);
-       if (icp->rm_action & XICS_RM_CHECK_RESEND)
+       }
+       if (icp->rm_action & XICS_RM_CHECK_RESEND) {
+               icp->n_rm_check_resend++;
                icp_check_resend(xics, icp->rm_resend_icp);
-       if (icp->rm_action & XICS_RM_REJECT)
+       }
+       if (icp->rm_action & XICS_RM_REJECT) {
+               icp->n_rm_reject++;
                icp_deliver_irq(xics, icp, icp->rm_reject);
-       if (icp->rm_action & XICS_RM_NOTIFY_EOI)
+       }
+       if (icp->rm_action & XICS_RM_NOTIFY_EOI) {
+               icp->n_rm_notify_eoi++;
                kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq);
+       }
 
        icp->rm_action = 0;
 
@@ -872,10 +898,21 @@ static int xics_debug_show(struct seq_file *m, void *private)
        struct kvm *kvm = xics->kvm;
        struct kvm_vcpu *vcpu;
        int icsid, i;
+       unsigned long flags;
+       unsigned long t_rm_kick_vcpu, t_rm_check_resend;
+       unsigned long t_rm_reject, t_rm_notify_eoi;
+       unsigned long t_reject, t_check_resend;
 
        if (!kvm)
                return 0;
 
+       t_rm_kick_vcpu = 0;
+       t_rm_notify_eoi = 0;
+       t_rm_check_resend = 0;
+       t_rm_reject = 0;
+       t_check_resend = 0;
+       t_reject = 0;
+
        seq_printf(m, "=========\nICP state\n=========\n");
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -890,8 +927,19 @@ static int xics_debug_show(struct seq_file *m, void *private)
                           icp->server_num, state.xisr,
                           state.pending_pri, state.cppr, state.mfrr,
                           state.out_ee, state.need_resend);
+               t_rm_kick_vcpu += icp->n_rm_kick_vcpu;
+               t_rm_notify_eoi += icp->n_rm_notify_eoi;
+               t_rm_check_resend += icp->n_rm_check_resend;
+               t_rm_reject += icp->n_rm_reject;
+               t_check_resend += icp->n_check_resend;
+               t_reject += icp->n_reject;
        }
 
+       seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n",
+                       t_rm_kick_vcpu, t_rm_check_resend,
+                       t_rm_reject, t_rm_notify_eoi);
+       seq_printf(m, "ICP Real Mode totals: check_resend=%lu reject=%lu\n",
+                       t_check_resend, t_reject);
        for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
                struct kvmppc_ics *ics = xics->ics[icsid];
 
@@ -901,7 +949,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
                seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
                           icsid);
 
-               mutex_lock(&ics->lock);
+               local_irq_save(flags);
+               arch_spin_lock(&ics->lock);
 
                for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
                        struct ics_irq_state *irq = &ics->irq_state[i];
@@ -912,7 +961,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
                                   irq->resend, irq->masked_pending);
 
                }
-               mutex_unlock(&ics->lock);
+               arch_spin_unlock(&ics->lock);
+               local_irq_restore(flags);
        }
        return 0;
 }
@@ -965,7 +1015,6 @@ static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
        if (!ics)
                goto out;
 
-       mutex_init(&ics->lock);
        ics->icsid = icsid;
 
        for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
@@ -1107,13 +1156,15 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
        u64 __user *ubufp = (u64 __user *) addr;
        u16 idx;
        u64 val, prio;
+       unsigned long flags;
 
        ics = kvmppc_xics_find_ics(xics, irq, &idx);
        if (!ics)
                return -ENOENT;
 
        irqp = &ics->irq_state[idx];
-       mutex_lock(&ics->lock);
+       local_irq_save(flags);
+       arch_spin_lock(&ics->lock);
        ret = -ENOENT;
        if (irqp->exists) {
                val = irqp->server;
@@ -1129,7 +1180,8 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
                        val |= KVM_XICS_PENDING;
                ret = 0;
        }
-       mutex_unlock(&ics->lock);
+       arch_spin_unlock(&ics->lock);
+       local_irq_restore(flags);
 
        if (!ret && put_user(val, ubufp))
                ret = -EFAULT;
@@ -1146,6 +1198,7 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
        u64 val;
        u8 prio;
        u32 server;
+       unsigned long flags;
 
        if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
                return -ENOENT;
@@ -1166,7 +1219,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
            kvmppc_xics_find_server(xics->kvm, server) == NULL)
                return -EINVAL;
 
-       mutex_lock(&ics->lock);
+       local_irq_save(flags);
+       arch_spin_lock(&ics->lock);
        irqp->server = server;
        irqp->saved_priority = prio;
        if (val & KVM_XICS_MASKED)
@@ -1178,7 +1232,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
        if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE))
                irqp->asserted = 1;
        irqp->exists = 1;
-       mutex_unlock(&ics->lock);
+       arch_spin_unlock(&ics->lock);
+       local_irq_restore(flags);
 
        if (val & KVM_XICS_PENDING)
                icp_deliver_irq(xics, NULL, irqp->number);
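
The recurring change in this file is replacing the per-ICS mutex with an arch_spinlock_t taken with interrupts disabled, so that the real-mode XICS code can take the same lock. Pulled out as helpers purely for illustration (the patch open-codes the pair at each site):

        /* Illustrative only: the lock/unlock pattern used throughout this file. */
        static inline void ics_lock(struct kvmppc_ics *ics, unsigned long *flags)
        {
                local_irq_save(*flags);         /* don't spin with interrupts on */
                arch_spin_lock(&ics->lock);     /* raw lock, also usable from real mode */
        }

        static inline void ics_unlock(struct kvmppc_ics *ics, unsigned long flags)
        {
                arch_spin_unlock(&ics->lock);
                local_irq_restore(flags);
        }
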
index 73f0f27..56ea44f 100644 (file)
@@ -78,13 +78,22 @@ struct kvmppc_icp {
        u32  rm_reject;
        u32  rm_eoied_irq;
 
+       /* Counters for each reason we exited real mode */
+       unsigned long n_rm_kick_vcpu;
+       unsigned long n_rm_check_resend;
+       unsigned long n_rm_reject;
+       unsigned long n_rm_notify_eoi;
+       /* Counters for handling ICP processing in real mode */
+       unsigned long n_check_resend;
+       unsigned long n_reject;
+
        /* Debug stuff for real mode */
        union kvmppc_icp_state rm_dbgstate;
        struct kvm_vcpu *rm_dbgtgt;
 };
 
 struct kvmppc_ics {
-       struct mutex lock;
+       arch_spinlock_t lock;
        u16 icsid;
        struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
 };
@@ -96,6 +105,8 @@ struct kvmppc_xics {
        u32 max_icsid;
        bool real_mode;
        bool real_mode_dbg;
+       u32 err_noics;
+       u32 err_noicp;
        struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
 };
 
index 24bfe40..55a4763 100644 (file)
@@ -529,6 +529,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_PPC_RMA:
                r = 0;
                break;
+       case KVM_CAP_PPC_HWRNG:
+               r = kvmppc_hwrng_present();
+               break;
 #endif
        case KVM_CAP_SYNC_MMU:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
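
With the hunk above, userspace can ask whether the kernel's H_RANDOM handler is backed by a hardware RNG before enabling it for a guest. A minimal sketch of the check (error handling omitted):

        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        /* Returns non-zero if KVM_CAP_PPC_HWRNG is available on this host. */
        static int ppc_hwrng_available(void)
        {
                int kvm_fd = open("/dev/kvm", O_RDWR);
                int ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_HWRNG);

                close(kvm_fd);
                return ret > 0;
        }
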
index 170a034..f7deebd 100644 (file)
@@ -41,6 +41,7 @@ void __spin_yield(arch_spinlock_t *lock)
        plpar_hcall_norets(H_CONFER,
                get_hard_smp_processor_id(holder_cpu), yield_count);
 }
+EXPORT_SYMBOL_GPL(__spin_yield);
 
 /*
  * Waiting for a read lock or a write lock on a rwlock...
index 80db439..6eb808f 100644 (file)
 
 struct powernv_rng {
        void __iomem *regs;
+       void __iomem *regs_real;
        unsigned long mask;
 };
 
 static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
 
 
+int powernv_hwrng_present(void)
+{
+       struct powernv_rng *rng;
+
+       rng = get_cpu_var(powernv_rng);
+       put_cpu_var(rng);
+       return rng != NULL;
+}
+
 static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
 {
        unsigned long parity;
@@ -46,6 +56,17 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
        return val;
 }
 
+int powernv_get_random_real_mode(unsigned long *v)
+{
+       struct powernv_rng *rng;
+
+       rng = raw_cpu_read(powernv_rng);
+
+       *v = rng_whiten(rng, in_rm64(rng->regs_real));
+
+       return 1;
+}
+
 int powernv_get_random_long(unsigned long *v)
 {
        struct powernv_rng *rng;
@@ -80,12 +101,20 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng,
 static __init int rng_create(struct device_node *dn)
 {
        struct powernv_rng *rng;
+       struct resource res;
        unsigned long val;
 
        rng = kzalloc(sizeof(*rng), GFP_KERNEL);
        if (!rng)
                return -ENOMEM;
 
+       if (of_address_to_resource(dn, 0, &res)) {
+               kfree(rng);
+               return -ENXIO;
+       }
+
+       rng->regs_real = (void __iomem *)res.start;
+
        rng->regs = of_iomap(dn, 0);
        if (!rng->regs) {
                kfree(rng);
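
powernv_get_random_real_mode() reads the RNG through regs_real, the raw physical address taken from the device tree, because the ioremap'd rng->regs mapping is unusable with the MMU off. in_rm64() is not shown in this hunk; presumably it is a cache-inhibited doubleword load in the same style as the rm_writeb() removed from the XICS code earlier in this series (an assumption, not the actual helper):

        /* Assumed shape of the real-mode MMIO load used above. */
        static unsigned long in_rm64(void __iomem *paddr)
        {
                unsigned long val;

                /* ldcix: cache-inhibited indexed load, safe in real mode */
                __asm__ __volatile__("sync; ldcix %0,0,%1"
                                     : "=r" (val) : "r" (paddr) : "memory");
                return val;
        }
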
index afa2bd7..8cd8e7b 100644 (file)
@@ -110,7 +110,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 /* upper facilities limit for kvm */
 unsigned long kvm_s390_fac_list_mask[] = {
        0xffe6fffbfcfdfc40UL,
-       0x205c800000000000UL,
+       0x005c800000000000UL,
 };
 
 unsigned long kvm_s390_fac_list_mask_size(void)
index d67206a..629af0f 100644 (file)
@@ -683,8 +683,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
        unsigned long bitmap = 1;
        struct kvm_lapic **dst;
        int i;
-       bool ret = false;
-       bool x2apic_ipi = src && apic_x2apic_mode(src);
+       bool ret, x2apic_ipi;
 
        *r = -1;
 
@@ -696,16 +695,18 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
        if (irq->shorthand)
                return false;
 
+       x2apic_ipi = src && apic_x2apic_mode(src);
        if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
                return false;
 
+       ret = true;
        rcu_read_lock();
        map = rcu_dereference(kvm->arch.apic_map);
 
-       if (!map)
+       if (!map) {
+               ret = false;
                goto out;
-
-       ret = true;
+       }
 
        if (irq->dest_mode == APIC_DEST_PHYSICAL) {
                if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
index 146f295..d43867c 100644 (file)
@@ -4481,9 +4481,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
                pfn = spte_to_pfn(*sptep);
 
                /*
-                * Only EPT supported for now; otherwise, one would need to
-                * find out efficiently whether the guest page tables are
-                * also using huge pages.
+                * We cannot do huge page mapping for indirect shadow pages,
+                * which are found on the last rmap (level = 1) when not using
+                * tdp; such shadow pages are synced with the page table in
+                * the guest, and the guest page table is using 4K page size
+                * mapping if the indirect sp has level = 1.
                 */
                if (sp->role.direct &&
                        !kvm_is_reserved_pfn(pfn) &&
@@ -4504,19 +4506,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
        bool flush = false;
        unsigned long *rmapp;
        unsigned long last_index, index;
-       gfn_t gfn_start, gfn_end;
 
        spin_lock(&kvm->mmu_lock);
 
-       gfn_start = memslot->base_gfn;
-       gfn_end = memslot->base_gfn + memslot->npages - 1;
-
-       if (gfn_start >= gfn_end)
-               goto out;
-
        rmapp = memslot->arch.rmap[0];
-       last_index = gfn_to_index(gfn_end, memslot->base_gfn,
-                                       PT_PAGE_TABLE_LEVEL);
+       last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1,
+                               memslot->base_gfn, PT_PAGE_TABLE_LEVEL);
 
        for (index = 0; index <= last_index; ++index, ++rmapp) {
                if (*rmapp)
@@ -4534,7 +4529,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
        if (flush)
                kvm_flush_remote_tlbs(kvm);
 
-out:
        spin_unlock(&kvm->mmu_lock);
 }
 
index f5e8dce..f7b6168 100644 (file)
@@ -3622,8 +3622,16 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-       unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
-                   KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+       /*
+        * Pass through host's Machine Check Enable value to hw_cr4, which
+        * is in force while we are in guest mode.  Do not let guests control
+        * this bit, even if host CR4.MCE == 0.
+        */
+       unsigned long hw_cr4 =
+               (cr4_read_shadow() & X86_CR4_MCE) |
+               (cr4 & ~X86_CR4_MCE) |
+               (to_vmx(vcpu)->rmode.vm86_active ?
+                KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
        if (cr4 & X86_CR4_VMXE) {
                /*
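
The comment above is the key point: the MCE bit of the CR4 value loaded into
hardware follows the host, while everything else follows what the guest
wrote. A small standalone illustration of the bit arithmetic, using made-up
CR4 values and a placeholder for the KVM_*MODE_VM_CR4_ALWAYS_ON constants:

    #include <stdio.h>

    #define X86_CR4_MCE     (1UL << 6)
    #define ALWAYS_ON_BITS  0x2000UL        /* stand-in for the ALWAYS_ON mask */

    int main(void)
    {
            unsigned long host_cr4  = 0x3406f0UL;   /* host has CR4.MCE set  */
            unsigned long guest_cr4 = 0x002620UL;   /* guest cleared CR4.MCE */
            unsigned long hw_cr4;

            hw_cr4 = (host_cr4 & X86_CR4_MCE) |
                     (guest_cr4 & ~X86_CR4_MCE) |
                     ALWAYS_ON_BITS;

            /* hardware keeps MCE enabled even though the guest turned it off */
            printf("hw_cr4 = %#lx, MCE %s\n", hw_cr4,
                   (hw_cr4 & X86_CR4_MCE) ? "set" : "clear");
            return 0;
    }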
index e1a8126..ed31c31 100644 (file)
@@ -5799,7 +5799,6 @@ int kvm_arch_init(void *opaque)
        kvm_set_mmio_spte_mask();
 
        kvm_x86_ops = ops;
-       kvm_init_msr_list();
 
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
@@ -7253,7 +7252,14 @@ void kvm_arch_hardware_disable(void)
 
 int kvm_arch_hardware_setup(void)
 {
-       return kvm_x86_ops->hardware_setup();
+       int r;
+
+       r = kvm_x86_ops->hardware_setup();
+       if (r != 0)
+               return r;
+
+       kvm_init_msr_list();
+       return 0;
 }
 
 void kvm_arch_hardware_unsetup(void)
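
Deferring kvm_init_msr_list() until ->hardware_setup() has succeeded means
the saved-MSR list is only built once hardware support is known, and is
skipped entirely when setup fails. Userspace sees the result through
KVM_GET_MSR_INDEX_LIST; a hedged sketch of reading it, with error handling
trimmed:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            struct kvm_msr_list probe = { .nmsrs = 0 };
            struct kvm_msr_list *list;
            unsigned int i;

            if (kvm < 0)
                    return 1;

            /* first call fails with E2BIG but reports the required count */
            ioctl(kvm, KVM_GET_MSR_INDEX_LIST, &probe);

            list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
            list->nmsrs = probe.nmsrs;
            if (ioctl(kvm, KVM_GET_MSR_INDEX_LIST, list) == 0)
                    for (i = 0; i < list->nmsrs; i++)
                            printf("MSR %#x\n", list->indices[i]);
            return 0;
    }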
index f574d7b..4b60056 100644 (file)
@@ -813,6 +813,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_MIPS_MSA 112
 #define KVM_CAP_S390_INJECT_IRQ 113
 #define KVM_CAP_S390_IRQ_STATE 114
+#define KVM_CAP_PPC_HWRNG 115
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index d3fc939..9097741 100644 (file)
@@ -89,6 +89,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
 static __read_mostly struct preempt_ops kvm_preempt_ops;
 
 struct dentry *kvm_debugfs_dir;
+EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
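
Exporting kvm_debugfs_dir lets architecture KVM modules (kvm-hv on powerpc,
for instance) hang their own entries off the existing "kvm" debugfs directory
instead of creating a parallel one. A hedged sketch of a module doing so; the
file name and counter are made up:

    #include <linux/debugfs.h>
    #include <linux/kvm_host.h>
    #include <linux/module.h>

    static u32 example_stat;                /* hypothetical statistic */
    static struct dentry *example_dentry;

    static int __init example_init(void)
    {
            /* place "example_stat" under the existing "kvm" debugfs dir */
            example_dentry = debugfs_create_u32("example_stat", 0444,
                                                kvm_debugfs_dir, &example_stat);
            return example_dentry ? 0 : -ENOMEM;
    }

    static void __exit example_exit(void)
    {
            debugfs_remove(example_dentry);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");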