From: Linus Torvalds Date: Sun, 26 Apr 2015 20:23:15 +0000 (-0700) Subject: Merge tag 'powerpc-4.1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/mpe/linux X-Git-Tag: v4.14-rc1~5477 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=63905bba5b0170492777b327ac5e2aaef64989d6;hp=-c;p=platform%2Fkernel%2Flinux-rpi.git Merge tag 'powerpc-4.1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/mpe/linux Pull powerpc fixes from Michael Ellerman: - fix for mm_dec_nr_pmds() from Scott. - fixes for oopses seen with KVM + THP from Aneesh. - build fixes from Aneesh & Shreyas. * tag 'powerpc-4.1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/mpe/linux: powerpc/mm: Fix build error with CONFIG_PPC_TRANSACTIONAL_MEM disabled powerpc/kvm: Fix ppc64_defconfig + PPC_POWERNV=n build error powerpc/mm/thp: Return pte address if we find trans_splitting. powerpc/mm/thp: Make page table walk safe against thp split/collapse KVM: PPC: Remove page table walk helpers KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer powerpc/hugetlb: Call mm_dec_nr_pmds() in hugetlb_free_pmd_range() --- 63905bba5b0170492777b327ac5e2aaef64989d6 diff --combined arch/powerpc/include/asm/kvm_book3s_64.h index 7ae407941be2,ce5610aa124c..3536d12eb798 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@@ -85,20 -85,6 +85,20 @@@ static inline long try_lock_hpte(__be6 return old == 0; } +static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) +{ + hpte_v &= ~HPTE_V_HVLOCK; + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); + hpte[0] = cpu_to_be64(hpte_v); +} + +/* Without barrier */ +static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v) +{ + hpte_v &= ~HPTE_V_HVLOCK; + hpte[0] = cpu_to_be64(hpte_v); +} + static inline int __hpte_actual_psize(unsigned int lp, int psize) { int i, shift; @@@ -295,16 -281,17 +295,17 @@@ static inline int hpte_cache_flags_ok(u /* * If it's present and writable, atomically set dirty and referenced bits and - * 
return the PTE, otherwise return 0. If we find a transparent hugepage - * and if it is marked splitting we return 0; + * return the PTE, otherwise return 0. */ - static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing, - unsigned int hugepage) + static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing) { pte_t old_pte, new_pte = __pte(0); while (1) { - old_pte = *ptep; + /* + * Make sure we don't reload from ptep + */ + old_pte = READ_ONCE(*ptep); /* * wait until _PAGE_BUSY is clear then set it atomically */ @@@ -312,12 -299,6 +313,6 @@@ cpu_relax(); continue; } - #ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* If hugepage and is trans splitting return None */ - if (unlikely(hugepage && - pmd_trans_splitting(pte_pmd(old_pte)))) - return __pte(0); - #endif /* If pte is not present return None */ if (unlikely(!(pte_val(old_pte) & _PAGE_PRESENT))) return __pte(0); @@@ -438,10 -419,6 +433,10 @@@ static inline struct kvm_memslots *kvm_ return rcu_dereference_raw_notrace(kvm->memslots); } +extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); + +extern void kvmhv_rm_send_ipi(int cpu); + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --combined arch/powerpc/kvm/Kconfig index 2963e4dd0b80,b3b3d9f62a93..3caec2c42105 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@@ -75,7 -75,7 +75,7 @@@ config KVM_BOOK3S_6 config KVM_BOOK3S_64_HV tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" - depends on KVM_BOOK3S_64 + depends on KVM_BOOK3S_64 && PPC_POWERNV select KVM_BOOK3S_HV_POSSIBLE select MMU_NOTIFIER select CMA @@@ -110,20 -110,6 +110,20 @@@ config KVM_BOOK3S_64_P processor, including emulating 32-bit processors on a 64-bit host. 
+config KVM_BOOK3S_HV_EXIT_TIMING + bool "Detailed timing for hypervisor real-mode code" + depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS + ---help--- + Calculate time taken for each vcpu in the real-mode guest entry, + exit, and interrupt handling code, plus time spent in the guest + and in nap mode due to idle (cede) while other threads are still + in the guest. The total, minimum and maximum times in nanoseconds + together with the number of executions are reported in debugfs in + kvm/vm#/vcpu#/timings. The overhead is of the order of 30 - 40 + ns per exit on POWER8. + + If unsure, say N. + config KVM_BOOKE_HV bool diff --combined arch/powerpc/kvm/book3s_64_mmu_hv.c index d6fe30835c58,0fe9c92e78ed..1a4acf8bf4f4 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@@ -27,7 -27,6 +27,7 @@@ #include #include #include +#include #include #include @@@ -117,12 -116,12 +117,12 @@@ long kvmppc_alloc_reset_hpt(struct kvm long order; mutex_lock(&kvm->lock); - if (kvm->arch.rma_setup_done) { - kvm->arch.rma_setup_done = 0; - /* order rma_setup_done vs. vcpus_running */ + if (kvm->arch.hpte_setup_done) { + kvm->arch.hpte_setup_done = 0; + /* order hpte_setup_done vs. 
vcpus_running */ smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { - kvm->arch.rma_setup_done = 1; + kvm->arch.hpte_setup_done = 1; goto out; } } @@@ -339,7 -338,9 +339,7 @@@ static int kvmppc_mmu_book3s_64_hv_xlat v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; gr = kvm->arch.revmap[index].guest_rpte; - /* Unlock the HPTE */ - asm volatile("lwsync" : : : "memory"); - hptep[0] = cpu_to_be64(v); + unlock_hpte(hptep, v); preempt_enable(); gpte->eaddr = eaddr; @@@ -468,7 -469,8 +468,7 @@@ int kvmppc_book3s_hv_page_fault(struct hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; hpte[1] = be64_to_cpu(hptep[1]); hpte[2] = r = rev->guest_rpte; - asm volatile("lwsync" : : : "memory"); - hptep[0] = cpu_to_be64(hpte[0]); + unlock_hpte(hptep, hpte[0]); preempt_enable(); if (hpte[0] != vcpu->arch.pgfault_hpte[0] || @@@ -535,23 -537,21 +535,21 @@@ } /* if the guest wants write access, see if that is OK */ if (!writing && hpte_is_writable(r)) { - unsigned int hugepage_shift; pte_t *ptep, pte; - + unsigned long flags; /* * We need to protect against page table destruction - * while looking up and updating the pte. + * hugepage split and collapse. 
*/ - rcu_read_lock_sched(); + local_irq_save(flags); ptep = find_linux_pte_or_hugepte(current->mm->pgd, - hva, &hugepage_shift); + hva, NULL); if (ptep) { - pte = kvmppc_read_update_linux_pte(ptep, 1, - hugepage_shift); + pte = kvmppc_read_update_linux_pte(ptep, 1); if (pte_write(pte)) write_ok = 1; } - rcu_read_unlock_sched(); + local_irq_restore(flags); } } @@@ -619,7 -619,7 +617,7 @@@ hptep[1] = cpu_to_be64(r); eieio(); - hptep[0] = cpu_to_be64(hpte[0]); + __unlock_hpte(hptep, hpte[0]); asm volatile("ptesync" : : : "memory"); preempt_enable(); if (page && hpte_is_writable(r)) @@@ -640,7 -640,7 +638,7 @@@ return ret; out_unlock: - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); preempt_enable(); goto out_put; } @@@ -769,7 -769,7 +767,7 @@@ static int kvm_unmap_rmapp(struct kvm * } } unlock_rmap(rmapp); - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } return 0; } @@@ -855,7 -855,7 +853,7 @@@ static int kvm_age_rmapp(struct kvm *kv } ret = 1; } - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } while ((i = j) != head); unlock_rmap(rmapp); @@@ -972,7 -972,8 +970,7 @@@ static int kvm_test_clear_dirty_npages( /* Now check and modify the HPTE */ if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) { - /* unlock and continue */ - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); continue; } @@@ -993,9 -994,9 +991,9 @@@ npages_dirty = n; eieio(); } - v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); + v &= ~HPTE_V_ABSENT; v |= HPTE_V_VALID; - hptep[0] = cpu_to_be64(v); + __unlock_hpte(hptep, v); } while ((i = j) != head); unlock_rmap(rmapp); @@@ -1215,7 -1216,8 +1213,7 @@@ static long record_hpte(unsigned long f r &= ~HPTE_GR_MODIFIED; revp->guest_rpte = r; } - asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); - hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + unlock_hpte(hptp, be64_to_cpu(hptp[0])); preempt_enable(); if (!(valid == 
want_valid && (first_pass || dirty))) ok = 0; @@@ -1335,20 -1337,20 +1333,20 @@@ static ssize_t kvm_htab_write(struct fi unsigned long tmp[2]; ssize_t nb; long int err, ret; - int rma_setup; + int hpte_setup; if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; /* lock out vcpus from running while we're doing this */ mutex_lock(&kvm->lock); - rma_setup = kvm->arch.rma_setup_done; - if (rma_setup) { - kvm->arch.rma_setup_done = 0; /* temporarily */ - /* order rma_setup_done vs. vcpus_running */ + hpte_setup = kvm->arch.hpte_setup_done; + if (hpte_setup) { + kvm->arch.hpte_setup_done = 0; /* temporarily */ + /* order hpte_setup_done vs. vcpus_running */ smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { - kvm->arch.rma_setup_done = 1; + kvm->arch.hpte_setup_done = 1; mutex_unlock(&kvm->lock); return -EBUSY; } @@@ -1401,7 -1403,7 +1399,7 @@@ "r=%lx\n", ret, i, v, r); goto out; } - if (!rma_setup && is_vrma_hpte(v)) { + if (!hpte_setup && is_vrma_hpte(v)) { unsigned long psize = hpte_base_page_size(v, r); unsigned long senc = slb_pgsize_encoding(psize); unsigned long lpcr; @@@ -1410,7 -1412,7 +1408,7 @@@ (VRMA_VSID << SLB_VSID_SHIFT_1T); lpcr = senc << (LPCR_VRMASD_SH - 4); kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); - rma_setup = 1; + hpte_setup = 1; } ++i; hptp += 2; @@@ -1426,9 -1428,9 +1424,9 @@@ } out: - /* Order HPTE updates vs. rma_setup_done */ + /* Order HPTE updates vs. 
hpte_setup_done */ smp_wmb(); - kvm->arch.rma_setup_done = rma_setup; + kvm->arch.hpte_setup_done = hpte_setup; mutex_unlock(&kvm->lock); if (err) @@@ -1491,141 -1493,6 +1489,141 @@@ int kvm_vm_ioctl_get_htab_fd(struct kv return ret; } +struct debugfs_htab_state { + struct kvm *kvm; + struct mutex mutex; + unsigned long hpt_index; + int chars_left; + int buf_index; + char buf[64]; +}; + +static int debugfs_htab_open(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + struct debugfs_htab_state *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + kvm_get_kvm(kvm); + p->kvm = kvm; + mutex_init(&p->mutex); + file->private_data = p; + + return nonseekable_open(inode, file); +} + +static int debugfs_htab_release(struct inode *inode, struct file *file) +{ + struct debugfs_htab_state *p = file->private_data; + + kvm_put_kvm(p->kvm); + kfree(p); + return 0; +} + +static ssize_t debugfs_htab_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct debugfs_htab_state *p = file->private_data; + ssize_t ret, r; + unsigned long i, n; + unsigned long v, hr, gr; + struct kvm *kvm; + __be64 *hptp; + + ret = mutex_lock_interruptible(&p->mutex); + if (ret) + return ret; + + if (p->chars_left) { + n = p->chars_left; + if (n > len) + n = len; + r = copy_to_user(buf, p->buf + p->buf_index, n); + n -= r; + p->chars_left -= n; + p->buf_index += n; + buf += n; + len -= n; + ret = n; + if (r) { + if (!n) + ret = -EFAULT; + goto out; + } + } + + kvm = p->kvm; + i = p->hpt_index; + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) { + if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) + continue; + + /* lock the HPTE so it's stable and read it */ + preempt_disable(); + while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) + cpu_relax(); + v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; + hr = be64_to_cpu(hptp[1]); + gr = 
kvm->arch.revmap[i].guest_rpte; + unlock_hpte(hptp, v); + preempt_enable(); + + if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) + continue; + + n = scnprintf(p->buf, sizeof(p->buf), + "%6lx %.16lx %.16lx %.16lx\n", + i, v, hr, gr); + p->chars_left = n; + if (n > len) + n = len; + r = copy_to_user(buf, p->buf, n); + n -= r; + p->chars_left -= n; + p->buf_index = n; + buf += n; + len -= n; + ret += n; + if (r) { + if (!ret) + ret = -EFAULT; + goto out; + } + } + p->hpt_index = i; + + out: + mutex_unlock(&p->mutex); + return ret; +} + +ssize_t debugfs_htab_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return -EACCES; +} + +static const struct file_operations debugfs_htab_fops = { + .owner = THIS_MODULE, + .open = debugfs_htab_open, + .release = debugfs_htab_release, + .read = debugfs_htab_read, + .write = debugfs_htab_write, + .llseek = generic_file_llseek, +}; + +void kvmppc_mmu_debugfs_init(struct kvm *kvm) +{ + kvm->arch.htab_dentry = debugfs_create_file("htab", 0400, + kvm->arch.debugfs_dir, kvm, + &debugfs_htab_fops); +} + void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) { struct kvmppc_mmu *mmu = &vcpu->arch.mmu; diff --combined arch/powerpc/kvm/book3s_hv_rm_mmu.c index f6bf0b1de6d7,d839f08cb903..b027a89737b6 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@@ -26,11 -26,14 +26,14 @@@ static void *real_vmalloc_addr(void *x { unsigned long addr = (unsigned long) x; pte_t *p; - - p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL); + /* + * assume we don't have huge pages in vmalloc space... + * So don't worry about THP collapse/split. Called + * Only in realmode, hence won't need irq_save/restore. + */ + p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL); if (!p || !pte_present(*p)) return NULL; - /* assume we don't have huge pages in vmalloc space... 
*/ addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); return __va(addr); } @@@ -131,25 -134,12 +134,6 @@@ static void remove_revmap_chain(struct unlock_rmap(rmap); } - static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva, - int writing, unsigned long *pte_sizep) -static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) --{ - pte_t *ptep; - unsigned long ps = *pte_sizep; - unsigned int hugepage_shift; - - ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift); - if (!ptep) - return __pte(0); - if (hugepage_shift) - *pte_sizep = 1ul << hugepage_shift; - else - *pte_sizep = PAGE_SIZE; - if (ps > *pte_sizep) - return __pte(0); - return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); - asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); - hpte[0] = cpu_to_be64(hpte_v); --} -- long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret) @@@ -160,13 -150,13 +144,13 @@@ struct revmap_entry *rev; unsigned long g_ptel; struct kvm_memory_slot *memslot; - unsigned long pte_size; + unsigned hpage_shift; unsigned long is_io; unsigned long *rmap; - pte_t pte; + pte_t *ptep; unsigned int writing; unsigned long mmu_seq; - unsigned long rcbits; + unsigned long rcbits, irq_flags = 0; psize = hpte_page_size(pteh, ptel); if (!psize) @@@ -202,22 -192,46 +186,46 @@@ /* Translate to host virtual address */ hva = __gfn_to_hva_memslot(memslot, gfn); - - /* Look up the Linux PTE for the backing page */ - pte_size = psize; - pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size); - if (pte_present(pte) && !pte_protnone(pte)) { - if (writing && !pte_write(pte)) - /* make the actual HPTE be read-only */ - ptel = hpte_make_readonly(ptel); - is_io = hpte_cache_bits(pte_val(pte)); - pa = pte_pfn(pte) << PAGE_SHIFT; - pa |= hva & (pte_size - 1); - pa |= gpa & ~PAGE_MASK; + /* + * If we had a page table change 
after lookup, we would + * retry via mmu_notifier_retry. + */ + if (realmode) + ptep = __find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift); + else { + local_irq_save(irq_flags); + ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift); } + if (ptep) { + pte_t pte; + unsigned int host_pte_size; - if (pte_size < psize) - return H_PARAMETER; + if (hpage_shift) + host_pte_size = 1ul << hpage_shift; + else + host_pte_size = PAGE_SIZE; + /* + * We should always find the guest page size + * to <= host page size, if host is using hugepage + */ + if (host_pte_size < psize) { + if (!realmode) + local_irq_restore(irq_flags); + return H_PARAMETER; + } + pte = kvmppc_read_update_linux_pte(ptep, writing); + if (pte_present(pte) && !pte_protnone(pte)) { + if (writing && !pte_write(pte)) + /* make the actual HPTE be read-only */ + ptel = hpte_make_readonly(ptel); + is_io = hpte_cache_bits(pte_val(pte)); + pa = pte_pfn(pte) << PAGE_SHIFT; + pa |= hva & (host_pte_size - 1); + pa |= gpa & ~PAGE_MASK; + } + } + if (!realmode) + local_irq_restore(irq_flags); ptel &= ~(HPTE_R_PP0 - psize); ptel |= pa; @@@ -265,10 -279,10 +273,10 @@@ u64 pte; while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - pte = be64_to_cpu(*hpte); + pte = be64_to_cpu(hpte[0]); if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT))) break; - *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hpte, pte); hpte += 2; } if (i == 8) @@@ -284,9 -298,9 +292,9 @@@ while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - pte = be64_to_cpu(*hpte); + pte = be64_to_cpu(hpte[0]); if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) { - *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hpte, pte); return H_PTEG_FULL; } } @@@ -325,7 -339,7 +333,7 @@@ /* Write the first HPTE dword, unlocking the HPTE and making it valid */ eieio(); - hpte[0] = cpu_to_be64(pteh); + __unlock_hpte(hpte, pteh); asm volatile("ptesync" : : : "memory"); *pte_idx_ret = pte_index; @@@ -406,7 -420,7 +414,7 @@@ long kvmppc_do_h_remove(struct kvm *kvm if ((pte & 
(HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) || ((flags & H_ANDCOND) && (pte & avpn) != 0)) { - hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hpte, pte); return H_NOT_FOUND; } @@@ -542,7 -556,7 +550,7 @@@ long kvmppc_h_bulk_remove(struct kvm_vc be64_to_cpu(hp[0]), be64_to_cpu(hp[1])); rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); args[j] |= rcbits << (56 - 5); - hp[0] = 0; + __unlock_hpte(hp, 0); } } @@@ -568,7 -582,7 +576,7 @@@ long kvmppc_h_protect(struct kvm_vcpu * pte = be64_to_cpu(hpte[0]); if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) { - hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hpte, pte); return H_NOT_FOUND; } @@@ -749,7 -763,8 +757,7 @@@ long kvmppc_hv_find_lock_hpte(struct kv /* Return with the HPTE still locked */ return (hash << 3) + (i >> 1); - /* Unlock and move on */ - hpte[i] = cpu_to_be64(v); + __unlock_hpte(&hpte[i], v); } if (val & HPTE_V_SECONDARY)