Merge tag 'powerpc-4.1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/mpe/linux
author    Linus Torvalds <torvalds@linux-foundation.org>
          Sun, 26 Apr 2015 20:23:15 +0000 (13:23 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Sun, 26 Apr 2015 20:23:15 +0000 (13:23 -0700)
Pull powerpc fixes from Michael Ellerman:

 - fix for mm_dec_nr_pmds() from Scott.

 - fixes for oopses seen with KVM + THP from Aneesh.

 - build fixes from Aneesh & Shreyas.

* tag 'powerpc-4.1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/mpe/linux:
  powerpc/mm: Fix build error with CONFIG_PPC_TRANSACTIONAL_MEM disabled
  powerpc/kvm: Fix ppc64_defconfig + PPC_POWERNV=n build error
  powerpc/mm/thp: Return pte address if we find trans_splitting.
  powerpc/mm/thp: Make page table walk safe against thp split/collapse
  KVM: PPC: Remove page table walk helpers
  KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer
  powerpc/hugetlb: Call mm_dec_nr_pmds() in hugetlb_free_pmd_range()

arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/pgtable.h
arch/powerpc/kernel/eeh.c
arch/powerpc/kernel/io-workarounds.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/kvm/e500_mmu_host.c
arch/powerpc/mm/hash_utils_64.c
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/perf/callchain.c

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 7ae4079..3536d12 100644
@@ -295,16 +295,17 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
 
 /*
  * If it's present and writable, atomically set dirty and referenced bits and
- * return the PTE, otherwise return 0. If we find a transparent hugepage
- * and if it is marked splitting we return 0;
+ * return the PTE, otherwise return 0.
  */
-static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
-                                                unsigned int hugepage)
+static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing)
 {
        pte_t old_pte, new_pte = __pte(0);
 
        while (1) {
-               old_pte = *ptep;
+               /*
+                * Make sure the compiler doesn't reload from ptep
+                */
+               old_pte = READ_ONCE(*ptep);
                /*
                 * wait until _PAGE_BUSY is clear then set it atomically
                 */
@@ -312,12 +313,6 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
                        cpu_relax();
                        continue;
                }
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-               /* If hugepage and is trans splitting return None */
-               if (unlikely(hugepage &&
-                            pmd_trans_splitting(pte_pmd(old_pte))))
-                       return __pte(0);
-#endif
                /* If pte is not present return None */
                if (unlikely(!(pte_val(old_pte) & _PAGE_PRESENT)))
                        return __pte(0);
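
The switch from a plain dereference to READ_ONCE() matters because the
compiler may otherwise re-load *ptep between the _PAGE_BUSY test and the
later atomic update, so the two could observe different PTE values. A
minimal sketch of the snapshot-then-retry idiom the loop above relies on
(the function name is hypothetical; READ_ONCE, cpu_relax, cmpxchg and
_PAGE_BUSY are the kernel primitives used in the hunk):

/* Sketch: update a PTE-sized word other CPUs may change underneath us. */
static unsigned long set_bits_once(unsigned long *p, unsigned long bits)
{
	unsigned long old;

	for (;;) {
		old = READ_ONCE(*p);	/* one load; no compiler re-read */
		if (old & _PAGE_BUSY) {	/* another CPU holds the busy bit */
			cpu_relax();
			continue;
		}
		/* publish old|bits only if nobody changed *p meanwhile */
		if (cmpxchg(p, old, old | bits) == old)
			return old | bits;
	}
}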
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 9835ac4..11a3863 100644
@@ -247,28 +247,16 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 #define pmd_large(pmd)         0
 #define has_transparent_hugepage() 0
 #endif
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
                                 unsigned *shift);
-
-static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva,
-                                    unsigned long *pte_sizep)
+static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+                                              unsigned *shift)
 {
-       pte_t *ptep;
-       unsigned long ps = *pte_sizep;
-       unsigned int shift;
-
-       ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
-       if (!ptep)
-               return NULL;
-       if (shift)
-               *pte_sizep = 1ul << shift;
-       else
-               *pte_sizep = PAGE_SIZE;
-
-       if (ps > *pte_sizep)
-               return NULL;
-
-       return ptep;
+       if (!arch_irqs_disabled()) {
+               pr_info("%s called with irq enabled\n", __func__);
+               dump_stack();
+       }
+       return __find_linux_pte_or_hugepte(pgdir, ea, shift);
 }
 #endif /* __ASSEMBLY__ */
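
The renamed wrapper documents the new contract: both the walk and every
dereference of the returned pte_t pointer must happen with interrupts
disabled, since re-enabling interrupts ends the protection against a
concurrent THP collapse freeing the page table page. A hedged sketch of the
expected caller-side pattern (the function is hypothetical; the helpers are
the ones declared above):

/* Sketch: irq-protected page table walk from process context. */
static int example_pte_is_present(struct mm_struct *mm, unsigned long ea)
{
	unsigned long flags;
	unsigned int shift;
	pte_t *ptep, pte;
	int ret = 0;

	local_irq_save(flags);
	ptep = find_linux_pte_or_hugepte(mm->pgd, ea, &shift);
	if (ptep) {
		pte = READ_ONCE(*ptep);	/* snapshot while still protected */
		ret = pte_present(pte);
	}
	local_irq_restore(flags);	/* ptep must not be used past here */
	return ret;
}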
 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index a4c62eb..44b480e 100644
@@ -334,9 +334,11 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
        int hugepage_shift;
 
        /*
-        * We won't find hugepages here, iomem
+        * We won't find hugepages here (this is iomem), so we are not
+        * worried about _PAGE_SPLITTING/collapse. Nor will we hit a
+        * page table free, because this is init_mm.
         */
-       ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
+       ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
        if (!ptep)
                return token;
        WARN_ON(hugepage_shift);
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index 24b968f..63d9cc4 100644
@@ -71,15 +71,15 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
                vaddr = (unsigned long)PCI_FIX_ADDR(addr);
                if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
                        return NULL;
-
-               ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
+               /*
+                * We won't find huge pages here (iomem). We also can't
+                * hit a page table free, because this is init_mm.
+                */
+               ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
                                                 &hugepage_shift);
                if (ptep == NULL)
                        paddr = 0;
                else {
-                       /*
-                        * we don't have hugepages backing iomem
-                        */
                        WARN_ON(hugepage_shift);
                        paddr = pte_pfn(*ptep) << PAGE_SHIFT;
                }
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 2963e4d..3caec2c 100644
@@ -75,7 +75,7 @@ config KVM_BOOK3S_64
 
 config KVM_BOOK3S_64_HV
        tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host"
-       depends on KVM_BOOK3S_64
+       depends on KVM_BOOK3S_64 && PPC_POWERNV
        select KVM_BOOK3S_HV_POSSIBLE
        select MMU_NOTIFIER
        select CMA
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d6fe308..1a4acf8 100644
@@ -535,23 +535,21 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                }
                /* if the guest wants write access, see if that is OK */
                if (!writing && hpte_is_writable(r)) {
-                       unsigned int hugepage_shift;
                        pte_t *ptep, pte;
-
+                       unsigned long flags;
                        /*
                         * We need to protect against page table destruction
-                        * while looking up and updating the pte.
+                        * as well as hugepage split and collapse.
                         */
-                       rcu_read_lock_sched();
+                       local_irq_save(flags);
                        ptep = find_linux_pte_or_hugepte(current->mm->pgd,
-                                                        hva, &hugepage_shift);
+                                                        hva, NULL);
                        if (ptep) {
-                               pte = kvmppc_read_update_linux_pte(ptep, 1,
-                                                          hugepage_shift);
+                               pte = kvmppc_read_update_linux_pte(ptep, 1);
                                if (pte_write(pte))
                                        write_ok = 1;
                        }
-                       rcu_read_unlock_sched();
+                       local_irq_restore(flags);
                }
        }
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index f6bf0b1..b027a89 100644
@@ -26,11 +26,14 @@ static void *real_vmalloc_addr(void *x)
 {
        unsigned long addr = (unsigned long) x;
        pte_t *p;
-
-       p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
+       /*
+        * Assume we don't have huge pages in vmalloc space, so we
+        * need not worry about THP collapse/split. This is called
+        * only in real mode, hence no irq_save/restore is needed.
+        */
+       p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
        if (!p || !pte_present(*p))
                return NULL;
-       /* assume we don't have huge pages in vmalloc space... */
        addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
        return __va(addr);
 }
@@ -131,25 +134,6 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
        unlock_rmap(rmap);
 }
 
-static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
-                             int writing, unsigned long *pte_sizep)
-{
-       pte_t *ptep;
-       unsigned long ps = *pte_sizep;
-       unsigned int hugepage_shift;
-
-       ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
-       if (!ptep)
-               return __pte(0);
-       if (hugepage_shift)
-               *pte_sizep = 1ul << hugepage_shift;
-       else
-               *pte_sizep = PAGE_SIZE;
-       if (ps > *pte_sizep)
-               return __pte(0);
-       return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
-}
-
 long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
                       long pte_index, unsigned long pteh, unsigned long ptel,
                       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
@@ -160,13 +144,13 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
        struct revmap_entry *rev;
        unsigned long g_ptel;
        struct kvm_memory_slot *memslot;
-       unsigned long pte_size;
+       unsigned hpage_shift;
        unsigned long is_io;
        unsigned long *rmap;
-       pte_t pte;
+       pte_t *ptep;
        unsigned int writing;
        unsigned long mmu_seq;
-       unsigned long rcbits;
+       unsigned long rcbits, irq_flags = 0;
 
        psize = hpte_page_size(pteh, ptel);
        if (!psize)
@@ -202,22 +186,46 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
        /* Translate to host virtual address */
        hva = __gfn_to_hva_memslot(memslot, gfn);
-
-       /* Look up the Linux PTE for the backing page */
-       pte_size = psize;
-       pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size);
-       if (pte_present(pte) && !pte_protnone(pte)) {
-               if (writing && !pte_write(pte))
-                       /* make the actual HPTE be read-only */
-                       ptel = hpte_make_readonly(ptel);
-               is_io = hpte_cache_bits(pte_val(pte));
-               pa = pte_pfn(pte) << PAGE_SHIFT;
-               pa |= hva & (pte_size - 1);
-               pa |= gpa & ~PAGE_MASK;
+       /*
+        * If we had a page table change after the lookup, we would
+        * retry via mmu_notifier_retry.
+        */
+       if (realmode)
+               ptep = __find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+       else {
+               local_irq_save(irq_flags);
+               ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
        }
+       if (ptep) {
+               pte_t pte;
+               unsigned int host_pte_size;
 
-       if (pte_size < psize)
-               return H_PARAMETER;
+               if (hpage_shift)
+                       host_pte_size = 1ul << hpage_shift;
+               else
+                       host_pte_size = PAGE_SIZE;
+               /*
+                * The guest page size should always be <= the host
+                * page size if the host is using hugepages.
+                */
+               if (host_pte_size < psize) {
+                       if (!realmode)
+                               local_irq_restore(irq_flags);
+                       return H_PARAMETER;
+               }
+               pte = kvmppc_read_update_linux_pte(ptep, writing);
+               if (pte_present(pte) && !pte_protnone(pte)) {
+                       if (writing && !pte_write(pte))
+                               /* make the actual HPTE be read-only */
+                               ptel = hpte_make_readonly(ptel);
+                       is_io = hpte_cache_bits(pte_val(pte));
+                       pa = pte_pfn(pte) << PAGE_SHIFT;
+                       pa |= hva & (host_pte_size - 1);
+                       pa |= gpa & ~PAGE_MASK;
+               }
+       }
+       if (!realmode)
+               local_irq_restore(irq_flags);
 
        ptel &= ~(HPTE_R_PP0 - psize);
        ptel |= pa;
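
The rewritten lookup keeps two entry conditions apart: in real mode MSR[EE]
is already 0, so the unchecked __ variant is used directly, while the
virtual-mode path brackets the walk with local_irq_save()/local_irq_restore().
A condensed, hypothetical sketch of that control flow; the saved irq state
lives in its own irq_flags variable so it cannot be confused with the hcall's
flags argument, and every exit path restores it:

/* Sketch: choose the walk variant by mode, restore irqs on every exit. */
static long example_h_enter_walk(pgd_t *pgdir, unsigned long hva, bool realmode)
{
	unsigned long irq_flags = 0;
	unsigned int shift = 0;
	pte_t *ptep;
	long ret = H_PARAMETER;

	if (realmode)		/* MSR[EE] = 0 already; nothing to save */
		ptep = __find_linux_pte_or_hugepte(pgdir, hva, &shift);
	else {
		local_irq_save(irq_flags);
		ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
	}
	if (ptep)
		ret = H_SUCCESS; /* the real code reads and validates the PTE here */
	if (!realmode)
		local_irq_restore(irq_flags);
	return ret;
}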
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index cc536d4..4d33e19 100644
@@ -338,6 +338,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
        pte_t *ptep;
        unsigned int wimg = 0;
        pgd_t *pgdir;
+       unsigned long flags;
 
        /* used to check for invalidations in progress */
        mmu_seq = kvm->mmu_notifier_seq;
@@ -468,15 +469,28 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 
        pgdir = vcpu_e500->vcpu.arch.pgdir;
-       ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages);
-       if (pte_present(*ptep))
-               wimg = (*ptep >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
-       else {
-               if (printk_ratelimit())
-                       pr_err("%s: pte not present: gfn %lx, pfn %lx\n",
-                               __func__, (long)gfn, pfn);
-               ret = -EINVAL;
-               goto out;
+       /*
+        * We are just looking at the wimg bits, so we don't
+        * care much about the trans splitting bit.
+        * We are holding kvm->mmu_lock, so a notifier invalidate
+        * can't run and hence the pfn won't change.
+        */
+       local_irq_save(flags);
+       ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL);
+       if (ptep) {
+               pte_t pte = READ_ONCE(*ptep);
+
+               if (pte_present(pte)) {
+                       wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
+                               MAS2_WIMGE_MASK;
+                       local_irq_restore(flags);
+               } else {
+                       local_irq_restore(flags);
+                       pr_err_ratelimited("%s: pte not present: gfn %lx, pfn %lx\n",
+                                          __func__, (long)gfn, pfn);
+                       ret = -EINVAL;
+                       goto out;
+               }
        }
        kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
 
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 2c2022d..fda236f 100644
@@ -1066,7 +1066,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 #endif /* CONFIG_PPC_64K_PAGES */
 
        /* Get PTE and page size from page tables */
-       ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
+       ptep = __find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
        if (ptep == NULL || !pte_present(*ptep)) {
                DBG_LOW(" no PTE !\n");
                rc = 1;
@@ -1394,6 +1394,7 @@ tm_abort:
                tm_abort(TM_CAUSE_TLBI);
        }
 #endif
+       return;
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index fa9d5c2..0ce968b 100644
@@ -109,7 +109,7 @@ int pgd_huge(pgd_t pgd)
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
        /* Only called for hugetlbfs pages, hence can ignore THP */
-       return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+       return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
 }
 
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
@@ -581,6 +581,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, start);
+       mm_dec_nr_pmds(tlb->mm);
 }
 
 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
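
The one-line addition balances the PMD accounting: __pmd_alloc() increments
mm->nr_pmds via mm_inc_nr_pmds(), and hugetlb_free_pmd_range() frees the PMD
page directly rather than through the generic free_pmd_range() that would
normally decrement it; left unpaired, the counter reports a leak at mm
teardown. A hedged sketch of the invariant (the function name is
hypothetical):

/*
 * Sketch: any path that frees a PMD page directly must also drop the
 * nr_pmds count taken by __pmd_alloc(), or checks of mm_nr_pmds(mm)
 * at mm teardown will report a leak.
 */
static void example_free_pmd_page(struct mmu_gather *tlb, pud_t *pud,
				  unsigned long addr)
{
	pmd_t *pmd = pmd_offset(pud, addr);

	pud_clear(pud);			/* unhook the PMD page from the PUD */
	pmd_free_tlb(tlb, pmd, addr);	/* queue the page for freeing */
	mm_dec_nr_pmds(tlb->mm);	/* pair with mm_inc_nr_pmds() */
}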
@@ -681,28 +682,35 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
        } while (addr = next, addr != end);
 }
 
+/*
+ * We are holding mmap_sem, so a parallel huge page collapse cannot run.
+ * To guard against a parallel hugepage split, we disable interrupts.
+ */
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
        pte_t *ptep;
        struct page *page;
        unsigned shift;
-       unsigned long mask;
+       unsigned long mask, flags;
        /*
         * Transparent hugepages are handled by generic code. We can skip them
         * here.
         */
+       local_irq_save(flags);
        ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 
        /* Verify it is a huge page else bail. */
-       if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
+       if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) {
+               local_irq_restore(flags);
                return ERR_PTR(-EINVAL);
-
+       }
        mask = (1UL << shift) - 1;
        page = pte_page(*ptep);
        if (page)
                page += (address & mask) / PAGE_SIZE;
 
+       local_irq_restore(flags);
        return page;
 }
 
@@ -949,9 +957,12 @@ void flush_dcache_icache_hugepage(struct page *page)
  *
 * So long as we atomically load page table pointers we are safe against teardown,
 * and we can follow the address down to the page and take a ref on it.
+ * This function needs to be called with interrupts disabled. We use this variant
+ * when we have MSR[EE] = 0 but paca->soft_enabled = 1.
  */
 
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+                                  unsigned *shift)
 {
        pgd_t pgd, *pgdp;
        pud_t pud, *pudp;
@@ -1003,12 +1014,11 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
                         * A hugepage collapse is captured by pmd_none, because
                         * it marks the pmd none and does an hpte invalidate.
                         *
-                        * A hugepage split is captured by pmd_trans_splitting
-                        * because we mark the pmd trans splitting and do a
-                        * hpte invalidate
-                        *
+                        * We don't worry about pmd_trans_splitting here; a
+                        * caller that needs to handle the splitting case
+                        * should check for it itself.
                         */
-                       if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+                       if (pmd_none(pmd))
                                return NULL;
 
                        if (pmd_huge(pmd) || pmd_large(pmd)) {
@@ -1030,7 +1040,7 @@ out:
                *shift = pdshift;
        return ret_pte;
 }
-EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
+EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);
 
 int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index ead5535..ff09cde 100644
@@ -111,41 +111,45 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
  * interrupt context, so if the access faults, we read the page tables
  * to find which page (if any) is mapped and access it directly.
  */
-static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
+static int read_user_stack_slow(void __user *ptr, void *buf, int nb)
 {
+       int ret = -EFAULT;
        pgd_t *pgdir;
        pte_t *ptep, pte;
        unsigned shift;
        unsigned long addr = (unsigned long) ptr;
        unsigned long offset;
-       unsigned long pfn;
+       unsigned long pfn, flags;
        void *kaddr;
 
        pgdir = current->mm->pgd;
        if (!pgdir)
                return -EFAULT;
 
+       local_irq_save(flags);
        ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
+       if (!ptep)
+               goto err_out;
        if (!shift)
                shift = PAGE_SHIFT;
 
        /* align address to page boundary */
        offset = addr & ((1UL << shift) - 1);
-       addr -= offset;
 
-       if (ptep == NULL)
-               return -EFAULT;
-       pte = *ptep;
+       pte = READ_ONCE(*ptep);
        if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
-               return -EFAULT;
+               goto err_out;
        pfn = pte_pfn(pte);
        if (!page_is_ram(pfn))
-               return -EFAULT;
+               goto err_out;
 
        /* no highmem to worry about here */
        kaddr = pfn_to_kaddr(pfn);
-       memcpy(ret, kaddr + offset, nb);
-       return 0;
+       memcpy(buf, kaddr + offset, nb);
+       ret = 0;
+err_out:
+       local_irq_restore(flags);
+       return ret;
 }
 
 static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
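
The restructured read_user_stack_slow() is also a small case study in the
single-exit idiom: once local_irq_save() has run, every failure path must
reach local_irq_restore(), so the early returns become goto err_out. A
minimal, hypothetical sketch of that shape:

/* Sketch: all failures funnel to one label so irq state is restored. */
static int example_protected_read(unsigned long *src, unsigned long *dst)
{
	unsigned long flags;
	int ret = -EFAULT;

	local_irq_save(flags);
	if (!src)
		goto err_out;		/* every check after the save jumps here */
	*dst = READ_ONCE(*src);
	ret = 0;
err_out:
	local_irq_restore(flags);
	return ret;
}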