diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 95dc7b8..8599f16 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -82,6 +82,8 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+               unsigned long start, unsigned long end);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
@@ -2813,11 +2815,11 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
                if (!page)
                        goto out_uncharge_cgroup;
+               spin_lock_irq(&hugetlb_lock);
                if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
                        SetHPageRestoreReserve(page);
                        h->resv_huge_pages--;
                }
-               spin_lock_irq(&hugetlb_lock);
                list_add(&page->lru, &h->hugepage_activelist);
                /* Fall through */
        }
@@ -4164,6 +4166,25 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
 {
        if (addr & ~(huge_page_mask(hstate_vma(vma))))
                return -EINVAL;
+
+       /*
+        * PMD sharing is only possible for PUD_SIZE-aligned address ranges
+        * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
+        * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+        */
+       if (addr & ~PUD_MASK) {
+               /*
+                * hugetlb_vm_op_split is called right before we attempt to
+                * split the VMA. We will need to unshare PMDs in the old and
+                * new VMAs, so let's unshare before we split.
+                */
+               unsigned long floor = addr & PUD_MASK;
+               unsigned long ceil = floor + PUD_SIZE;
+
+               if (floor >= vma->vm_start && ceil <= vma->vm_end)
+                       hugetlb_unshare_pmds(vma, floor, ceil);
+       }
+
        return 0;
 }
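
For reference, the split hook above only unshares when the full PUD_SIZE window around the split address still fits inside the VMA. A minimal userspace sketch of that window arithmetic, assuming a 1 GiB PUD_SIZE and a made-up split address (the macros below are local stand-ins, not the kernel's):

    #include <stdio.h>

    #define PUD_SIZE (1UL << 30)            /* assumed x86_64-style value */
    #define PUD_MASK (~(PUD_SIZE - 1))

    int main(void)
    {
            unsigned long addr  = 0x140400000UL;    /* hypothetical, not PUD-aligned */
            unsigned long floor = addr & PUD_MASK;  /* start of surrounding PUD window */
            unsigned long ceil  = floor + PUD_SIZE; /* end of surrounding PUD window */

            /* the split path unshares only if [floor, ceil) lies inside the VMA */
            printf("floor = %#lx, ceil = %#lx\n", floor, ceil);
            return 0;
    }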
 
@@ -4439,6 +4460,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
        struct mmu_notifier_range range;
+       bool force_flush = false;
 
        WARN_ON(!is_vm_hugetlb_page(vma));
        BUG_ON(start & ~huge_page_mask(h));
@@ -4467,10 +4489,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                ptl = huge_pte_lock(h, mm, ptep);
                if (huge_pmd_unshare(mm, vma, &address, ptep)) {
                        spin_unlock(ptl);
-                       /*
-                        * We just unmapped a page of PMDs by clearing a PUD.
-                        * The caller's TLB flush range should cover this area.
-                        */
+                       tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
+                       force_flush = true;
                        continue;
                }
 
@@ -4527,6 +4547,22 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
        }
        mmu_notifier_invalidate_range_end(&range);
        tlb_end_vma(tlb, vma);
+
+       /*
+        * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
+        * could defer the flush until now, since by holding i_mmap_rwsem we
+        * guaranteed that the last reference would not be dropped. But we must
+        * do the flushing before we return, as otherwise i_mmap_rwsem will be
+        * dropped and the last reference to the shared PMDs page might be
+        * dropped as well.
+        *
+        * In theory we could defer the freeing of the PMD pages as well, but
+        * huge_pmd_unshare() relies on the exact page_count for the PMD page to
+        * detect sharing, so we cannot defer the release of the page either.
+        * Instead, do the flush now.
+        */
+       if (force_flush)
+               tlb_flush_mmu_tlbonly(tlb);
 }
 
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
@@ -4829,7 +4865,6 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
                                                  unsigned long haddr,
                                                  unsigned long reason)
 {
-       vm_fault_t ret;
        u32 hash;
        struct vm_fault vmf = {
                .vma = vma,
@@ -4846,18 +4881,14 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
        };
 
        /*
-        * hugetlb_fault_mutex and i_mmap_rwsem must be
-        * dropped before handling userfault.  Reacquire
-        * after handling fault to make calling code simpler.
+        * i_mmap_rwsem and hugetlb_fault_mutex must be dropped before handling
+        * userfault. Also, mmap_lock may be dropped while handling the
+        * userfault, so any vma operation after this point must be careful.
         */
        hash = hugetlb_fault_mutex_hash(mapping, idx);
        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
        i_mmap_unlock_read(mapping);
-       ret = handle_userfault(&vmf, reason);
-       i_mmap_lock_read(mapping);
-       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
-       return ret;
+       return handle_userfault(&vmf, reason);
 }
 
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
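
With the hunk above, hugetlb_handle_userfault() drops both locks and returns handle_userfault()'s result directly instead of reacquiring them. A small pthread analogue of the new hand-off, using made-up lock names for the fault mutex and i_mmap_rwsem:

    #include <pthread.h>
    #include <stdio.h>

    /* hypothetical stand-ins: fault_mutex plays the hugetlb fault mutex,
     * mapping_lock plays i_mmap_rwsem; these are not kernel APIs */
    static pthread_mutex_t fault_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_rwlock_t mapping_lock = PTHREAD_RWLOCK_INITIALIZER;

    static int handle_userfault_stub(void)
    {
            return 0;       /* stands in for handle_userfault(&vmf, reason) */
    }

    static int handle_missing_fault(void)
    {
            /* new convention: drop both locks, then hand off and return the
             * handler's result directly; nothing is reacquired afterwards */
            pthread_mutex_unlock(&fault_mutex);
            pthread_rwlock_unlock(&mapping_lock);
            return handle_userfault_stub();
    }

    int main(void)
    {
            pthread_rwlock_rdlock(&mapping_lock);
            pthread_mutex_lock(&fault_mutex);
            printf("ret = %d\n", handle_missing_fault());
            return 0;
    }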
@@ -4874,6 +4905,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
        spinlock_t *ptl;
        unsigned long haddr = address & huge_page_mask(h);
        bool new_page, new_pagecache_page = false;
+       u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
 
        /*
         * Currently, we are forced to kill the process in the event the
@@ -4883,7 +4915,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
                pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
                           current->pid);
-               return ret;
+               goto out;
        }
 
        /*
@@ -4900,12 +4932,10 @@ retry:
        page = find_lock_page(mapping, idx);
        if (!page) {
                /* Check for page in userfault range */
-               if (userfaultfd_missing(vma)) {
-                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+               if (userfaultfd_missing(vma))
+                       return hugetlb_handle_userfault(vma, mapping, idx,
                                                       flags, haddr,
                                                       VM_UFFD_MISSING);
-                       goto out;
-               }
 
                page = alloc_huge_page(vma, haddr, 0);
                if (IS_ERR(page)) {
@@ -4965,10 +4995,9 @@ retry:
                if (userfaultfd_minor(vma)) {
                        unlock_page(page);
                        put_page(page);
-                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+                       return hugetlb_handle_userfault(vma, mapping, idx,
                                                       flags, haddr,
                                                       VM_UFFD_MINOR);
-                       goto out;
                }
        }
 
@@ -5019,6 +5048,8 @@ retry:
 
        unlock_page(page);
 out:
+       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+       i_mmap_unlock_read(mapping);
        return ret;
 
 backout:
@@ -5116,10 +5147,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
        entry = huge_ptep_get(ptep);
-       if (huge_pte_none(entry)) {
-               ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
-               goto out_mutex;
-       }
+       if (huge_pte_none(entry))
+               /*
+                * hugetlb_no_page() drops i_mmap_rwsem and the hugetlb fault
+                * mutex internally, so we must return immediately here.
+                */
+               return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
 
        ret = 0;
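
Correspondingly, hugetlb_fault() above returns hugetlb_no_page()'s result immediately: the callee now owns the unlock on every exit path, so falling through to out_mutex would unlock twice. A minimal sketch of that calling convention, with illustrative names only:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* illustration only: the callee owns the unlock on every path,
     * so the caller returns its result directly */
    static pthread_mutex_t fault_mutex = PTHREAD_MUTEX_INITIALIZER;

    static int no_page(void)
    {
            int ret = 0;
            /* ... fault-in work would happen here ... */
            pthread_mutex_unlock(&fault_mutex);     /* callee drops the lock */
            return ret;
    }

    static int fault(bool pte_none)
    {
            pthread_mutex_lock(&fault_mutex);
            if (pte_none)
                    return no_page();               /* no fall-through to out_mutex */
            pthread_mutex_unlock(&fault_mutex);
            return 0;
    }

    int main(void)
    {
            printf("ret = %d\n", fault(true));
            return 0;
    }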
 
@@ -5236,13 +5269,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        int ret = -ENOMEM;
        struct page *page;
        int writable;
-       bool new_pagecache_page = false;
+       bool page_in_pagecache = false;
 
        if (is_continue) {
                ret = -EFAULT;
                page = find_lock_page(mapping, idx);
                if (!page)
                        goto out;
+               page_in_pagecache = true;
        } else if (!*pagep) {
                /* If a page already exists, then it's UFFDIO_COPY for
                 * a non-missing case. Return -EEXIST.
@@ -5298,6 +5332,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 
                page = alloc_huge_page(dst_vma, dst_addr, 0);
                if (IS_ERR(page)) {
+                       put_page(*pagep);
                        ret = -ENOMEM;
                        *pagep = NULL;
                        goto out;
@@ -5330,12 +5365,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                ret = huge_add_to_page_cache(page, mapping, idx);
                if (ret)
                        goto out_release_nounlock;
-               new_pagecache_page = true;
+               page_in_pagecache = true;
        }
 
        ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
        spin_lock(ptl);
 
+       ret = -EIO;
+       if (PageHWPoison(page))
+               goto out_release_unlock;
+
        /*
         * Recheck the i_size after holding PT lock to make sure not
         * to leave any page mapped (as page_mapped()) beyond the end
@@ -5354,7 +5393,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        if (!huge_pte_none(huge_ptep_get(dst_pte)))
                goto out_release_unlock;
 
-       if (vm_shared) {
+       if (page_in_pagecache) {
                page_dup_rmap(page, true);
        } else {
                ClearHPageRestoreReserve(page);
@@ -5394,7 +5433,7 @@ out_release_unlock:
        if (vm_shared || is_continue)
                unlock_page(page);
 out_release_nounlock:
-       if (!new_pagecache_page)
+       if (!page_in_pagecache)
                restore_reserve_on_error(h, dst_vma, dst_addr, page);
        put_page(page);
        goto out;
@@ -6044,7 +6083,14 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
        pud_clear(pud);
        put_page(virt_to_page(ptep));
        mm_dec_nr_pmds(mm);
-       *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+       /*
+        * This update of the passed address optimizes loops sequentially
+        * processing addresses in increments of huge page size (PMD_SIZE
+        * in this case).  By clearing the pud, a PUD_SIZE area is unmapped.
+        * Update the address to the 'last page' in the cleared area so that
+        * the calling loop can move to the first page past this area.
+        */
+       *addr |= PUD_SIZE - PMD_SIZE;
        return 1;
 }
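
The new address update above replaces the ALIGN()-based arithmetic. Since PUD_SIZE - PMD_SIZE has all the bits between the PMD and PUD boundaries set, OR-ing it into a PMD-aligned address yields the last huge page of the cleared PUD area, and the caller's next step of one huge page lands just past it. A worked example, assuming typical x86_64 sizes (2 MiB PMD, 1 GiB PUD) and a made-up address:

    #include <stdio.h>

    #define PMD_SIZE (2UL << 20)            /* assumed 2 MiB huge page */
    #define PUD_SIZE (1UL << 30)            /* assumed 1 GiB PUD area  */

    int main(void)
    {
            unsigned long addr = 0x140400000UL;     /* hypothetical, PMD-aligned */

            addr |= PUD_SIZE - PMD_SIZE;            /* last huge page of the PUD area */
            printf("last page of area: %#lx\n", addr);
            printf("next loop address: %#lx\n", addr + PMD_SIZE);
            return 0;
    }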
 
@@ -6161,12 +6207,13 @@ follow_huge_pd(struct vm_area_struct *vma,
 }
 
 struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd, int flags)
+follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
 {
+       struct hstate *h = hstate_vma(vma);
+       struct mm_struct *mm = vma->vm_mm;
        struct page *page = NULL;
        spinlock_t *ptl;
-       pte_t pte;
+       pte_t *ptep, pte;
 
        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
        if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
@@ -6174,17 +6221,15 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                return NULL;
 
 retry:
-       ptl = pmd_lockptr(mm, pmd);
-       spin_lock(ptl);
-       /*
-        * make sure that the address range covered by this pmd is not
-        * unmapped from other threads.
-        */
-       if (!pmd_huge(*pmd))
-               goto out;
-       pte = huge_ptep_get((pte_t *)pmd);
+       ptep = huge_pte_offset(mm, address, huge_page_size(h));
+       if (!ptep)
+               return NULL;
+
+       ptl = huge_pte_lock(h, mm, ptep);
+       pte = huge_ptep_get(ptep);
        if (pte_present(pte)) {
-               page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
+               page = pte_page(pte) +
+                       ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
                /*
                 * try_grab_page() should always succeed here, because: a) we
                 * hold the pmd (ptl) lock, and b) we've just checked that the
@@ -6200,7 +6245,7 @@ retry:
        } else {
                if (is_hugetlb_entry_migration(pte)) {
                        spin_unlock(ptl);
-                       __migration_entry_wait(mm, (pte_t *)pmd, ptl);
+                       __migration_entry_wait(mm, ptep, ptl);
                        goto retry;
                }
                /*
@@ -6267,6 +6312,16 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
        return ret;
 }
 
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+       int ret;
+
+       spin_lock_irq(&hugetlb_lock);
+       ret = __get_huge_page_for_hwpoison(pfn, flags);
+       spin_unlock_irq(&hugetlb_lock);
+       return ret;
+}
+
 void putback_active_hugepage(struct page *page)
 {
        spin_lock_irq(&hugetlb_lock);
@@ -6315,26 +6370,21 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
        }
 }
 
-/*
- * This function will unconditionally remove all the shared pmd pgtable entries
- * within the specific vma for a hugetlbfs memory range.
- */
-void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+                                  unsigned long start,
+                                  unsigned long end)
 {
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_notifier_range range;
-       unsigned long address, start, end;
+       unsigned long address;
        spinlock_t *ptl;
        pte_t *ptep;
 
        if (!(vma->vm_flags & VM_MAYSHARE))
                return;
 
-       start = ALIGN(vma->vm_start, PUD_SIZE);
-       end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
-
        if (start >= end)
                return;
 
@@ -6366,6 +6416,16 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
        mmu_notifier_invalidate_range_end(&range);
 }
 
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+       hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
+                       ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+}
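
The wrapper above keeps the old external behaviour: only the PUD-aligned interior of the VMA can hold shared PMD page tables, so the bounds are rounded inward and hugetlb_unshare_pmds() bails out early when nothing remains. A small userspace sketch of that rounding, assuming a 1 GiB PUD_SIZE and hypothetical VMA bounds (the ALIGN helpers below are local, not the kernel macros):

    #include <stdio.h>

    #define PUD_SIZE            (1UL << 30)
    #define ALIGN_UP(x, a)      (((x) + (a) - 1) & ~((a) - 1))
    #define ALIGN_DOWN_TO(x, a) ((x) & ~((a) - 1))

    int main(void)
    {
            unsigned long vm_start = 0x7f0020000000UL;  /* hypothetical VMA bounds */
            unsigned long vm_end   = 0x7f0130000000UL;

            unsigned long start = ALIGN_UP(vm_start, PUD_SIZE);
            unsigned long end   = ALIGN_DOWN_TO(vm_end, PUD_SIZE);

            if (start >= end)
                    printf("nothing to unshare\n");
            else
                    printf("unshare [%#lx, %#lx)\n", start, end);
            return 0;
    }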
+
 #ifdef CONFIG_CMA
 static bool cma_reserve_called __initdata;