x86: add tizen_qemu_x86_defconfig & tizen_qemu_x86_64_defconfig

[platform/kernel/linux-rpi.git] / mm / hugetlb.c
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 95dc7b8..8599f16 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -82,6 +82,8 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
  
  /* Forward declaration */
  static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+               unsigned long start, unsigned long end);
  
  static inline bool subpool_is_free(struct hugepage_subpool *spool)
  {
@@ -2813,11 +2815,11 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                 page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
                 if (!page)
                         goto out_uncharge_cgroup;
+               spin_lock_irq(&hugetlb_lock);
                 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
                         SetHPageRestoreReserve(page);
                         h->resv_huge_pages--;
                 }
-               spin_lock_irq(&hugetlb_lock);
                 list_add(&page->lru, &h->hugepage_activelist);
                 /* Fall through */
         }
@@ -4164,6 +4166,25 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
  {
         if (addr & ~(huge_page_mask(hstate_vma(vma))))
                 return -EINVAL;
+
+       /*
+        * PMD sharing is only possible for PUD_SIZE-aligned address ranges
+        * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
+        * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+        */
+       if (addr & ~PUD_MASK) {
+               /*
+                * hugetlb_vm_op_split is called right before we attempt to
+                * split the VMA. We will need to unshare PMDs in the old and
+                * new VMAs, so let's unshare before we split.
+                */
+               unsigned long floor = addr & PUD_MASK;
+               unsigned long ceil = floor + PUD_SIZE;
+
+               if (floor >= vma->vm_start && ceil <= vma->vm_end)
+                       hugetlb_unshare_pmds(vma, floor, ceil);
+       }
+
         return 0;
  }
  
@@ -4439,6 +4460,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
         struct hstate *h = hstate_vma(vma);
         unsigned long sz = huge_page_size(h);
         struct mmu_notifier_range range;
+       bool force_flush = false;
  
         WARN_ON(!is_vm_hugetlb_page(vma));
         BUG_ON(start & ~huge_page_mask(h));
@@ -4467,10 +4489,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 ptl = huge_pte_lock(h, mm, ptep);
                 if (huge_pmd_unshare(mm, vma, &address, ptep)) {
                         spin_unlock(ptl);
-                       /*
-                        * We just unmapped a page of PMDs by clearing a PUD.
-                        * The caller's TLB flush range should cover this area.
-                        */
+                       tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
+                       force_flush = true;
                         continue;
                 }
  
@@ -4527,6 +4547,22 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
         }
         mmu_notifier_invalidate_range_end(&range);
         tlb_end_vma(tlb, vma);
+
+       /*
+        * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
+        * could defer the flush until now, since by holding i_mmap_rwsem we
+        * guaranteed that the last reference would not be dropped. But we must
+        * do the flushing before we return, as otherwise i_mmap_rwsem will be
+        * dropped and the last reference to the shared PMDs page might be
+        * dropped as well.
+        *
+        * In theory we could defer the freeing of the PMD pages as well, but
+        * huge_pmd_unshare() relies on the exact page_count for the PMD page to
+        * detect sharing, so we cannot defer the release of the page either.
+        * Instead, do flush now.
+        */
+       if (force_flush)
+               tlb_flush_mmu_tlbonly(tlb);
  }
  
  void __unmap_hugepage_range_final(struct mmu_gather *tlb,
@@ -4829,7 +4865,6 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
                                                   unsigned long haddr,
                                                   unsigned long reason)
  {
-       vm_fault_t ret;
         u32 hash;
         struct vm_fault vmf = {
                 .vma = vma,
@@ -4846,18 +4881,14 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
         };
  
         /*
-        * hugetlb_fault_mutex and i_mmap_rwsem must be
-        * dropped before handling userfault.  Reacquire
-        * after handling fault to make calling code simpler.
+        * i_mmap_rwsem and hugetlb_fault_mutex must be dropped before handling
+        * userfault. mmap_lock will also be dropped while handling the
+        * userfault, so any vma access from here on must be done with care.
          */
         hash = hugetlb_fault_mutex_hash(mapping, idx);
         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
         i_mmap_unlock_read(mapping);
-       ret = handle_userfault(&vmf, reason);
-       i_mmap_lock_read(mapping);
-       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
-       return ret;
+       return handle_userfault(&vmf, reason);
  }
  
  static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
@@ -4874,6 +4905,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         spinlock_t *ptl;
         unsigned long haddr = address & huge_page_mask(h);
         bool new_page, new_pagecache_page = false;
+       u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
  
         /*
          * Currently, we are forced to kill the process in the event the
@@ -4883,7 +4915,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
                 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
                            current->pid);
-               return ret;
+               goto out;
         }
  
         /*
@@ -4900,12 +4932,10 @@ retry:
         page = find_lock_page(mapping, idx);
         if (!page) {
                 /* Check for page in userfault range */
-               if (userfaultfd_missing(vma)) {
-                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+               if (userfaultfd_missing(vma))
+                       return hugetlb_handle_userfault(vma, mapping, idx,
                                                        flags, haddr,
                                                        VM_UFFD_MISSING);
-                       goto out;
-               }
  
                 page = alloc_huge_page(vma, haddr, 0);
                 if (IS_ERR(page)) {
@@ -4965,10 +4995,9 @@ retry:
                 if (userfaultfd_minor(vma)) {
                         unlock_page(page);
                         put_page(page);
-                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+                       return hugetlb_handle_userfault(vma, mapping, idx,
                                                        flags, haddr,
                                                        VM_UFFD_MINOR);
-                       goto out;
                 }
         }
  
@@ -5019,6 +5048,8 @@ retry:
  
         unlock_page(page);
  out:
+       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+       i_mmap_unlock_read(mapping);
         return ret;
  
  backout:
@@ -5116,10 +5147,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         mutex_lock(&hugetlb_fault_mutex_table[hash]);
  
         entry = huge_ptep_get(ptep);
-       if (huge_pte_none(entry)) {
-               ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
-               goto out_mutex;
-       }
+       if (huge_pte_none(entry))
+               /*
+                * hugetlb_no_page will drop i_mmap_rwsem and the hugetlb
+                * fault mutex internally, which makes us return immediately.
+                */
+               return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
  
         ret = 0;
  
@@ -5236,13 +5269,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
         int ret = -ENOMEM;
         struct page *page;
         int writable;
-       bool new_pagecache_page = false;
+       bool page_in_pagecache = false;
  
         if (is_continue) {
                 ret = -EFAULT;
                 page = find_lock_page(mapping, idx);
                 if (!page)
                         goto out;
+               page_in_pagecache = true;
         } else if (!*pagep) {
                 /* If a page already exists, then it's UFFDIO_COPY for
                  * a non-missing case. Return -EEXIST.
@@ -5298,6 +5332,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
  
                 page = alloc_huge_page(dst_vma, dst_addr, 0);
                 if (IS_ERR(page)) {
+                       put_page(*pagep);
                         ret = -ENOMEM;
                         *pagep = NULL;
                         goto out;
@@ -5330,12 +5365,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                 ret = huge_add_to_page_cache(page, mapping, idx);
                 if (ret)
                         goto out_release_nounlock;
-               new_pagecache_page = true;
+               page_in_pagecache = true;
         }
  
         ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
         spin_lock(ptl);
  
+       ret = -EIO;
+       if (PageHWPoison(page))
+               goto out_release_unlock;
+
         /*
          * Recheck the i_size after holding PT lock to make sure not
          * to leave any page mapped (as page_mapped()) beyond the end
@@ -5354,7 +5393,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
         if (!huge_pte_none(huge_ptep_get(dst_pte)))
                 goto out_release_unlock;
  
-       if (vm_shared) {
+       if (page_in_pagecache) {
                 page_dup_rmap(page, true);
         } else {
                 ClearHPageRestoreReserve(page);
@@ -5394,7 +5433,7 @@ out_release_unlock:
         if (vm_shared || is_continue)
                 unlock_page(page);
  out_release_nounlock:
-       if (!new_pagecache_page)
+       if (!page_in_pagecache)
                 restore_reserve_on_error(h, dst_vma, dst_addr, page);
         put_page(page);
         goto out;
@@ -6044,7 +6083,14 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
         pud_clear(pud);
         put_page(virt_to_page(ptep));
         mm_dec_nr_pmds(mm);
-       *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+       /*
+        * This update of passed address optimizes loops sequentially
+        * processing addresses in increments of huge page size (PMD_SIZE
+        * in this case).  By clearing the pud, a PUD_SIZE area is unmapped.
+        * Update address to the 'last page' in the cleared area so that
+        * calling loop can move to first page past this area.
+        */
+       *addr |= PUD_SIZE - PMD_SIZE;
         return 1;
  }
  
@@ -6161,12 +6207,13 @@ follow_huge_pd(struct vm_area_struct *vma,
  }
  
  struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd, int flags)
+follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
  {
+       struct hstate *h = hstate_vma(vma);
+       struct mm_struct *mm = vma->vm_mm;
         struct page *page = NULL;
         spinlock_t *ptl;
-       pte_t pte;
+       pte_t *ptep, pte;
  
         /* FOLL_GET and FOLL_PIN are mutually exclusive. */
         if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
@@ -6174,17 +6221,15 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                 return NULL;
  
  retry:
-       ptl = pmd_lockptr(mm, pmd);
-       spin_lock(ptl);
-       /*
-        * make sure that the address range covered by this pmd is not
-        * unmapped from other threads.
-        */
-       if (!pmd_huge(*pmd))
-               goto out;
-       pte = huge_ptep_get((pte_t *)pmd);
+       ptep = huge_pte_offset(mm, address, huge_page_size(h));
+       if (!ptep)
+               return NULL;
+
+       ptl = huge_pte_lock(h, mm, ptep);
+       pte = huge_ptep_get(ptep);
         if (pte_present(pte)) {
-               page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
+               page = pte_page(pte) +
+                       ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
                 /*
                  * try_grab_page() should always succeed here, because: a) we
                  * hold the pmd (ptl) lock, and b) we've just checked that the
@@ -6200,7 +6245,7 @@ retry:
         } else {
                 if (is_hugetlb_entry_migration(pte)) {
                         spin_unlock(ptl);
-                       __migration_entry_wait(mm, (pte_t *)pmd, ptl);
+                       __migration_entry_wait(mm, ptep, ptl);
                         goto retry;
                 }
                 /*
@@ -6267,6 +6312,16 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
         return ret;
  }
  
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+       int ret;
+
+       spin_lock_irq(&hugetlb_lock);
+       ret = __get_huge_page_for_hwpoison(pfn, flags);
+       spin_unlock_irq(&hugetlb_lock);
+       return ret;
+}
+
  void putback_active_hugepage(struct page *page)
  {
         spin_lock_irq(&hugetlb_lock);
@@ -6315,26 +6370,21 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
         }
  }
  
-/*
- * This function will unconditionally remove all the shared pmd pgtable entries
- * within the specific vma for a hugetlbfs memory range.
- */
-void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+                                  unsigned long start,
+                                  unsigned long end)
  {
         struct hstate *h = hstate_vma(vma);
         unsigned long sz = huge_page_size(h);
         struct mm_struct *mm = vma->vm_mm;
         struct mmu_notifier_range range;
-       unsigned long address, start, end;
+       unsigned long address;
         spinlock_t *ptl;
         pte_t *ptep;
  
         if (!(vma->vm_flags & VM_MAYSHARE))
                 return;
  
-       start = ALIGN(vma->vm_start, PUD_SIZE);
-       end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
-
         if (start >= end)
                 return;
  
@@ -6366,6 +6416,16 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
         mmu_notifier_invalidate_range_end(&range);
  }
  
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+       hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
+                       ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+}
+
  #ifdef CONFIG_CMA
  static bool cma_reserve_called __initdata;