[platform/kernel/linux-rpi.git] mm/khugepaged.c
index 045cc57..3afcb14 100644
@@ -445,22 +445,25 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
        if (!transhuge_vma_enabled(vma, vm_flags))
                return false;
 
+       if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
+                               vma->vm_pgoff, HPAGE_PMD_NR))
+               return false;
+
        /* Enabled via shmem mount options or sysfs settings. */
-       if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
-               return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
-                               HPAGE_PMD_NR);
-       }
+       if (shmem_file(vma->vm_file))
+               return shmem_huge_enabled(vma);
 
        /* THP settings require madvise. */
        if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
                return false;
 
-       /* Read-only file mappings need to be aligned for THP to work. */
+       /* Only regular file is valid */
        if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
-           !inode_is_open_for_write(vma->vm_file->f_inode) &&
            (vm_flags & VM_EXEC)) {
-               return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
-                               HPAGE_PMD_NR);
+               struct inode *inode = vma->vm_file->f_inode;
+
+               return !inode_is_open_for_write(inode) &&
+                       S_ISREG(inode->i_mode);
        }
 
        if (!vma->anon_vma || vma->vm_ops)
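
Note on the new early check: it folds the two per-case IS_ALIGNED() tests into one guard that runs for every file-backed VMA. A minimal, self-contained sketch of the same congruence test follows; the constants are the common x86-64 values and the helper name is invented, neither comes from the patch.

        #include <stdbool.h>
        #include <stdint.h>

        #define PAGE_SHIFT   12     /* 4 KiB base pages (typical x86-64 value) */
        #define HPAGE_PMD_NR 512    /* pages per 2 MiB PMD mapping             */

        /* Hypothetical helper mirroring the IS_ALIGNED() guard above. */
        static bool file_vma_pmd_aligned(uint64_t vm_start, uint64_t vm_pgoff)
        {
                /*
                 * The VMA's first virtual page frame and the file page it maps
                 * (vm_pgoff) must differ by a multiple of HPAGE_PMD_NR, or no
                 * PMD-aligned virtual range can line up with a PMD-aligned file
                 * range and collapse can never succeed.
                 */
                return (((vm_start >> PAGE_SHIFT) - vm_pgoff) % HPAGE_PMD_NR) == 0;
        }
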
@@ -1143,14 +1146,17 @@ static void collapse_huge_page(struct mm_struct *mm,
 
        pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
        /*
-        * After this gup_fast can't run anymore. This also removes
-        * any huge TLB entry from the CPU so we won't allow
-        * huge and small TLB entries for the same virtual address
-        * to avoid the risk of CPU bugs in that area.
+        * This removes any huge TLB entry from the CPU so we won't allow
+        * huge and small TLB entries for the same virtual address to
+        * avoid the risk of CPU bugs in that area.
+        *
+        * Parallel fast GUP is fine since fast GUP will back off when
+        * it detects PMD is changed.
         */
        _pmd = pmdp_collapse_flush(vma, address, pmd);
        spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(&range);
+       tlb_remove_table_sync_one();
 
        spin_lock(pte_ptl);
        isolated = __collapse_huge_page_isolate(vma, address, pte,
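
The rewritten comment relies on lockless (fast) GUP noticing that the PMD it started from has changed and backing off. Below is a rough userspace model of that pattern, with invented names and a fake "present" bit; the real check lives in gup_pte_range() in mm/gup.c.

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdint.h>

        static bool lockless_walk(_Atomic uint64_t *pmd_slot)
        {
                uint64_t snap = atomic_load(pmd_slot);  /* snapshot before the walk */

                if (!(snap & 1))                        /* bit 0 plays "present"    */
                        return false;

                /* ... walk the PTE page 'snap' points to, take page references ... */

                if (atomic_load(pmd_slot) != snap) {
                        /* pmdp_collapse_flush() ran meanwhile: drop refs, back off. */
                        return false;
                }
                return true;
        }
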
@@ -1437,6 +1443,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
        spinlock_t *ptl;
        int count = 0;
        int i;
+       struct mmu_notifier_range range;
 
        if (!vma || !vma->vm_file ||
            !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
@@ -1463,6 +1470,19 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
        if (!pmd)
                goto drop_hpage;
 
+       /*
+        * We need to lock the mapping so that from here on, only GUP-fast and
+        * hardware page walks can access the parts of the page tables that
+        * we're operating on.
+        */
+       i_mmap_lock_write(vma->vm_file->f_mapping);
+
+       /*
+        * This spinlock should be unnecessary: Nobody else should be accessing
+        * the page tables under spinlock protection here, only
+        * lockless_pages_from_mm() and the hardware page walker can access page
+        * tables while all the high-level locks are held in write mode.
+        */
        start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
 
        /* step 1: check all mapped PTEs are to the right huge page */
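
Write-locking i_mmap works here because every other software walker that reaches these page tables through the file takes the same rwsem on the read side. A sketch of that read side, loosely following the rmap file walk (a fragment, not runnable on its own):

        i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) {
                /* ... look up the PTE for this VMA and operate on it ... */
        }
        i_mmap_unlock_read(mapping);
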
@@ -1509,12 +1529,23 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
        }
 
        /* step 4: collapse pmd */
-       ptl = pmd_lock(vma->vm_mm, pmd);
+       /* we make no change to anon, but protect concurrent anon page lookup */
+       if (vma->anon_vma)
+               anon_vma_lock_write(vma->anon_vma);
+
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, haddr,
+                               haddr + HPAGE_PMD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
        _pmd = pmdp_collapse_flush(vma, haddr, pmd);
-       spin_unlock(ptl);
        mm_dec_nr_ptes(mm);
+       tlb_remove_table_sync_one();
+       mmu_notifier_invalidate_range_end(&range);
        pte_free(mm, pmd_pgtable(_pmd));
 
+       if (vma->anon_vma)
+               anon_vma_unlock_write(vma->anon_vma);
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
+
 drop_hpage:
        unlock_page(hpage);
        put_page(hpage);
@@ -1522,6 +1553,7 @@ drop_hpage:
 
 abort:
        pte_unmap_unlock(start_pte, ptl);
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
        goto drop_hpage;
 }
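
tlb_remove_table_sync_one(), called above right after pmdp_collapse_flush(), is what closes the window against GUP-fast on configurations that free page tables under CONFIG_MMU_GATHER_RCU_TABLE_FREE: it sends a dummy IPI to the other CPUs and waits, so any CPU still inside an interrupts-disabled lockless walk of the old PTE page has left it before the table is freed. A sketch of the helper, roughly following mm/mmu_gather.c (on configurations whose table freeing already synchronises through TLB-flush IPIs it is a no-op):

        static void tlb_remove_table_smp_sync(void *arg)
        {
                /* Empty on purpose: delivering the interrupt is the point. */
        }

        void tlb_remove_table_sync_one(void)
        {
                /*
                 * IPI every other CPU and wait for completion.  A CPU still in
                 * an IRQ-disabled GUP-fast walk of the freed PTE page must take
                 * the interrupt first, so it has finished by the time we return.
                 */
                smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
        }
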
 
@@ -1570,7 +1602,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                 * An alternative would be drop the check, but check that page
                 * table is clear before calling pmdp_collapse_flush() under
                 * ptl. It has higher chance to recover THP for the VMA, but
-                * has higher cost too.
+                * has higher cost too. It would also probably require locking
+                * the anon_vma.
                 */
                if (vma->anon_vma)
                        continue;
@@ -1592,12 +1625,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                 */
                if (mmap_write_trylock(mm)) {
                        if (!khugepaged_test_exit(mm)) {
-                               spinlock_t *ptl = pmd_lock(mm, pmd);
+                               struct mmu_notifier_range range;
+
+                               mmu_notifier_range_init(&range,
+                                                       MMU_NOTIFY_CLEAR, 0,
+                                                       NULL, mm, addr,
+                                                       addr + HPAGE_PMD_SIZE);
+                               mmu_notifier_invalidate_range_start(&range);
                                /* assume page table is clear */
                                _pmd = pmdp_collapse_flush(vma, addr, pmd);
-                               spin_unlock(ptl);
                                mm_dec_nr_ptes(mm);
+                               tlb_remove_table_sync_one();
                                pte_free(mm, pmd_pgtable(_pmd));
+                               mmu_notifier_invalidate_range_end(&range);
                        }
                        mmap_write_unlock(mm);
                } else {
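
The notifier bracket added around pmdp_collapse_flush() follows the usual pattern for telling secondary MMUs (KVM, IOMMU drivers and similar) that this range is about to lose its mapping. A hypothetical subscriber would look roughly like this; the names are invented, only the ops structure and callback signature come from include/linux/mmu_notifier.h.

        static int demo_invalidate_start(struct mmu_notifier *mn,
                                         const struct mmu_notifier_range *range)
        {
                /* Drop mirrored translations for [range->start, range->end). */
                return 0;
        }

        static const struct mmu_notifier_ops demo_ops = {
                .invalidate_range_start = demo_invalidate_start,
                /* invalidate_range_end is optional; re-faulting repopulates us. */
        };
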
@@ -1763,6 +1803,10 @@ static void collapse_file(struct mm_struct *mm,
                                filemap_flush(mapping);
                                result = SCAN_FAIL;
                                goto xa_unlocked;
+                       } else if (PageWriteback(page)) {
+                               xas_unlock_irq(&xas);
+                               result = SCAN_FAIL;
+                               goto xa_unlocked;
                        } else if (trylock_page(page)) {
                                get_page(page);
                                xas_unlock_irq(&xas);
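
The writeback check added here, together with the PageDirty/PageWriteback test in the next hunk, enforces a single property: a non-shmem page may only be collapsed while it is neither dirty nor under writeback, since the collapse path copies it into the new huge page and then drops the original from the page cache. Spelled out as a tiny hypothetical helper (not part of the patch):

        static bool file_page_stable(struct page *page)
        {
                /* Clean and not under I/O: safe for khugepaged to copy. */
                return !PageDirty(page) && !PageWriteback(page);
        }
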
@@ -1798,7 +1842,8 @@ static void collapse_file(struct mm_struct *mm,
                        goto out_unlock;
                }
 
-               if (!is_shmem && PageDirty(page)) {
+               if (!is_shmem && (PageDirty(page) ||
+                                 PageWriteback(page))) {
                        /*
                         * khugepaged only works on read-only fd, so this
                         * page is dirty because it hasn't been flushed