if (!transhuge_vma_enabled(vma, vm_flags))
return false;
+ if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
+ vma->vm_pgoff, HPAGE_PMD_NR))
+ return false;
+
/* Enabled via shmem mount options or sysfs settings. */
- if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
- return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
- HPAGE_PMD_NR);
- }
+ if (shmem_file(vma->vm_file))
+ return shmem_huge_enabled(vma);
/* THP settings require madvise. */
if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
return false;
- /* Read-only file mappings need to be aligned for THP to work. */
+ /* Only regular file is valid */
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
- !inode_is_open_for_write(vma->vm_file->f_inode) &&
(vm_flags & VM_EXEC)) {
- return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
- HPAGE_PMD_NR);
+ struct inode *inode = vma->vm_file->f_inode;
+
+ return !inode_is_open_for_write(inode) &&
+ S_ISREG(inode->i_mode);
}
if (!vma->anon_vma || vma->vm_ops)
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
+ tlb_remove_table_sync_one();
spin_lock(pte_ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte,
spinlock_t *ptl;
int count = 0;
int i;
+ struct mmu_notifier_range range;
if (!vma || !vma->vm_file ||
!range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
if (!pmd)
goto drop_hpage;
+ /*
+ * We need to lock the mapping so that from here on, only GUP-fast and
+ * hardware page walks can access the parts of the page tables that
+ * we're operating on.
+ */
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ /*
+ * This spinlock should be unnecessary: Nobody else should be accessing
+ * the page tables under spinlock protection here, only
+ * lockless_pages_from_mm() and the hardware page walker can access page
+ * tables while all the high-level locks are held in write mode.
+ */
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
/* step 1: check all mapped PTEs are to the right huge page */
}
/* step 4: collapse pmd */
- ptl = pmd_lock(vma->vm_mm, pmd);
+ /* we make no change to anon, but protect concurrent anon page lookup */
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
_pmd = pmdp_collapse_flush(vma, haddr, pmd);
- spin_unlock(ptl);
mm_dec_nr_ptes(mm);
+ tlb_remove_table_sync_one();
+ mmu_notifier_invalidate_range_end(&range);
pte_free(mm, pmd_pgtable(_pmd));
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
drop_hpage:
unlock_page(hpage);
put_page(hpage);
abort:
pte_unmap_unlock(start_pte, ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
goto drop_hpage;
}
* An alternative would be drop the check, but check that page
* table is clear before calling pmdp_collapse_flush() under
* ptl. It has higher chance to recover THP for the VMA, but
- * has higher cost too.
+ * has higher cost too. It would also probably require locking
+ * the anon_vma.
*/
if (vma->anon_vma)
continue;
*/
if (mmap_write_trylock(mm)) {
if (!khugepaged_test_exit(mm)) {
- spinlock_t *ptl = pmd_lock(mm, pmd);
+ struct mmu_notifier_range range;
+
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_CLEAR, 0,
+ NULL, mm, addr,
+ addr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
/* assume page table is clear */
_pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
mm_dec_nr_ptes(mm);
+ tlb_remove_table_sync_one();
pte_free(mm, pmd_pgtable(_pmd));
+ mmu_notifier_invalidate_range_end(&range);
}
mmap_write_unlock(mm);
} else {
filemap_flush(mapping);
result = SCAN_FAIL;
goto xa_unlocked;
+ } else if (PageWriteback(page)) {
+ xas_unlock_irq(&xas);
+ result = SCAN_FAIL;
+ goto xa_unlocked;
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
goto out_unlock;
}
- if (!is_shmem && PageDirty(page)) {
+ if (!is_shmem && (PageDirty(page) ||
+ PageWriteback(page))) {
/*
* khugepaged only works on read-only fd, so this
* page is dirty because it hasn't been flushed