BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
-static int shrink_huge_zero_page(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- if (!sc->nr_to_scan)
- /* we can free zero page only if last reference remains */
- return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+}
+static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
__free_page(zero_page);
+ return HPAGE_PMD_NR;
}
return 0;
}
static struct shrinker huge_zero_page_shrinker = {
- .shrink = shrink_huge_zero_page,
+ .count_objects = shrink_huge_zero_page_count,
+ .scan_objects = shrink_huge_zero_page_scan,
.seeks = DEFAULT_SEEKS,
};
unsigned long msecs;
int err;
- err = strict_strtoul(buf, 10, &msecs);
+ err = kstrtoul(buf, 10, &msecs);
if (err || msecs > UINT_MAX)
return -EINVAL;
unsigned long msecs;
int err;
- err = strict_strtoul(buf, 10, &msecs);
+ err = kstrtoul(buf, 10, &msecs);
if (err || msecs > UINT_MAX)
return -EINVAL;
int err;
unsigned long pages;
- err = strict_strtoul(buf, 10, &pages);
+ err = kstrtoul(buf, 10, &pages);
if (err || !pages || pages > UINT_MAX)
return -EINVAL;
int err;
unsigned long max_ptes_none;
- err = strict_strtoul(buf, 10, &max_ptes_none);
+ err = kstrtoul(buf, 10, &max_ptes_none);
if (err || max_ptes_none > HPAGE_PMD_NR-1)
return -EINVAL;
return pmd;
}
-static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
+static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
{
pmd_t entry;
- entry = mk_pmd(page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = mk_pmd(page, prot);
entry = pmd_mkhuge(entry);
return entry;
}
pte_free(mm, pgtable);
} else {
pmd_t entry;
- entry = mk_huge_pmd(page, vma);
+ entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
{
struct page *page;
unsigned long haddr = address & HPAGE_PMD_MASK;
- pte_t *pte;
- if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
- if (unlikely(anon_vma_prepare(vma)))
- return VM_FAULT_OOM;
- if (unlikely(khugepaged_enter(vma)))
+ if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+ return VM_FAULT_FALLBACK;
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma)))
+ return VM_FAULT_OOM;
+ if (!(flags & FAULT_FLAG_WRITE) &&
+ transparent_hugepage_use_zero_page()) {
+ pgtable_t pgtable;
+ struct page *zero_page;
+ bool set;
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
return VM_FAULT_OOM;
- if (!(flags & FAULT_FLAG_WRITE) &&
- transparent_hugepage_use_zero_page()) {
- pgtable_t pgtable;
- struct page *zero_page;
- bool set;
- pgtable = pte_alloc_one(mm, haddr);
- if (unlikely(!pgtable))
- return VM_FAULT_OOM;
- zero_page = get_huge_zero_page();
- if (unlikely(!zero_page)) {
- pte_free(mm, pgtable);
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
- spin_lock(&mm->page_table_lock);
- set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
- zero_page);
- spin_unlock(&mm->page_table_lock);
- if (!set) {
- pte_free(mm, pgtable);
- put_huge_zero_page();
- }
- return 0;
- }
- page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr, numa_node_id(), 0);
- if (unlikely(!page)) {
+ zero_page = get_huge_zero_page();
+ if (unlikely(!zero_page)) {
+ pte_free(mm, pgtable);
count_vm_event(THP_FAULT_FALLBACK);
- goto out;
+ return VM_FAULT_FALLBACK;
}
- count_vm_event(THP_FAULT_ALLOC);
- if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
- put_page(page);
- goto out;
- }
- if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
- page))) {
- mem_cgroup_uncharge_page(page);
- put_page(page);
- goto out;
+ spin_lock(&mm->page_table_lock);
+ set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_page);
+ spin_unlock(&mm->page_table_lock);
+ if (!set) {
+ pte_free(mm, pgtable);
+ put_huge_zero_page();
}
-
return 0;
}
-out:
- /*
- * Use __pte_alloc instead of pte_alloc_map, because we can't
- * run pte_offset_map on the pmd, if an huge pmd could
- * materialize from under us from a different thread.
- */
- if (unlikely(pmd_none(*pmd)) &&
- unlikely(__pte_alloc(mm, vma, pmd, address)))
- return VM_FAULT_OOM;
- /* if an huge pmd materialized from under us just retry later */
- if (unlikely(pmd_trans_huge(*pmd)))
- return 0;
- /*
- * A regular pmd is established and it can't morph into a huge pmd
- * from under us anymore at this point because we hold the mmap_sem
- * read mode and khugepaged takes it in write mode. So now it's
- * safe to run pte_offset_map().
- */
- pte = pte_offset_map(pmd, address);
- return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+ page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr, numa_node_id(), 0);
+ if (unlikely(!page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+
+ count_vm_event(THP_FAULT_ALLOC);
+ return 0;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
new_page = NULL;
if (unlikely(!new_page)) {
- count_vm_event(THP_FAULT_FALLBACK);
if (is_huge_zero_pmd(orig_pmd)) {
ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
address, pmd, orig_pmd, haddr);
split_huge_page(page);
put_page(page);
}
+ count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
- count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
split_huge_page(page);
put_page(page);
}
+ count_vm_event(THP_FAULT_FALLBACK);
ret |= VM_FAULT_OOM;
goto out;
}
+ count_vm_event(THP_FAULT_ALLOC);
+
if (is_huge_zero_pmd(orig_pmd))
clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
else
goto out_mn;
} else {
pmd_t entry;
- entry = mk_huge_pmd(new_page, vma);
+ entry = mk_huge_pmd(new_page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_clear_flush(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{
+ struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
- int target_nid;
- int current_nid = -1;
- bool migrated;
+ int page_nid = -1, this_nid = numa_node_id();
+ int target_nid, last_nidpid = -1;
+ bool page_locked;
+ bool migrated = false;
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
page = pmd_page(pmd);
- get_page(page);
- current_nid = page_to_nid(page);
+ BUG_ON(is_huge_zero_page(page));
+ page_nid = page_to_nid(page);
+ last_nidpid = page_nidpid_last(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (current_nid == numa_node_id())
+ if (page_nid == this_nid)
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ /*
+ * Acquire the page lock to serialise THP migrations but avoid dropping
+ * page_table_lock if at all possible
+ */
+ page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
if (target_nid == -1) {
- put_page(page);
- goto clear_pmdnuma;
+ /* If the page was locked, there are no parallel migrations */
+ if (page_locked)
+ goto clear_pmdnuma;
+
+ /*
+ * Otherwise wait for potential migrations and retry. We do
+ * relock and check_same as the page may no longer be mapped.
+ * As the fault is being retried, do not account for it.
+ */
+ spin_unlock(&mm->page_table_lock);
+ wait_on_page_locked(page);
+ page_nid = -1;
+ goto out;
}
- /* Acquire the page lock to serialise THP migrations */
+ /* Page is misplaced, serialise migrations and parallel THP splits */
+ get_page(page);
spin_unlock(&mm->page_table_lock);
- lock_page(page);
+ if (!page_locked)
+ lock_page(page);
+ anon_vma = page_lock_anon_vma_read(page);
- /* Confirm the PTE did not while locked */
+ /* Confirm the PMD did not change while page_table_lock was released */
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
+ page_nid = -1;
goto out_unlock;
}
- spin_unlock(&mm->page_table_lock);
- /* Migrate the THP to the requested node */
+ /*
+ * Migrate the THP to the requested node, returns with page unlocked
+ * and pmd_numa cleared.
+ */
+ spin_unlock(&mm->page_table_lock);
migrated = migrate_misplaced_transhuge_page(mm, vma,
pmdp, pmd, addr, page, target_nid);
- if (!migrated)
- goto check_same;
-
- task_numa_fault(target_nid, HPAGE_PMD_NR, true);
- return 0;
+ if (migrated)
+ page_nid = target_nid;
-check_same:
- spin_lock(&mm->page_table_lock);
- if (unlikely(!pmd_same(pmd, *pmdp)))
- goto out_unlock;
+ goto out;
clear_pmdnuma:
+ BUG_ON(!PageLocked(page));
pmd = pmd_mknonnuma(pmd);
set_pmd_at(mm, haddr, pmdp, pmd);
VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
+ unlock_page(page);
out_unlock:
spin_unlock(&mm->page_table_lock);
- if (current_nid != -1)
- task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+
+out:
+ if (anon_vma)
+ page_unlock_anon_vma_read(anon_vma);
+
+ if (page_nid != -1)
+ task_numa_fault(last_nidpid, page_nid, HPAGE_PMD_NR, migrated);
+
return 0;
}
return ret;
}
+/*
+ * Returns
+ * - 0 if PMD could not be locked
+ * - 1 if PMD was locked but protections unchange and TLB flush unnecessary
+ * - HPAGE_PMD_NR is protections changed and TLB flush necessary
+ */
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, int prot_numa)
{
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
pmd_t entry;
- entry = pmdp_get_and_clear(mm, addr, pmd);
+ ret = 1;
if (!prot_numa) {
+ entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
+ ret = HPAGE_PMD_NR;
BUG_ON(pmd_write(entry));
} else {
struct page *page = pmd_page(*pmd);
- /* only check non-shared pages */
- if (page_mapcount(page) == 1 &&
+ /*
+ * Do not trap faults against the zero page. The
+ * read-only data is likely to be read-cached on the
+ * local CPU cache and it is less useful to know about
+ * local vs remote hits on the zero page.
+ */
+ if (!is_huge_zero_page(page) &&
!pmd_numa(*pmd)) {
+ entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_mknuma(entry);
+ ret = HPAGE_PMD_NR;
}
}
- set_pmd_at(mm, addr, pmd, entry);
+
+ /* Set PMD if cleared earlier */
+ if (ret == HPAGE_PMD_NR)
+ set_pmd_at(mm, addr, pmd, entry);
+
spin_unlock(&vma->vm_mm->page_table_lock);
- ret = 1;
}
return ret;
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
- page_nid_xchg_last(page_tail, page_nid_last(page));
+ page_nidpid_xchg_last(page_tail, page_nidpid_last(page));
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
BUG_ON(atomic_read(&page->_count) <= 0);
__mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
- __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
ClearPageCompound(page);
compound_unlock(page);
goto out;
vma = find_vma(mm, address);
+ if (!vma)
+ goto out;
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(new_page, vma);
+ _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+ _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
/*
* spin_lock() below is not the equivalent of smp_wmb(), so