* considered failure, and furthermore, a likely bug in the caller, so a warning
* is also emitted.
*/
-struct page *try_grab_compound_head(struct page *page,
- int refs, unsigned int flags)
+__maybe_unused struct page *try_grab_compound_head(struct page *page,
+ int refs, unsigned int flags)
{
if (flags & FOLL_GET)
return try_get_compound_head(page, refs);
*/
bool __must_check try_grab_page(struct page *page, unsigned int flags)
{
- if (!(flags & (FOLL_GET | FOLL_PIN)))
- return true;
+ WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
- return try_grab_compound_head(page, 1, flags);
+ if (flags & FOLL_GET)
+ return try_get_page(page);
+ else if (flags & FOLL_PIN) {
+ int refs = 1;
+
+ page = compound_head(page);
+
+ if (WARN_ON_ONCE(page_ref_count(page) <= 0))
+ return false;
+
+ if (hpage_pincount_available(page))
+ hpage_pincount_add(page, 1);
+ else
+ refs = GUP_PIN_COUNTING_BIAS;
+
+ /*
+ * Similar to try_grab_compound_head(): even if using the
+ * hpage_pincount_add/_sub() routines, be sure to
+ * *also* increment the normal page refcount field at least
+ * once, so that the page really is pinned.
+ */
+ page_ref_add(page, refs);
+
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
+ }
+
+ return true;
}
/**
pte_t *pte, unsigned int flags)
{
/* No page to get reference */
- if (flags & FOLL_GET)
+ if (flags & (FOLL_GET | FOLL_PIN))
return -EFAULT;
if (flags & FOLL_TOUCH) {
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
+
+ /*
+ * Considering PTE level hugetlb, like continuous-PTE hugetlb on
+ * ARM64 architecture.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ page = follow_huge_pmd_pte(vma, address, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+
retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pmd(mm, address, pmd, flags);
+ page = follow_huge_pmd_pte(vma, address, flags);
if (page)
return page;
return no_page_table(vma, flags);
/* mlock all present pages, but do not fault in new pages */
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
return -ENOENT;
+ if (*flags & FOLL_NOFAULT)
+ return -EFAULT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
#endif /* !CONFIG_MMU */
/**
+ * fault_in_writeable - fault in userspace address range for writing
+ * @uaddr: start of address range
+ * @size: size of address range
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_writeable(char __user *uaddr, size_t size)
+{
+ char __user *start = uaddr, *end;
+
+ if (unlikely(size == 0))
+ return 0;
+ if (!PAGE_ALIGNED(uaddr)) {
+ if (unlikely(__put_user(0, uaddr) != 0))
+ return size;
+ uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
+ }
+ end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
+ if (unlikely(end < start))
+ end = NULL;
+ while (uaddr != end) {
+ if (unlikely(__put_user(0, uaddr) != 0))
+ goto out;
+ uaddr += PAGE_SIZE;
+ }
+
+out:
+ if (size > uaddr - start)
+ return size - (uaddr - start);
+ return 0;
+}
+EXPORT_SYMBOL(fault_in_writeable);
+
+/*
+ * fault_in_safe_writeable - fault in an address range for writing
+ * @uaddr: start of address range
+ * @size: length of address range
+ *
+ * Faults in an address range for writing. This is primarily useful when we
+ * already know that some or all of the pages in the address range aren't in
+ * memory.
+ *
+ * Unlike fault_in_writeable(), this function is non-destructive.
+ *
+ * Note that we don't pin or otherwise hold the pages referenced that we fault
+ * in. There's no guarantee that they'll stay in memory for any duration of
+ * time.
+ *
+ * Returns the number of bytes not faulted in, like copy_to_user() and
+ * copy_from_user().
+ */
+size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
+{
+ unsigned long start = (unsigned long)uaddr, end;
+ struct mm_struct *mm = current->mm;
+ bool unlocked = false;
+
+ if (unlikely(size == 0))
+ return 0;
+ end = PAGE_ALIGN(start + size);
+ if (end < start)
+ end = 0;
+
+ mmap_read_lock(mm);
+ do {
+ if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
+ break;
+ start = (start + PAGE_SIZE) & PAGE_MASK;
+ } while (start != end);
+ mmap_read_unlock(mm);
+
+ if (size > (unsigned long)uaddr - start)
+ return size - ((unsigned long)uaddr - start);
+ return 0;
+}
+EXPORT_SYMBOL(fault_in_safe_writeable);
+
+/**
+ * fault_in_readable - fault in userspace address range for reading
+ * @uaddr: start of user address range
+ * @size: size of user address range
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_readable(const char __user *uaddr, size_t size)
+{
+ const char __user *start = uaddr, *end;
+ volatile char c;
+
+ if (unlikely(size == 0))
+ return 0;
+ if (!PAGE_ALIGNED(uaddr)) {
+ if (unlikely(__get_user(c, uaddr) != 0))
+ return size;
+ uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
+ }
+ end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
+ if (unlikely(end < start))
+ end = NULL;
+ while (uaddr != end) {
+ if (unlikely(__get_user(c, uaddr) != 0))
+ goto out;
+ uaddr += PAGE_SIZE;
+ }
+
+out:
+ (void)c;
+ if (size > uaddr - start)
+ return size - (uaddr - start);
+ return 0;
+}
+EXPORT_SYMBOL(fault_in_readable);
+
+/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
*
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
goto pte_unmap;
}
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
put_compound_head(head, 1, flags);
goto pte_unmap;
}
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
next = pud_addr_end(addr, end);
if (unlikely(!pud_present(pud)))
return 0;
- if (unlikely(pud_huge(pud))) {
+ if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
if (!gup_huge_pud(pud, pudp, addr, next, flags,
pages, nr))
return 0;
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN | FOLL_GET |
- FOLL_FAST_ONLY)))
+ FOLL_FAST_ONLY | FOLL_NOFAULT)))
return -EINVAL;
if (gup_flags & FOLL_PIN)