The lock can now be released.
+Exclusive access memory
+=======================
+
+Some devices have features such as atomic PTE bits that can be used to implement
+atomic access to system memory. To support atomic operations to a shared virtual
+memory page such a device needs access to that page which is exclusive of any
+userspace access from the CPU. The ``make_device_exclusive_range()`` function
+can be used to make a memory range inaccessible from userspace.
+
+This replaces all mappings for pages in the given range with special swap
+entries. Any attempt to access the swap entry results in a fault which is
+resovled by replacing the entry with the original mapping. A driver gets
+notified that the mapping has been changed by MMU notifiers, after which point
+it will no longer have exclusive access to the page. Exclusive access is
+guranteed to last until the driver drops the page lock and page reference, at
+which point any CPU faults on the page may proceed as described.
+
Memory cgroup (memcg) and rss accounting
========================================
* @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
* a device driver to possibly ignore the invalidation if the
* owner field matches the driver's device private pgmap owner.
+ *
+ * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no
+ * longer have exclusive access to the page. When sent during creation of an
+ * exclusive range the owner will be initialised to the value provided by the
+ * caller of make_device_exclusive_range(), otherwise the owner will be NULL.
*/
enum mmu_notifier_event {
MMU_NOTIFY_UNMAP = 0,
MMU_NOTIFY_SOFT_DIRTY,
MMU_NOTIFY_RELEASE,
MMU_NOTIFY_MIGRATE,
+ MMU_NOTIFY_EXCLUSIVE,
};
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
void try_to_migrate(struct page *page, enum ttu_flags flags);
void try_to_unmap(struct page *, enum ttu_flags flags);
+int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct page **pages,
+ void *arg);
+
/* Avoid racy checks */
#define PVMW_SYNC (1 << 0)
/* Look for migarion entries rather than present PTEs */
* migrate part of a process memory to device memory.
*
* When a page is migrated from CPU to device, we set the CPU page table entry
- * to a special SWP_DEVICE_* entry.
+ * to a special SWP_DEVICE_{READ|WRITE} entry.
+ *
+ * When a page is mapped by the device for exclusive access we set the CPU page
+ * table entries to special SWP_DEVICE_EXCLUSIVE_* entries.
*/
#ifdef CONFIG_DEVICE_PRIVATE
-#define SWP_DEVICE_NUM 2
+#define SWP_DEVICE_NUM 4
#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
+#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
+#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3)
#else
#define SWP_DEVICE_NUM 0
#endif
{
return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
}
+
+static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
+{
+ return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset);
+}
+
+static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
+{
+ return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset);
+}
+
+static inline bool is_device_exclusive_entry(swp_entry_t entry)
+{
+ return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ ||
+ swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
+}
+
+static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
+{
+ return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE);
+}
#else /* CONFIG_DEVICE_PRIVATE */
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
{
return false;
}
+
+static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
+{
+ return swp_entry(0, 0);
+}
+
+static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
+{
+ return swp_entry(0, 0);
+}
+
+static inline bool is_device_exclusive_entry(swp_entry_t entry)
+{
+ return false;
+}
+
+static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
+{
+ return false;
+}
#endif /* CONFIG_DEVICE_PRIVATE */
#ifdef CONFIG_MIGRATION
*/
static inline bool is_pfn_swap_entry(swp_entry_t entry)
{
- return is_migration_entry(entry) || is_device_private_entry(entry);
+ return is_migration_entry(entry) || is_device_private_entry(entry) ||
+ is_device_exclusive_entry(entry);
}
struct page_vma_mapped_walk;
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
+#include "internal.h"
+
struct hmm_vma_walk {
struct hmm_range *range;
unsigned long last;
if (!non_swap_entry(entry))
goto fault;
+ if (is_device_exclusive_entry(entry))
+ goto fault;
+
if (is_migration_entry(entry)) {
pte_unmap(ptep);
hmm_vma_walk->last = addr;
}
#endif
+static void restore_exclusive_pte(struct vm_area_struct *vma,
+ struct page *page, unsigned long address,
+ pte_t *ptep)
+{
+ pte_t pte;
+ swp_entry_t entry;
+
+ pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+ if (pte_swp_soft_dirty(*ptep))
+ pte = pte_mksoft_dirty(pte);
+
+ entry = pte_to_swp_entry(*ptep);
+ if (pte_swp_uffd_wp(*ptep))
+ pte = pte_mkuffd_wp(pte);
+ else if (is_writable_device_exclusive_entry(entry))
+ pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+
+ set_pte_at(vma->vm_mm, address, ptep, pte);
+
+ /*
+ * No need to take a page reference as one was already
+ * created when the swap entry was made.
+ */
+ if (PageAnon(page))
+ page_add_anon_rmap(page, vma, address, false);
+ else
+ /*
+ * Currently device exclusive access only supports anonymous
+ * memory so the entry shouldn't point to a filebacked page.
+ */
+ WARN_ON_ONCE(!PageAnon(page));
+
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_vma_page(page);
+
+ /*
+ * No need to invalidate - it was non-present before. However
+ * secondary CPUs may have mappings that need invalidating.
+ */
+ update_mmu_cache(vma, address, ptep);
+}
+
+/*
+ * Tries to restore an exclusive pte if the page lock can be acquired without
+ * sleeping.
+ */
+static int
+try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ swp_entry_t entry = pte_to_swp_entry(*src_pte);
+ struct page *page = pfn_swap_entry_to_page(entry);
+
+ if (trylock_page(page)) {
+ restore_exclusive_pte(vma, page, addr, src_pte);
+ unlock_page(page);
+ return 0;
+ }
+
+ return -EBUSY;
+}
+
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
+ } else if (is_device_exclusive_entry(entry)) {
+ /*
+ * Make device exclusive entries present by restoring the
+ * original entry then copying as for a present pte. Device
+ * exclusive entries currently only support private writable
+ * (ie. COW) mappings.
+ */
+ VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
+ if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+ return -EBUSY;
+ return -ENOENT;
}
if (!userfaultfd_wp(dst_vma))
pte = pte_swp_clear_uffd_wp(pte);
if (ret == -EIO) {
entry = pte_to_swp_entry(*src_pte);
break;
+ } else if (ret == -EBUSY) {
+ break;
+ } else if (!ret) {
+ progress += 8;
+ continue;
}
- progress += 8;
- continue;
+
+ /*
+ * Device exclusive entry restored, continue by copying
+ * the now present pte.
+ */
+ WARN_ON_ONCE(ret != -ENOENT);
}
/* copy_present_pte() will clear `*prealloc' if consumed */
ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
goto out;
}
entry.val = 0;
+ } else if (ret == -EBUSY) {
+ goto out;
} else if (ret == -EAGAIN) {
prealloc = page_copy_prealloc(src_mm, src_vma, addr);
if (!prealloc)
}
entry = pte_to_swp_entry(ptent);
- if (is_device_private_entry(entry)) {
+ if (is_device_private_entry(entry) ||
+ is_device_exclusive_entry(entry)) {
struct page *page = pfn_swap_entry_to_page(entry);
if (unlikely(details && details->check_mapping)) {
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
- page_remove_rmap(page, false);
+
+ if (is_device_private_entry(entry))
+ page_remove_rmap(page, false);
+
put_page(page);
continue;
}
}
EXPORT_SYMBOL(unmap_mapping_range);
+/*
+ * Restore a potential device exclusive pte to a working pte entry
+ */
+static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mmu_notifier_range range;
+
+ if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+ return VM_FAULT_RETRY;
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+ vma->vm_mm, vmf->address & PAGE_MASK,
+ (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
+ mmu_notifier_invalidate_range_start(&range);
+
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+ restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ unlock_page(page);
+
+ mmu_notifier_invalidate_range_end(&range);
+ return 0;
+}
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
if (is_migration_entry(entry)) {
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
+ } else if (is_device_exclusive_entry(entry)) {
+ vmf->page = pfn_swap_entry_to_page(entry);
+ ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
vmf->page = pfn_swap_entry_to_page(entry);
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
newpte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
+ } else if (is_writable_device_exclusive_entry(entry)) {
+ entry = make_readable_device_exclusive_entry(
+ swp_offset(entry));
+ newpte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(oldpte))
+ newpte = pte_swp_mksoft_dirty(newpte);
+ if (pte_swp_uffd_wp(oldpte))
+ newpte = pte_swp_mkuffd_wp(newpte);
} else {
newpte = oldpte;
}
/* Handle un-addressable ZONE_DEVICE memory */
entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_device_private_entry(entry))
+ if (!is_device_private_entry(entry) &&
+ !is_device_exclusive_entry(entry))
return false;
} else if (!pte_present(*pvmw->pte))
return false;
return false;
entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_migration_entry(entry))
+ if (!is_migration_entry(entry) &&
+ !is_device_exclusive_entry(entry))
return false;
pfn = swp_offset(entry);
/* Handle un-addressable ZONE_DEVICE memory */
entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_device_private_entry(entry))
+ if (!is_device_private_entry(entry) &&
+ !is_device_exclusive_entry(entry))
return false;
pfn = swp_offset(entry);
rmap_walk(page, &rwc);
}
+#ifdef CONFIG_DEVICE_PRIVATE
+struct make_exclusive_args {
+ struct mm_struct *mm;
+ unsigned long address;
+ void *owner;
+ bool valid;
+};
+
+static bool page_make_device_exclusive_one(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, void *priv)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ };
+ struct make_exclusive_args *args = priv;
+ pte_t pteval;
+ struct page *subpage;
+ bool ret = true;
+ struct mmu_notifier_range range;
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+ vma->vm_mm, address, min(vma->vm_end,
+ address + page_size(page)), args->owner);
+ mmu_notifier_invalidate_range_start(&range);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+ if (!pte_present(*pvmw.pte)) {
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ address = pvmw.address;
+
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+
+ /* Move the dirty bit to the page. Now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ /*
+ * Check that our target page is still mapped at the expected
+ * address.
+ */
+ if (args->mm == mm && args->address == address &&
+ pte_write(pteval))
+ args->valid = true;
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ if (pte_write(pteval))
+ entry = make_writable_device_exclusive_entry(
+ page_to_pfn(subpage));
+ else
+ entry = make_readable_device_exclusive_entry(
+ page_to_pfn(subpage));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
+
+ /*
+ * There is a reference on the page for the swap entry which has
+ * been removed, so shouldn't take another.
+ */
+ page_remove_rmap(subpage, false);
+ }
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ return ret;
+}
+
+/**
+ * page_make_device_exclusive - mark the page exclusively owned by a device
+ * @page: the page to replace page table entries for
+ * @mm: the mm_struct where the page is expected to be mapped
+ * @address: address where the page is expected to be mapped
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
+ *
+ * Tries to remove all the page table entries which are mapping this page and
+ * replace them with special device exclusive swap entries to grant a device
+ * exclusive access to the page. Caller must hold the page lock.
+ *
+ * Returns false if the page is still mapped, or if it could not be unmapped
+ * from the expected address. Otherwise returns true (success).
+ */
+static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm,
+ unsigned long address, void *owner)
+{
+ struct make_exclusive_args args = {
+ .mm = mm,
+ .address = address,
+ .owner = owner,
+ .valid = false,
+ };
+ struct rmap_walk_control rwc = {
+ .rmap_one = page_make_device_exclusive_one,
+ .done = page_not_mapped,
+ .anon_lock = page_lock_anon_vma_read,
+ .arg = &args,
+ };
+
+ /*
+ * Restrict to anonymous pages for now to avoid potential writeback
+ * issues. Also tail pages shouldn't be passed to rmap_walk so skip
+ * those.
+ */
+ if (!PageAnon(page) || PageTail(page))
+ return false;
+
+ rmap_walk(page, &rwc);
+
+ return args.valid && !page_mapcount(page);
+}
+
+/**
+ * make_device_exclusive_range() - Mark a range for exclusive use by a device
+ * @mm: mm_struct of assoicated target process
+ * @start: start of the region to mark for exclusive device access
+ * @end: end address of region
+ * @pages: returns the pages which were successfully marked for exclusive access
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ *
+ * Returns: number of pages found in the range by GUP. A page is marked for
+ * exclusive access only if the page pointer is non-NULL.
+ *
+ * This function finds ptes mapping page(s) to the given address range, locks
+ * them and replaces mappings with special swap entries preventing userspace CPU
+ * access. On fault these entries are replaced with the original mapping after
+ * calling MMU notifiers.
+ *
+ * A driver using this to program access from a device must use a mmu notifier
+ * critical section to hold a device specific lock during programming. Once
+ * programming is complete it should drop the page lock and reference after
+ * which point CPU access to the page will revoke the exclusive access.
+ */
+int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct page **pages,
+ void *owner)
+{
+ long npages = (end - start) >> PAGE_SHIFT;
+ long i;
+
+ npages = get_user_pages_remote(mm, start, npages,
+ FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
+ pages, NULL, NULL);
+ if (npages < 0)
+ return npages;
+
+ for (i = 0; i < npages; i++, start += PAGE_SIZE) {
+ if (!trylock_page(pages[i])) {
+ put_page(pages[i]);
+ pages[i] = NULL;
+ continue;
+ }
+
+ if (!page_make_device_exclusive(pages[i], mm, start, owner)) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ pages[i] = NULL;
+ }
+ }
+
+ return npages;
+}
+EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+#endif
+
void __put_anon_vma(struct anon_vma *anon_vma)
{
struct anon_vma *root = anon_vma->root;