1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  mm/userfaultfd.c
4  *
5  *  Copyright (C) 2015  Red Hat, Inc.
6  */
7
8 #include <linux/mm.h>
9 #include <linux/sched/signal.h>
10 #include <linux/pagemap.h>
11 #include <linux/rmap.h>
12 #include <linux/swap.h>
13 #include <linux/swapops.h>
14 #include <linux/userfaultfd_k.h>
15 #include <linux/mmu_notifier.h>
16 #include <linux/hugetlb.h>
17 #include <linux/shmem_fs.h>
18 #include <asm/tlbflush.h>
19 #include <asm/tlb.h>
20 #include "internal.h"
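
/*
 * Descriptive note: the helpers below implement the resolution side of
 * userfaultfd and run in the context of the monitoring task, not the
 * faulting one.  They are reached from the UFFDIO_COPY, UFFDIO_ZEROPAGE,
 * UFFDIO_CONTINUE and UFFDIO_WRITEPROTECT ioctl handlers in
 * fs/userfaultfd.c via mcopy_atomic(), mfill_zeropage(), mcopy_continue()
 * and mwriteprotect_range() respectively.
 */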
21
22 static __always_inline
23 struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
24                                     unsigned long dst_start,
25                                     unsigned long len)
26 {
27         /*
28          * Make sure that the dst range is both valid and fully within a
29          * single existing vma.
30          */
31         struct vm_area_struct *dst_vma;
32
33         dst_vma = find_vma(dst_mm, dst_start);
34         if (!dst_vma)
35                 return NULL;
36
37         if (dst_start < dst_vma->vm_start ||
38             dst_start + len > dst_vma->vm_end)
39                 return NULL;
40
41         /*
42          * Check that the vma is registered in uffd; this is required to
43          * enforce the VM_MAYWRITE check done at uffd registration
44          * time.
45          */
46         if (!dst_vma->vm_userfaultfd_ctx.ctx)
47                 return NULL;
48
49         return dst_vma;
50 }
51
52 /*
53  * Install PTEs to map dst_addr (within dst_vma) to page.
54  *
55  * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
56  * and anon, and for both shared and private VMAs.
57  */
58 int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
59                              struct vm_area_struct *dst_vma,
60                              unsigned long dst_addr, struct page *page,
61                              bool newly_allocated, bool wp_copy)
62 {
63         int ret;
64         pte_t _dst_pte, *dst_pte;
65         bool writable = dst_vma->vm_flags & VM_WRITE;
66         bool vm_shared = dst_vma->vm_flags & VM_SHARED;
67         bool page_in_cache = page->mapping;
68         spinlock_t *ptl;
69         struct inode *inode;
70         pgoff_t offset, max_off;
71
72         _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
73         _dst_pte = pte_mkdirty(_dst_pte);
74         if (page_in_cache && !vm_shared)
75                 writable = false;
76
77         /*
78          * Always mark a PTE as write-protected when needed, regardless of
79          * VM_WRITE, which the user might change.
80          */
81         if (wp_copy) {
82                 _dst_pte = pte_mkuffd_wp(_dst_pte);
83                 writable = false;
84         }
85
86         if (writable)
87                 _dst_pte = pte_mkwrite(_dst_pte);
88         else
89                 /*
90                  * We need this to make sure the write bit is removed, as mk_pte()
91                  * could return a pte with the write bit set.
92                  */
93                 _dst_pte = pte_wrprotect(_dst_pte);
94
95         dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
96
97         if (vma_is_shmem(dst_vma)) {
98                 /* serialize against truncate with the page table lock */
99                 inode = dst_vma->vm_file->f_inode;
100                 offset = linear_page_index(dst_vma, dst_addr);
101                 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
102                 ret = -EFAULT;
103                 if (unlikely(offset >= max_off))
104                         goto out_unlock;
105         }
106
107         ret = -EEXIST;
108         /*
109          * We allow overwriting a pte marker: consider the case where both MISSING|WP
110          * are registered, we first wr-protect a none pte which has no page cache
111          * page backing it, then access the page.
112          */
113         if (!pte_none_mostly(*dst_pte))
114                 goto out_unlock;
115
116         if (page_in_cache) {
117                 /* Usually, cache pages are already added to LRU */
118                 if (newly_allocated)
119                         lru_cache_add(page);
120                 page_add_file_rmap(page, dst_vma, false);
121         } else {
122                 page_add_new_anon_rmap(page, dst_vma, dst_addr);
123                 lru_cache_add_inactive_or_unevictable(page, dst_vma);
124         }
125
126         /*
127          * Must happen after rmap, as mm_counter() checks mapping (via
128          * PageAnon()), which is set by __page_set_anon_rmap().
129          */
130         inc_mm_counter(dst_mm, mm_counter(page));
131
132         set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
133
134         /* No need to invalidate - it was non-present before */
135         update_mmu_cache(dst_vma, dst_addr, dst_pte);
136         ret = 0;
137 out_unlock:
138         pte_unmap_unlock(dst_pte, ptl);
139         return ret;
140 }
141
142 static int mcopy_atomic_pte(struct mm_struct *dst_mm,
143                             pmd_t *dst_pmd,
144                             struct vm_area_struct *dst_vma,
145                             unsigned long dst_addr,
146                             unsigned long src_addr,
147                             struct page **pagep,
148                             bool wp_copy)
149 {
150         void *page_kaddr;
151         int ret;
152         struct page *page;
153
154         if (!*pagep) {
155                 ret = -ENOMEM;
156                 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
157                 if (!page)
158                         goto out;
159
160                 page_kaddr = kmap_local_page(page);
161                 /*
162                  * The read mmap_lock is held here.  Despite the
163                  * mmap_lock being read-recursive, a deadlock is still
164                  * possible if a writer has taken the lock.  For example:
165                  *
166                  * process A thread 1 takes read lock on own mmap_lock
167                  * process A thread 2 calls mmap, blocks taking write lock
168                  * process B thread 1 takes page fault, read lock on own mmap lock
169                  * process B thread 2 calls mmap, blocks taking write lock
170                  * process A thread 1 blocks taking read lock on process B
171                  * process B thread 1 blocks taking read lock on process A
172                  *
173                  * Disable page faults to prevent potential deadlock
174                  * and retry the copy outside the mmap_lock.
175                  */
176                 pagefault_disable();
177                 ret = copy_from_user(page_kaddr,
178                                      (const void __user *) src_addr,
179                                      PAGE_SIZE);
180                 pagefault_enable();
181                 kunmap_local(page_kaddr);
182
183                 /* fall back to copy_from_user outside mmap_lock */
184                 if (unlikely(ret)) {
185                         ret = -ENOENT;
186                         *pagep = page;
187                         /* don't free the page */
188                         goto out;
189                 }
190
191                 flush_dcache_page(page);
192         } else {
193                 page = *pagep;
194                 *pagep = NULL;
195         }
196
197         /*
198          * The memory barrier inside __SetPageUptodate makes sure that
199          * preceding stores to the page contents become visible before
200          * the set_pte_at() write.
201          */
202         __SetPageUptodate(page);
203
204         ret = -ENOMEM;
205         if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL))
206                 goto out_release;
207
208         ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
209                                        page, true, wp_copy);
210         if (ret)
211                 goto out_release;
212 out:
213         return ret;
214 out_release:
215         put_page(page);
216         goto out;
217 }
218
219 static int mfill_zeropage_pte(struct mm_struct *dst_mm,
220                               pmd_t *dst_pmd,
221                               struct vm_area_struct *dst_vma,
222                               unsigned long dst_addr)
223 {
224         pte_t _dst_pte, *dst_pte;
225         spinlock_t *ptl;
226         int ret;
227         pgoff_t offset, max_off;
228         struct inode *inode;
229
230         _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
231                                          dst_vma->vm_page_prot));
232         dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
233         if (dst_vma->vm_file) {
234                 /* the shmem MAP_PRIVATE case requires checking the i_size */
235                 inode = dst_vma->vm_file->f_inode;
236                 offset = linear_page_index(dst_vma, dst_addr);
237                 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
238                 ret = -EFAULT;
239                 if (unlikely(offset >= max_off))
240                         goto out_unlock;
241         }
242         ret = -EEXIST;
243         if (!pte_none(*dst_pte))
244                 goto out_unlock;
245         set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
246         /* No need to invalidate - it was non-present before */
247         update_mmu_cache(dst_vma, dst_addr, dst_pte);
248         ret = 0;
249 out_unlock:
250         pte_unmap_unlock(dst_pte, ptl);
251         return ret;
252 }
253
254 /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
255 static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
256                                 pmd_t *dst_pmd,
257                                 struct vm_area_struct *dst_vma,
258                                 unsigned long dst_addr,
259                                 bool wp_copy)
260 {
261         struct inode *inode = file_inode(dst_vma->vm_file);
262         pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
263         struct folio *folio;
264         struct page *page;
265         int ret;
266
267         ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
268         /* Our caller expects us to return -EFAULT if we failed to find the folio */
269         if (ret == -ENOENT)
270                 ret = -EFAULT;
271         if (ret)
272                 goto out;
273         if (!folio) {
274                 ret = -EFAULT;
275                 goto out;
276         }
277
278         page = folio_file_page(folio, pgoff);
279         if (PageHWPoison(page)) {
280                 ret = -EIO;
281                 goto out_release;
282         }
283
284         ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
285                                        page, false, wp_copy);
286         if (ret)
287                 goto out_release;
288
289         folio_unlock(folio);
290         ret = 0;
291 out:
292         return ret;
293 out_release:
294         folio_unlock(folio);
295         folio_put(folio);
296         goto out;
297 }
298
299 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
300 {
301         pgd_t *pgd;
302         p4d_t *p4d;
303         pud_t *pud;
304
305         pgd = pgd_offset(mm, address);
306         p4d = p4d_alloc(mm, pgd, address);
307         if (!p4d)
308                 return NULL;
309         pud = pud_alloc(mm, p4d, address);
310         if (!pud)
311                 return NULL;
312         /*
313          * Note that this is not run only because the pmd was
314          * missing; the *pmd may already be established and, in
315          * turn, it may also be a trans_huge_pmd.
316          */
317         return pmd_alloc(mm, pud, address);
318 }
319
320 #ifdef CONFIG_HUGETLB_PAGE
321 /*
322  * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
323  * called with mmap_lock held; it will release mmap_lock before returning.
324  */
325 static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
326                                               struct vm_area_struct *dst_vma,
327                                               unsigned long dst_start,
328                                               unsigned long src_start,
329                                               unsigned long len,
330                                               enum mcopy_atomic_mode mode,
331                                               bool wp_copy)
332 {
333         int vm_shared = dst_vma->vm_flags & VM_SHARED;
334         ssize_t err;
335         pte_t *dst_pte;
336         unsigned long src_addr, dst_addr;
337         long copied;
338         struct page *page;
339         unsigned long vma_hpagesize;
340         pgoff_t idx;
341         u32 hash;
342         struct address_space *mapping;
343
344         /*
345          * There is no default zero huge page for all huge page sizes as
346          * supported by hugetlb.  A PMD_SIZE zero huge page may exist, as used
347          * by THP.  Since we cannot reliably insert a zero page, this
348          * feature is not supported.
349          */
350         if (mode == MCOPY_ATOMIC_ZEROPAGE) {
351                 mmap_read_unlock(dst_mm);
352                 return -EINVAL;
353         }
354
355         src_addr = src_start;
356         dst_addr = dst_start;
357         copied = 0;
358         page = NULL;
359         vma_hpagesize = vma_kernel_pagesize(dst_vma);
360
361         /*
362          * Validate alignment based on huge page size
363          */
364         err = -EINVAL;
365         if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
366                 goto out_unlock;
367
368 retry:
369         /*
370          * On routine entry dst_vma is set.  If we had to drop mmap_lock and
371          * retry, dst_vma will be set to NULL and we must look it up again.
372          */
373         if (!dst_vma) {
374                 err = -ENOENT;
375                 dst_vma = find_dst_vma(dst_mm, dst_start, len);
376                 if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
377                         goto out_unlock;
378
379                 err = -EINVAL;
380                 if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
381                         goto out_unlock;
382
383                 vm_shared = dst_vma->vm_flags & VM_SHARED;
384         }
385
386         /*
387          * If not shared, ensure the dst_vma has an anon_vma.
388          */
389         err = -ENOMEM;
390         if (!vm_shared) {
391                 if (unlikely(anon_vma_prepare(dst_vma)))
392                         goto out_unlock;
393         }
394
395         while (src_addr < src_start + len) {
396                 BUG_ON(dst_addr >= dst_start + len);
397
398                 /*
399                  * Serialize via vma_lock and hugetlb_fault_mutex.
400                  * vma_lock ensures the dst_pte remains valid even
401                  * in the case of shared pmds.  The fault mutex prevents
402                  * races with other faulting threads.
403                  */
404                 idx = linear_page_index(dst_vma, dst_addr);
405                 mapping = dst_vma->vm_file->f_mapping;
406                 hash = hugetlb_fault_mutex_hash(mapping, idx);
407                 mutex_lock(&hugetlb_fault_mutex_table[hash]);
408                 hugetlb_vma_lock_read(dst_vma);
409
410                 err = -ENOMEM;
411                 dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
412                 if (!dst_pte) {
413                         hugetlb_vma_unlock_read(dst_vma);
414                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
415                         goto out_unlock;
416                 }
417
418                 if (mode != MCOPY_ATOMIC_CONTINUE &&
419                     !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
420                         err = -EEXIST;
421                         hugetlb_vma_unlock_read(dst_vma);
422                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
423                         goto out_unlock;
424                 }
425
426                 err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
427                                                dst_addr, src_addr, mode, &page,
428                                                wp_copy);
429
430                 hugetlb_vma_unlock_read(dst_vma);
431                 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
432
433                 cond_resched();
434
435                 if (unlikely(err == -ENOENT)) {
436                         mmap_read_unlock(dst_mm);
437                         BUG_ON(!page);
438
439                         err = copy_huge_page_from_user(page,
440                                                 (const void __user *)src_addr,
441                                                 vma_hpagesize / PAGE_SIZE,
442                                                 true);
443                         if (unlikely(err)) {
444                                 err = -EFAULT;
445                                 goto out;
446                         }
447                         mmap_read_lock(dst_mm);
448
449                         dst_vma = NULL;
450                         goto retry;
451                 } else
452                         BUG_ON(page);
453
454                 if (!err) {
455                         dst_addr += vma_hpagesize;
456                         src_addr += vma_hpagesize;
457                         copied += vma_hpagesize;
458
459                         if (fatal_signal_pending(current))
460                                 err = -EINTR;
461                 }
462                 if (err)
463                         break;
464         }
465
466 out_unlock:
467         mmap_read_unlock(dst_mm);
468 out:
469         if (page)
470                 put_page(page);
471         BUG_ON(copied < 0);
472         BUG_ON(err > 0);
473         BUG_ON(!copied && !err);
474         return copied ? copied : err;
475 }
476 #else /* !CONFIG_HUGETLB_PAGE */
477 /* fail at build time if gcc attempts to use this */
478 extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
479                                       struct vm_area_struct *dst_vma,
480                                       unsigned long dst_start,
481                                       unsigned long src_start,
482                                       unsigned long len,
483                                       enum mcopy_atomic_mode mode,
484                                       bool wp_copy);
485 #endif /* CONFIG_HUGETLB_PAGE */
486
487 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
488                                                 pmd_t *dst_pmd,
489                                                 struct vm_area_struct *dst_vma,
490                                                 unsigned long dst_addr,
491                                                 unsigned long src_addr,
492                                                 struct page **page,
493                                                 enum mcopy_atomic_mode mode,
494                                                 bool wp_copy)
495 {
496         ssize_t err;
497
498         if (mode == MCOPY_ATOMIC_CONTINUE) {
499                 return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
500                                             wp_copy);
501         }
502
503         /*
504          * The normal page fault path for a shmem will invoke the
505          * fault, fill the hole in the file and COW it right away. The
506          * result generates plain anonymous memory. So when we are
507          * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
508          * generate anonymous memory directly without actually filling
509          * the hole. For the MAP_PRIVATE case the robustness check
510          * only happens in the pagetable (to verify it's still none)
511          * and not in the radix tree.
512          */
513         if (!(dst_vma->vm_flags & VM_SHARED)) {
514                 if (mode == MCOPY_ATOMIC_NORMAL)
515                         err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
516                                                dst_addr, src_addr, page,
517                                                wp_copy);
518                 else
519                         err = mfill_zeropage_pte(dst_mm, dst_pmd,
520                                                  dst_vma, dst_addr);
521         } else {
522                 err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
523                                              dst_addr, src_addr,
524                                              mode != MCOPY_ATOMIC_NORMAL,
525                                              wp_copy, page);
526         }
527
528         return err;
529 }
530
531 static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
532                                               unsigned long dst_start,
533                                               unsigned long src_start,
534                                               unsigned long len,
535                                               enum mcopy_atomic_mode mcopy_mode,
536                                               atomic_t *mmap_changing,
537                                               __u64 mode)
538 {
539         struct vm_area_struct *dst_vma;
540         ssize_t err;
541         pmd_t *dst_pmd;
542         unsigned long src_addr, dst_addr;
543         long copied;
544         struct page *page;
545         bool wp_copy;
546
547         /*
548          * Sanitize the command parameters:
549          */
550         BUG_ON(dst_start & ~PAGE_MASK);
551         BUG_ON(len & ~PAGE_MASK);
552
553         /* Does the address range wrap, or is the span zero-sized? */
554         BUG_ON(src_start + len <= src_start);
555         BUG_ON(dst_start + len <= dst_start);
556
557         src_addr = src_start;
558         dst_addr = dst_start;
559         copied = 0;
560         page = NULL;
561 retry:
562         mmap_read_lock(dst_mm);
563
564         /*
565          * If memory mappings are changing because of a non-cooperative
566          * operation (e.g. mremap) running in parallel, bail out and
567          * request the user to retry later.
568          */
569         err = -EAGAIN;
570         if (mmap_changing && atomic_read(mmap_changing))
571                 goto out_unlock;
572
573         /*
574          * Make sure the vma is not shared and that the dst range is
575          * both valid and fully within a single existing vma.
576          */
577         err = -ENOENT;
578         dst_vma = find_dst_vma(dst_mm, dst_start, len);
579         if (!dst_vma)
580                 goto out_unlock;
581
582         err = -EINVAL;
583         /*
584          * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
585          * it will overwrite vm_ops, so vma_is_anonymous must return false.
586          */
587         if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
588             dst_vma->vm_flags & VM_SHARED))
589                 goto out_unlock;
590
591         /*
592          * validate 'mode' now that we know the dst_vma: don't allow
593          * a wrprotect copy if the userfaultfd didn't register as WP.
594          */
595         wp_copy = mode & UFFDIO_COPY_MODE_WP;
596         if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
597                 goto out_unlock;
598
599         /*
600          * If this is a HUGETLB vma, pass off to the appropriate routine.
601          */
602         if (is_vm_hugetlb_page(dst_vma))
603                 return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
604                                                src_start, len, mcopy_mode,
605                                                wp_copy);
606
607         if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
608                 goto out_unlock;
609         if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
610                 goto out_unlock;
611
612         /*
613          * Ensure the dst_vma has an anon_vma or this page
614          * would get a NULL anon_vma when moved in the
615          * dst_vma.
616          */
617         err = -ENOMEM;
618         if (!(dst_vma->vm_flags & VM_SHARED) &&
619             unlikely(anon_vma_prepare(dst_vma)))
620                 goto out_unlock;
621
622         while (src_addr < src_start + len) {
623                 pmd_t dst_pmdval;
624
625                 BUG_ON(dst_addr >= dst_start + len);
626
627                 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
628                 if (unlikely(!dst_pmd)) {
629                         err = -ENOMEM;
630                         break;
631                 }
632
633                 dst_pmdval = pmd_read_atomic(dst_pmd);
634                 /*
635                  * If the dst_pmd is mapped as THP don't
636                  * override it and just be strict.
637                  */
638                 if (unlikely(pmd_trans_huge(dst_pmdval))) {
639                         err = -EEXIST;
640                         break;
641                 }
642                 if (unlikely(pmd_none(dst_pmdval)) &&
643                     unlikely(__pte_alloc(dst_mm, dst_pmd))) {
644                         err = -ENOMEM;
645                         break;
646                 }
647                 /* If a huge pmd materialized from under us, fail */
648                 if (unlikely(pmd_trans_huge(*dst_pmd))) {
649                         err = -EFAULT;
650                         break;
651                 }
652
653                 BUG_ON(pmd_none(*dst_pmd));
654                 BUG_ON(pmd_trans_huge(*dst_pmd));
655
656                 err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
657                                        src_addr, &page, mcopy_mode, wp_copy);
658                 cond_resched();
659
660                 if (unlikely(err == -ENOENT)) {
661                         void *page_kaddr;
662
663                         mmap_read_unlock(dst_mm);
664                         BUG_ON(!page);
665
666                         page_kaddr = kmap_local_page(page);
667                         err = copy_from_user(page_kaddr,
668                                              (const void __user *) src_addr,
669                                              PAGE_SIZE);
670                         kunmap_local(page_kaddr);
671                         if (unlikely(err)) {
672                                 err = -EFAULT;
673                                 goto out;
674                         }
675                         flush_dcache_page(page);
676                         goto retry;
677                 } else
678                         BUG_ON(page);
679
680                 if (!err) {
681                         dst_addr += PAGE_SIZE;
682                         src_addr += PAGE_SIZE;
683                         copied += PAGE_SIZE;
684
685                         if (fatal_signal_pending(current))
686                                 err = -EINTR;
687                 }
688                 if (err)
689                         break;
690         }
691
692 out_unlock:
693         mmap_read_unlock(dst_mm);
694 out:
695         if (page)
696                 put_page(page);
697         BUG_ON(copied < 0);
698         BUG_ON(err > 0);
699         BUG_ON(!copied && !err);
700         return copied ? copied : err;
701 }
702
703 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
704                      unsigned long src_start, unsigned long len,
705                      atomic_t *mmap_changing, __u64 mode)
706 {
707         return __mcopy_atomic(dst_mm, dst_start, src_start, len,
708                               MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
709 }
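
/*
 * Illustrative userspace counterpart (a sketch, not part of this file):
 * mcopy_atomic() services the UFFDIO_COPY ioctl.  A monitor resolving a
 * missing fault might issue something like the following, where "uffd",
 * "fault_addr", "src_buf" and "page_size" are assumed to come from the
 * monitor's own fault-handling loop:
 *
 *        struct uffdio_copy copy = {
 *                .dst  = fault_addr & ~(page_size - 1),
 *                .src  = (unsigned long)src_buf,
 *                .len  = page_size,
 *                .mode = 0,
 *        };
 *        if (ioctl(uffd, UFFDIO_COPY, &copy) == -1 && errno == EAGAIN)
 *                handle_partial_copy(&copy);
 *
 * .mode may also carry UFFDIO_COPY_MODE_WP to request a write-protected
 * copy (the wp_copy path above) or UFFDIO_COPY_MODE_DONTWAKE.  On a partial
 * copy the kernel reports the bytes copied in copy.copy and the ioctl fails
 * with EAGAIN; EAGAIN is also returned while mappings are changing, per the
 * mmap_changing check above.  handle_partial_copy() is a hypothetical
 * helper for those cases.
 */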
710
711 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
712                        unsigned long len, atomic_t *mmap_changing)
713 {
714         return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
715                               mmap_changing, 0);
716 }
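
/*
 * Illustrative userspace counterpart (a sketch, not part of this file):
 * mfill_zeropage() services the UFFDIO_ZEROPAGE ioctl, e.g.:
 *
 *        struct uffdio_zeropage zp = {
 *                .range = { .start = fault_addr & ~(page_size - 1),
 *                           .len   = page_size },
 *                .mode  = 0,
 *        };
 *        ioctl(uffd, UFFDIO_ZEROPAGE, &zp);
 *
 * As enforced in __mcopy_atomic_hugetlb(), zeropage is not supported for
 * hugetlb VMAs, so this only applies to anonymous and shmem ranges.
 */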
717
718 ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
719                        unsigned long len, atomic_t *mmap_changing)
720 {
721         return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
722                               mmap_changing, 0);
723 }
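
/*
 * Illustrative userspace counterpart (a sketch, not part of this file):
 * mcopy_continue() services UFFDIO_CONTINUE, used to resolve minor faults
 * where the page cache is already populated and only the page tables need
 * to be filled in (see mcontinue_atomic_pte() and SGP_NOALLOC above):
 *
 *        struct uffdio_continue cont = {
 *                .range = { .start = fault_addr & ~(page_size - 1),
 *                           .len   = page_size },
 *                .mode  = 0,
 *        };
 *        ioctl(uffd, UFFDIO_CONTINUE, &cont);
 */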
724
725 void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
726                    unsigned long start, unsigned long len, bool enable_wp)
727 {
728         struct mmu_gather tlb;
729         pgprot_t newprot;
730
731         if (enable_wp)
732                 newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
733         else
734                 newprot = vm_get_page_prot(dst_vma->vm_flags);
735
736         tlb_gather_mmu(&tlb, dst_mm);
737         change_protection(&tlb, dst_vma, start, start + len, newprot,
738                           enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
739         tlb_finish_mmu(&tlb);
740 }
741
742 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
743                         unsigned long len, bool enable_wp,
744                         atomic_t *mmap_changing)
745 {
746         struct vm_area_struct *dst_vma;
747         unsigned long page_mask;
748         int err;
749
750         /*
751          * Sanitize the command parameters:
752          */
753         BUG_ON(start & ~PAGE_MASK);
754         BUG_ON(len & ~PAGE_MASK);
755
756         /* Does the address range wrap, or is the span zero-sized? */
757         BUG_ON(start + len <= start);
758
759         mmap_read_lock(dst_mm);
760
761         /*
762          * If memory mappings are changing because of a non-cooperative
763          * operation (e.g. mremap) running in parallel, bail out and
764          * request the user to retry later.
765          */
766         err = -EAGAIN;
767         if (mmap_changing && atomic_read(mmap_changing))
768                 goto out_unlock;
769
770         err = -ENOENT;
771         dst_vma = find_dst_vma(dst_mm, start, len);
772
773         if (!dst_vma)
774                 goto out_unlock;
775         if (!userfaultfd_wp(dst_vma))
776                 goto out_unlock;
777         if (!vma_can_userfault(dst_vma, dst_vma->vm_flags))
778                 goto out_unlock;
779
780         if (is_vm_hugetlb_page(dst_vma)) {
781                 err = -EINVAL;
782                 page_mask = vma_kernel_pagesize(dst_vma) - 1;
783                 if ((start & page_mask) || (len & page_mask))
784                         goto out_unlock;
785         }
786
787         uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
788
789         err = 0;
790 out_unlock:
791         mmap_read_unlock(dst_mm);
792         return err;
793 }
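
/*
 * Illustrative userspace counterpart (a sketch, not part of this file):
 * mwriteprotect_range() services the UFFDIO_WRITEPROTECT ioctl.  Write
 * protection over a range registered with UFFDIO_REGISTER_MODE_WP is
 * enabled with UFFDIO_WRITEPROTECT_MODE_WP and resolved by issuing the
 * same ioctl with that bit cleared (the enable_wp argument above):
 *
 *        struct uffdio_writeprotect wp = {
 *                .range = { .start = start, .len = len },
 *                .mode  = UFFDIO_WRITEPROTECT_MODE_WP,
 *        };
 *        ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
 */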