/*
 *	mm/mremap.c
 *
 *	(C) Copyright 1996 Linus Torvalds
 *
 *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/mm-arch-hooks.h>
#include <linux/userfaultfd_k.h>

#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

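/*
 * Walk the existing page tables and return the pmd covering @addr,
 * or NULL if any intermediate level is missing or found corrupt.
 */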
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none_or_clear_bad(p4d))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none_or_clear_bad(pud))
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;

	return pmd;
}

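/*
 * Allocate any missing p4d/pud/pmd levels covering @addr in the
 * destination page tables and return the pmd, or NULL if an
 * allocation fails.  The returned pmd must not be a huge pmd.
 */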
static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			    unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));

	return pmd;
}

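/*
 * Take/drop the rmap locks: the i_mmap_rwsem of a file mapping and the
 * anon_vma lock of an anonymous mapping, so that rmap walkers cannot
 * run concurrently with the pte move.
 */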
static void take_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->vm_file)
		i_mmap_lock_write(vma->vm_file->f_mapping);
	if (vma->anon_vma)
		anon_vma_lock_write(vma->anon_vma);
}

static void drop_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->anon_vma)
		anon_vma_unlock_write(vma->anon_vma);
	if (vma->vm_file)
		i_mmap_unlock_write(vma->vm_file->f_mapping);
}

static pte_t move_soft_dirty_pte(pte_t pte)
{
	/*
	 * Set soft dirty bit so we can notice
	 * in userspace the ptes were moved.
	 */
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (pte_present(pte))
		pte = pte_mksoft_dirty(pte);
	else if (is_swap_pte(pte))
		pte = pte_swp_mksoft_dirty(pte);
#endif
	return pte;
}

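/*
 * Move the ptes mapping [old_addr, old_end) under @old_pmd in @vma to
 * the matching range under @new_pmd in @new_vma, with both pte locks
 * held.  If a dirty pte was moved the TLB is flushed here; otherwise
 * *need_flush is set and the flush is left to the caller.
 */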
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
		unsigned long old_addr, unsigned long old_end,
		struct vm_area_struct *new_vma, pmd_t *new_pmd,
		unsigned long new_addr, bool need_rmap_locks, bool *need_flush)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_pte, *new_pte, pte;
	spinlock_t *old_ptl, *new_ptl;
	bool force_flush = false;
	unsigned long len = old_end - old_addr;

	/*
	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
	 * locks to ensure that rmap will always observe either the old or the
	 * new ptes. This is the easiest way to avoid races with
	 * truncate_pagecache(), page migration, etc...
	 *
	 * When need_rmap_locks is false, we use other ways to avoid
	 * such races:
	 *
	 * - During exec() shift_arg_pages(), we use a specially tagged vma
	 *   which rmap call sites look for using is_vma_temporary_stack().
	 *
	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
	 *   either the old pte, or the new pte, or both (the page table locks
	 *   serialize access to individual ptes, but only rmap traversal
	 *   order guarantees that we won't miss both the old and new ptes).
	 */
	if (need_rmap_locks)
		take_rmap_locks(vma);

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_sem prevents deadlock.
	 */
	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	new_pte = pte_offset_map(new_pmd, new_addr);
	new_ptl = pte_lockptr(mm, new_pmd);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	arch_enter_lazy_mmu_mode();

	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
				   new_pte++, new_addr += PAGE_SIZE) {
		if (pte_none(*old_pte))
			continue;

		pte = ptep_get_and_clear(mm, old_addr, old_pte);
		/*
		 * If we are remapping a dirty PTE, make sure
		 * to flush TLB before we drop the PTL for the
		 * old PTE or we may race with page_mkclean().
		 *
		 * This check has to be done after we removed the
		 * old PTE from page tables or another thread may
		 * dirty it after the check and before the removal.
		 */
		if (pte_present(pte) && pte_dirty(pte))
			force_flush = true;
		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
		pte = move_soft_dirty_pte(pte);
		set_pte_at(mm, new_addr, new_pte, pte);
	}

	arch_leave_lazy_mmu_mode();
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	pte_unmap(new_pte - 1);
	if (force_flush)
		flush_tlb_range(vma, old_end - len, old_end);
	else
		*need_flush = true;
	pte_unmap_unlock(old_pte - 1, old_ptl);
	if (need_rmap_locks)
		drop_rmap_locks(vma);
}

#define LATENCY_LIMIT	(64 * PAGE_SIZE)

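/*
 * Move @len bytes worth of page table entries from @old_addr in @vma to
 * @new_addr in @new_vma, one pmd extent at a time.  Huge pmds are moved
 * whole when a fully aligned HPAGE_PMD_SIZE extent is available,
 * otherwise they are split first.  Returns the number of bytes actually
 * moved, which may be less than @len if a page table allocation fails.
 */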
unsigned long move_page_tables(struct vm_area_struct *vma,
		unsigned long old_addr, struct vm_area_struct *new_vma,
		unsigned long new_addr, unsigned long len,
		bool need_rmap_locks)
{
	unsigned long extent, next, old_end;
	pmd_t *old_pmd, *new_pmd;
	bool need_flush = false;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	old_end = old_addr + len;
	flush_cache_range(vma, old_addr, old_end);

	mmun_start = old_addr;
	mmun_end   = old_end;
	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);

	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
		cond_resched();
		next = (old_addr + PMD_SIZE) & PMD_MASK;
		/* even if next overflowed, extent below will be ok */
		extent = next - old_addr;
		if (extent > old_end - old_addr)
			extent = old_end - old_addr;
		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
		if (!old_pmd)
			continue;
		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
		if (!new_pmd)
			break;
		if (pmd_trans_huge(*old_pmd)) {
			if (extent == HPAGE_PMD_SIZE) {
				bool moved;
				/* See comment in move_ptes() */
				if (need_rmap_locks)
					take_rmap_locks(vma);
				moved = move_huge_pmd(vma, old_addr, new_addr,
						    old_end, old_pmd, new_pmd,
						    &need_flush);
				if (need_rmap_locks)
					drop_rmap_locks(vma);
				if (moved)
					continue;
			}
			split_huge_pmd(vma, old_pmd, old_addr);
			if (pmd_trans_unstable(old_pmd))
				continue;
		}
		if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
			break;
		next = (new_addr + PMD_SIZE) & PMD_MASK;
		if (extent > next - new_addr)
			extent = next - new_addr;
		if (extent > LATENCY_LIMIT)
			extent = LATENCY_LIMIT;
		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
			  new_pmd, new_addr, need_rmap_locks, &need_flush);
	}
	if (need_flush)
		flush_tlb_range(vma, old_end-len, old_addr);

	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);

	return len + old_addr - old_end;	/* how much done */
}

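/*
 * Move the range [old_addr, old_addr + old_len) of @vma to @new_addr:
 * copy the vma, move its page tables, then unmap the old range while
 * keeping VM_ACCOUNT charges, statistics and mlock state consistent.
 * Returns the new address, or an error value on failure.
 */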
static unsigned long move_vma(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long old_len,
		unsigned long new_len, unsigned long new_addr,
		bool *locked, struct vm_userfaultfd_ctx *uf,
		struct list_head *uf_unmap)
{
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma;
	unsigned long vm_flags = vma->vm_flags;
	unsigned long new_pgoff;
	unsigned long moved_len;
	unsigned long excess = 0;
	unsigned long hiwater_vm;
	int split = 0;
	int err;
	bool need_rmap_locks;

	/*
	 * We'd prefer to avoid failure later on in do_munmap:
	 * which may split one vma into three before unmapping.
	 */
	if (mm->map_count >= sysctl_max_map_count - 3)
		return -ENOMEM;

	/*
	 * Advise KSM to break any KSM pages in the area to be moved:
	 * it would be confusing if they were to turn up at the new
	 * location, where they happen to coincide with different KSM
	 * pages recently unmapped.  But leave vma->vm_flags as it was,
	 * so KSM can come around to merge on vma and new_vma afterwards.
	 */
	err = ksm_madvise(vma, old_addr, old_addr + old_len,
						MADV_UNMERGEABLE, &vm_flags);
	if (err)
		return err;

	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
			   &need_rmap_locks);
	if (!new_vma)
		return -ENOMEM;

	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
				     need_rmap_locks);
	if (moved_len < old_len) {
		err = -ENOMEM;
	} else if (vma->vm_ops && vma->vm_ops->mremap) {
		err = vma->vm_ops->mremap(new_vma);
	}

	if (unlikely(err)) {
		/*
		 * On error, move entries back from new area to old,
		 * which will succeed since page tables still there,
		 * and then proceed to unmap new area instead of old.
		 */
		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
				 true);
		vma = new_vma;
		old_len = new_len;
		old_addr = new_addr;
		new_addr = err;
	} else {
		mremap_userfaultfd_prep(new_vma, uf);
		arch_remap(mm, old_addr, old_addr + old_len,
			   new_addr, new_addr + new_len);
	}

	/* Conceal VM_ACCOUNT so old reservation is not undone */
	if (vm_flags & VM_ACCOUNT) {
		vma->vm_flags &= ~VM_ACCOUNT;
		excess = vma->vm_end - vma->vm_start - old_len;
		if (old_addr > vma->vm_start &&
		    old_addr + old_len < vma->vm_end)
			split = 1;
	}

	/*
	 * If we failed to move page tables we still do total_vm increment
	 * since do_munmap() will decrement it by old_len == new_len.
	 *
	 * Since total_vm is about to be raised artificially high for a
	 * moment, we need to restore high watermark afterwards: if stats
	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
	 * If this were a serious issue, we'd add a flag to do_munmap().
	 */
	hiwater_vm = mm->hiwater_vm;
	vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);

	/* Tell pfnmap has moved from this vma */
	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn_moved(vma);

	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
		/* OOM: unable to split vma, just get accounts right */
		vm_unacct_memory(excess >> PAGE_SHIFT);
		excess = 0;
	}
	mm->hiwater_vm = hiwater_vm;

	/* Restore VM_ACCOUNT if one or two pieces of vma left */
	if (excess) {
		vma->vm_flags |= VM_ACCOUNT;
		if (split)
			vma->vm_next->vm_flags |= VM_ACCOUNT;
	}

	if (vm_flags & VM_LOCKED) {
		mm->locked_vm += new_len >> PAGE_SHIFT;
		*locked = true;
	}

	return new_addr;
}

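/*
 * Look up and validate the vma at @addr for a resize from @old_len to
 * @new_len: reject hugetlb and VM_DONTEXPAND/VM_PFNMAP mappings, ranges
 * crossing a vma boundary, and growth that would exceed the mlock or
 * address-space limits.  On success, *p holds the number of pages
 * charged to the commit accounting for VM_ACCOUNT vmas.
 */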
static struct vm_area_struct *vma_to_resize(unsigned long addr,
	unsigned long old_len, unsigned long new_len, unsigned long *p)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = find_vma(mm, addr);
	unsigned long pgoff;

	if (!vma || vma->vm_start > addr)
		return ERR_PTR(-EFAULT);

	if (is_vm_hugetlb_page(vma))
		return ERR_PTR(-EINVAL);

	/* We can't remap across vm area boundaries */
	if (old_len > vma->vm_end - addr)
		return ERR_PTR(-EFAULT);

	if (new_len == old_len)
		return vma;

	/* Need to be careful about a growing mapping */
	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
	pgoff += vma->vm_pgoff;
	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
		return ERR_PTR(-EINVAL);

	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
		return ERR_PTR(-EFAULT);

	if (vma->vm_flags & VM_LOCKED) {
		unsigned long locked, lock_limit;
		locked = mm->locked_vm << PAGE_SHIFT;
		lock_limit = rlimit(RLIMIT_MEMLOCK);
		locked += new_len - old_len;
		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
			return ERR_PTR(-EAGAIN);
	}

	if (!may_expand_vm(mm, vma->vm_flags,
				(new_len - old_len) >> PAGE_SHIFT))
		return ERR_PTR(-ENOMEM);

	if (vma->vm_flags & VM_ACCOUNT) {
		unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
		if (security_vm_enough_memory_mm(mm, charged))
			return ERR_PTR(-ENOMEM);
		*p = charged;
	}

	return vma;
}

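/*
 * Handle the MREMAP_FIXED case: validate @new_addr, make sure the old
 * and new ranges do not overlap, unmap whatever currently occupies the
 * new range, shrink the old mapping if needed, then move it into place
 * with move_vma().
 */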
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
		unsigned long new_addr, unsigned long new_len, bool *locked,
		struct vm_userfaultfd_ctx *uf,
		struct list_head *uf_unmap)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	unsigned long charged = 0;
	unsigned long map_flags;

	if (offset_in_page(new_addr))
		goto out;

	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
		goto out;

	/* Ensure the old/new locations do not overlap */
	if (addr + old_len > new_addr && new_addr + new_len > addr)
		goto out;

	ret = do_munmap(mm, new_addr, new_len, NULL);
	if (ret)
		goto out;

	if (old_len >= new_len) {
		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
		if (ret && old_len != new_len)
			goto out;
		old_len = new_len;
	}

	vma = vma_to_resize(addr, old_len, new_len, &charged);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	map_flags = MAP_FIXED;
	if (vma->vm_flags & VM_MAYSHARE)
		map_flags |= MAP_SHARED;

	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
				((addr - vma->vm_start) >> PAGE_SHIFT),
				map_flags);
	if (offset_in_page(ret))
		goto out1;

	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
		       uf_unmap);
	if (!(offset_in_page(ret)))
		goto out;
out1:
	vm_unacct_memory(charged);

out:
	return ret;
}

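/*
 * Return 1 if @vma can grow in place by @delta bytes: the expanded end
 * must not overflow, must not run into the following vma, and must be
 * acceptable to get_unmapped_area().
 */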
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
	unsigned long end = vma->vm_end + delta;
	if (end < vma->vm_end) /* overflow */
		return 0;
	if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
		return 0;
	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
			      0, MAP_FIXED) & ~PAGE_MASK)
		return 0;
	return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
		unsigned long, new_len, unsigned long, flags,
		unsigned long, new_addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	unsigned long charged = 0;
	bool locked = false;
	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
	LIST_HEAD(uf_unmap);

	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
		return ret;

	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
		return ret;

	if (offset_in_page(addr))
		return ret;

	old_len = PAGE_ALIGN(old_len);
	new_len = PAGE_ALIGN(new_len);

	/*
	 * We allow a zero old-len as a special case
	 * for DOS-emu "duplicate shm area" thing. But
	 * a zero new-len is nonsensical.
	 */
	if (!new_len)
		return ret;

	if (down_write_killable(&current->mm->mmap_sem))
		return -EINTR;

	if (flags & MREMAP_FIXED) {
		ret = mremap_to(addr, old_len, new_addr, new_len,
				&locked, &uf, &uf_unmap);
		goto out;
	}

	/*
	 * Always allow a shrinking remap: that just unmaps
	 * the unnecessary pages..
	 * do_munmap does all the needed commit accounting
	 */
	if (old_len >= new_len) {
		ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
		if (ret && old_len != new_len)
			goto out;
		ret = addr;
		goto out;
	}

	/*
	 * Ok, we need to grow..
	 */
	vma = vma_to_resize(addr, old_len, new_len, &charged);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	/* old_len exactly to the end of the area..
	 */
	if (old_len == vma->vm_end - addr) {
		/* can we just expand the current mapping? */
		if (vma_expandable(vma, new_len - old_len)) {
			int pages = (new_len - old_len) >> PAGE_SHIFT;

			if (vma_adjust(vma, vma->vm_start, addr + new_len,
				       vma->vm_pgoff, NULL)) {
				ret = -ENOMEM;
				goto out;
			}

			vm_stat_account(mm, vma->vm_flags, pages);
			if (vma->vm_flags & VM_LOCKED) {
				mm->locked_vm += pages;
				locked = true;
				new_addr = addr;
			}
			ret = addr;
			goto out;
		}
	}

	/*
	 * We weren't able to just expand or shrink the area,
	 * we need to create a new one and move it..
	 */
	ret = -ENOMEM;
	if (flags & MREMAP_MAYMOVE) {
		unsigned long map_flags = 0;
		if (vma->vm_flags & VM_MAYSHARE)
			map_flags |= MAP_SHARED;

		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
					vma->vm_pgoff +
					((addr - vma->vm_start) >> PAGE_SHIFT),
					map_flags);
		if (offset_in_page(new_addr)) {
			ret = new_addr;
			goto out;
		}

		ret = move_vma(vma, addr, old_len, new_len, new_addr,
			       &locked, &uf, &uf_unmap);
	}
out:
	if (offset_in_page(ret)) {
		vm_unacct_memory(charged);
		locked = 0;
	}
	up_write(&current->mm->mmap_sem);
	if (locked && new_len > old_len)
		mm_populate(new_addr + old_len, new_len - old_len);
	mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
	userfaultfd_unmap_complete(mm, &uf_unmap);
	return ret;
}