mm,ksm: add __GFP_HIGH to the allocation in alloc_stable_node()
[platform/kernel/linux-exynos.git] / mm / mprotect.c
1 /*
2  *  mm/mprotect.c
3  *
4  *  (C) Copyright 1994 Linus Torvalds
5  *  (C) Copyright 2002 Christoph Hellwig
6  *
7  *  Address space accounting code       <alan@lxorguk.ukuu.org.uk>
8  *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
9  */
10
11 #include <linux/mm.h>
12 #include <linux/hugetlb.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/fs.h>
16 #include <linux/highmem.h>
17 #include <linux/security.h>
18 #include <linux/mempolicy.h>
19 #include <linux/personality.h>
20 #include <linux/syscalls.h>
21 #include <linux/swap.h>
22 #include <linux/swapops.h>
23 #include <linux/mmu_notifier.h>
24 #include <linux/migrate.h>
25 #include <linux/perf_event.h>
26 #include <linux/ksm.h>
27 #include <linux/pkeys.h>
28 #include <asm/uaccess.h>
29 #include <asm/pgtable.h>
30 #include <asm/cacheflush.h>
31 #include <asm/tlbflush.h>
32
33 #include "internal.h"
34
35 /*
36  * For a prot_numa update we only hold mmap_sem for read so there is a
37  * potential race with faulting where a pmd was temporarily none. This
38  * function checks for a transhuge pmd under the appropriate lock. It
39  * returns a pte if it was successfully locked or NULL if it raced with
40  * a transhuge insertion.
41  */
42 static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
43                         unsigned long addr, int prot_numa, spinlock_t **ptl)
44 {
45         pte_t *pte;
46         spinlock_t *pmdl;
47
48         /* !prot_numa is protected by mmap_sem held for write */
49         if (!prot_numa)
50                 return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
51
52         pmdl = pmd_lock(vma->vm_mm, pmd);
53         if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
54                 spin_unlock(pmdl);
55                 return NULL;
56         }
57
58         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
59         spin_unlock(pmdl);
60         return pte;
61 }
62
63 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
64                 unsigned long addr, unsigned long end, pgprot_t newprot,
65                 int dirty_accountable, int prot_numa)
66 {
67         struct mm_struct *mm = vma->vm_mm;
68         pte_t *pte, oldpte;
69         spinlock_t *ptl;
70         unsigned long pages = 0;
71
72         pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
73         if (!pte)
74                 return 0;
75
76         arch_enter_lazy_mmu_mode();
77         do {
78                 oldpte = *pte;
79                 if (pte_present(oldpte)) {
80                         pte_t ptent;
81                         bool preserve_write = prot_numa && pte_write(oldpte);
82
83                         /*
84                          * Avoid trapping faults against the zero or KSM
85                          * pages. See similar comment in change_huge_pmd.
86                          */
87                         if (prot_numa) {
88                                 struct page *page;
89
90                                 page = vm_normal_page(vma, addr, oldpte);
91                                 if (!page || PageKsm(page))
92                                         continue;
93
94                                 /* Avoid TLB flush if possible */
95                                 if (pte_protnone(oldpte))
96                                         continue;
97                         }
98
99                         ptent = ptep_modify_prot_start(mm, addr, pte);
100                         ptent = pte_modify(ptent, newprot);
101                         if (preserve_write)
102                                 ptent = pte_mkwrite(ptent);
103
104                         /* Avoid taking write faults for known dirty pages */
105                         if (dirty_accountable && pte_dirty(ptent) &&
106                                         (pte_soft_dirty(ptent) ||
107                                          !(vma->vm_flags & VM_SOFTDIRTY))) {
108                                 ptent = pte_mkwrite(ptent);
109                         }
110                         ptep_modify_prot_commit(mm, addr, pte, ptent);
111                         pages++;
112                 } else if (IS_ENABLED(CONFIG_MIGRATION)) {
113                         swp_entry_t entry = pte_to_swp_entry(oldpte);
114
115                         if (is_write_migration_entry(entry)) {
116                                 pte_t newpte;
117                                 /*
118                                  * A protection check is difficult so
119                                  * just be safe and disable write
120                                  */
121                                 make_migration_entry_read(&entry);
122                                 newpte = swp_entry_to_pte(entry);
123                                 if (pte_swp_soft_dirty(oldpte))
124                                         newpte = pte_swp_mksoft_dirty(newpte);
125                                 set_pte_at(mm, addr, pte, newpte);
126
127                                 pages++;
128                         }
129                 }
130         } while (pte++, addr += PAGE_SIZE, addr != end);
131         arch_leave_lazy_mmu_mode();
132         pte_unmap_unlock(pte - 1, ptl);
133
134         return pages;
135 }
136
137 static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
138                 pud_t *pud, unsigned long addr, unsigned long end,
139                 pgprot_t newprot, int dirty_accountable, int prot_numa)
140 {
141         pmd_t *pmd;
142         struct mm_struct *mm = vma->vm_mm;
143         unsigned long next;
144         unsigned long pages = 0;
145         unsigned long nr_huge_updates = 0;
146         unsigned long mni_start = 0;
147
148         pmd = pmd_offset(pud, addr);
149         do {
150                 unsigned long this_pages;
151
152                 next = pmd_addr_end(addr, end);
153                 if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
154                                 && pmd_none_or_clear_bad(pmd))
155                         continue;
156
157                 /* invoke the mmu notifier if the pmd is populated */
158                 if (!mni_start) {
159                         mni_start = addr;
160                         mmu_notifier_invalidate_range_start(mm, mni_start, end);
161                 }
162
163                 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
164                         if (next - addr != HPAGE_PMD_SIZE) {
165                                 split_huge_pmd(vma, pmd, addr);
166                                 if (pmd_trans_unstable(pmd))
167                                         continue;
168                         } else {
169                                 int nr_ptes = change_huge_pmd(vma, pmd, addr,
170                                                 newprot, prot_numa);
171
172                                 if (nr_ptes) {
173                                         if (nr_ptes == HPAGE_PMD_NR) {
174                                                 pages += HPAGE_PMD_NR;
175                                                 nr_huge_updates++;
176                                         }
177
178                                         /* huge pmd was handled */
179                                         continue;
180                                 }
181                         }
182                         /* fall through, the trans huge pmd just split */
183                 }
184                 this_pages = change_pte_range(vma, pmd, addr, next, newprot,
185                                  dirty_accountable, prot_numa);
186                 pages += this_pages;
187         } while (pmd++, addr = next, addr != end);
188
189         if (mni_start)
190                 mmu_notifier_invalidate_range_end(mm, mni_start, end);
191
192         if (nr_huge_updates)
193                 count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
194         return pages;
195 }
196
197 static inline unsigned long change_pud_range(struct vm_area_struct *vma,
198                 pgd_t *pgd, unsigned long addr, unsigned long end,
199                 pgprot_t newprot, int dirty_accountable, int prot_numa)
200 {
201         pud_t *pud;
202         unsigned long next;
203         unsigned long pages = 0;
204
205         pud = pud_offset(pgd, addr);
206         do {
207                 next = pud_addr_end(addr, end);
208                 if (pud_none_or_clear_bad(pud))
209                         continue;
210                 pages += change_pmd_range(vma, pud, addr, next, newprot,
211                                  dirty_accountable, prot_numa);
212         } while (pud++, addr = next, addr != end);
213
214         return pages;
215 }
216
217 static unsigned long change_protection_range(struct vm_area_struct *vma,
218                 unsigned long addr, unsigned long end, pgprot_t newprot,
219                 int dirty_accountable, int prot_numa)
220 {
221         struct mm_struct *mm = vma->vm_mm;
222         pgd_t *pgd;
223         unsigned long next;
224         unsigned long start = addr;
225         unsigned long pages = 0;
226
227         BUG_ON(addr >= end);
228         pgd = pgd_offset(mm, addr);
229         flush_cache_range(vma, addr, end);
230         set_tlb_flush_pending(mm);
231         do {
232                 next = pgd_addr_end(addr, end);
233                 if (pgd_none_or_clear_bad(pgd))
234                         continue;
235                 pages += change_pud_range(vma, pgd, addr, next, newprot,
236                                  dirty_accountable, prot_numa);
237         } while (pgd++, addr = next, addr != end);
238
239         /* Only flush the TLB if we actually modified any entries: */
240         if (pages)
241                 flush_tlb_range(vma, start, end);
242         clear_tlb_flush_pending(mm);
243
244         return pages;
245 }
246
247 unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
248                        unsigned long end, pgprot_t newprot,
249                        int dirty_accountable, int prot_numa)
250 {
251         unsigned long pages;
252
253         if (is_vm_hugetlb_page(vma))
254                 pages = hugetlb_change_protection(vma, start, end, newprot);
255         else
256                 pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
257
258         return pages;
259 }
260
261 int
262 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
263         unsigned long start, unsigned long end, unsigned long newflags)
264 {
265         struct mm_struct *mm = vma->vm_mm;
266         unsigned long oldflags = vma->vm_flags;
267         long nrpages = (end - start) >> PAGE_SHIFT;
268         unsigned long charged = 0;
269         pgoff_t pgoff;
270         int error;
271         int dirty_accountable = 0;
272
273         if (newflags == oldflags) {
274                 *pprev = vma;
275                 return 0;
276         }
277
278         /*
279          * If we make a private mapping writable we increase our commit;
280          * but (without finer accounting) cannot reduce our commit if we
281          * make it unwritable again. hugetlb mapping were accounted for
282          * even if read-only so there is no need to account for them here
283          */
284         if (newflags & VM_WRITE) {
285                 /* Check space limits when area turns into data. */
286                 if (!may_expand_vm(mm, newflags, nrpages) &&
287                                 may_expand_vm(mm, oldflags, nrpages))
288                         return -ENOMEM;
289                 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
290                                                 VM_SHARED|VM_NORESERVE))) {
291                         charged = nrpages;
292                         if (security_vm_enough_memory_mm(mm, charged))
293                                 return -ENOMEM;
294                         newflags |= VM_ACCOUNT;
295                 }
296         }
297
298         /*
299          * First try to merge with previous and/or next vma.
300          */
301         pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
302         *pprev = vma_merge(mm, *pprev, start, end, newflags,
303                            vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
304                            vma->vm_userfaultfd_ctx);
305         if (*pprev) {
306                 vma = *pprev;
307                 goto success;
308         }
309
310         *pprev = vma;
311
312         if (start != vma->vm_start) {
313                 error = split_vma(mm, vma, start, 1);
314                 if (error)
315                         goto fail;
316         }
317
318         if (end != vma->vm_end) {
319                 error = split_vma(mm, vma, end, 0);
320                 if (error)
321                         goto fail;
322         }
323
324 success:
325         /*
326          * vm_flags and vm_page_prot are protected by the mmap_sem
327          * held in write mode.
328          */
329         vma->vm_flags = newflags;
330         dirty_accountable = vma_wants_writenotify(vma);
331         vma_set_page_prot(vma);
332
333         change_protection(vma, start, end, vma->vm_page_prot,
334                           dirty_accountable, 0);
335
336         /*
337          * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
338          * fault on access.
339          */
340         if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
341                         (newflags & VM_WRITE)) {
342                 populate_vma_page_range(vma, start, end, NULL);
343         }
344
345         vm_stat_account(mm, oldflags, -nrpages);
346         vm_stat_account(mm, newflags, nrpages);
347         perf_event_mmap(vma);
348         return 0;
349
350 fail:
351         vm_unacct_memory(charged);
352         return error;
353 }
354
355 SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
356                 unsigned long, prot)
357 {
358         unsigned long nstart, end, tmp, reqprot;
359         struct vm_area_struct *vma, *prev;
360         int error = -EINVAL;
361         const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
362         const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
363                                 (prot & PROT_READ);
364
365         prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
366         if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
367                 return -EINVAL;
368
369         if (start & ~PAGE_MASK)
370                 return -EINVAL;
371         if (!len)
372                 return 0;
373         len = PAGE_ALIGN(len);
374         end = start + len;
375         if (end <= start)
376                 return -ENOMEM;
377         if (!arch_validate_prot(prot))
378                 return -EINVAL;
379
380         reqprot = prot;
381
382         if (down_write_killable(&current->mm->mmap_sem))
383                 return -EINTR;
384
385         vma = find_vma(current->mm, start);
386         error = -ENOMEM;
387         if (!vma)
388                 goto out;
389         prev = vma->vm_prev;
390         if (unlikely(grows & PROT_GROWSDOWN)) {
391                 if (vma->vm_start >= end)
392                         goto out;
393                 start = vma->vm_start;
394                 error = -EINVAL;
395                 if (!(vma->vm_flags & VM_GROWSDOWN))
396                         goto out;
397         } else {
398                 if (vma->vm_start > start)
399                         goto out;
400                 if (unlikely(grows & PROT_GROWSUP)) {
401                         end = vma->vm_end;
402                         error = -EINVAL;
403                         if (!(vma->vm_flags & VM_GROWSUP))
404                                 goto out;
405                 }
406         }
407         if (start > vma->vm_start)
408                 prev = vma;
409
410         for (nstart = start ; ; ) {
411                 unsigned long newflags;
412                 int pkey = arch_override_mprotect_pkey(vma, prot, -1);
413
414                 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
415
416                 /* Does the application expect PROT_READ to imply PROT_EXEC */
417                 if (rier && (vma->vm_flags & VM_MAYEXEC))
418                         prot |= PROT_EXEC;
419
420                 newflags = calc_vm_prot_bits(prot, pkey);
421                 newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
422
423                 /* newflags >> 4 shift VM_MAY% in place of VM_% */
424                 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
425                         error = -EACCES;
426                         goto out;
427                 }
428
429                 error = security_file_mprotect(vma, reqprot, prot);
430                 if (error)
431                         goto out;
432
433                 tmp = vma->vm_end;
434                 if (tmp > end)
435                         tmp = end;
436                 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
437                 if (error)
438                         goto out;
439                 nstart = tmp;
440
441                 if (nstart < prev->vm_end)
442                         nstart = prev->vm_end;
443                 if (nstart >= end)
444                         goto out;
445
446                 vma = prev->vm_next;
447                 if (!vma || vma->vm_start != nstart) {
448                         error = -ENOMEM;
449                         goto out;
450                 }
451                 prot = reqprot;
452         }
453 out:
454         up_write(&current->mm->mmap_sem);
455         return error;
456 }