40f574c06375a41e539ab53bc7028b48448b9821
[platform/kernel/linux-rpi.git] / mm / shmem.c
1 /*
2  * Resizable virtual memory filesystem for Linux.
3  *
4  * Copyright (C) 2000 Linus Torvalds.
5  *               2000 Transmeta Corp.
6  *               2000-2001 Christoph Rohland
7  *               2000-2001 SAP AG
8  *               2002 Red Hat Inc.
9  * Copyright (C) 2002-2011 Hugh Dickins.
10  * Copyright (C) 2011 Google Inc.
11  * Copyright (C) 2002-2005 VERITAS Software Corporation.
12  * Copyright (C) 2004 Andi Kleen, SuSE Labs
13  *
14  * Extended attribute support for tmpfs:
15  * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16  * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17  *
18  * tiny-shmem:
19  * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20  *
21  * This file is released under the GPL.
22  */
23
24 #include <linux/fs.h>
25 #include <linux/init.h>
26 #include <linux/vfs.h>
27 #include <linux/mount.h>
28 #include <linux/ramfs.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include <linux/mm.h>
32 #include <linux/random.h>
33 #include <linux/sched/signal.h>
34 #include <linux/export.h>
35 #include <linux/swap.h>
36 #include <linux/uio.h>
37 #include <linux/khugepaged.h>
38 #include <linux/hugetlb.h>
39 #include <linux/frontswap.h>
40
41 #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
42
43 static struct vfsmount *shm_mnt;
44
45 #ifdef CONFIG_SHMEM
46 /*
47  * This virtual memory filesystem is heavily based on the ramfs. It
48  * extends ramfs by the ability to use swap and honor resource limits
49  * which makes it a completely usable filesystem.
50  */
51
52 #include <linux/xattr.h>
53 #include <linux/exportfs.h>
54 #include <linux/posix_acl.h>
55 #include <linux/posix_acl_xattr.h>
56 #include <linux/mman.h>
57 #include <linux/string.h>
58 #include <linux/slab.h>
59 #include <linux/backing-dev.h>
60 #include <linux/shmem_fs.h>
61 #include <linux/writeback.h>
62 #include <linux/blkdev.h>
63 #include <linux/pagevec.h>
64 #include <linux/percpu_counter.h>
65 #include <linux/falloc.h>
66 #include <linux/splice.h>
67 #include <linux/security.h>
68 #include <linux/swapops.h>
69 #include <linux/mempolicy.h>
70 #include <linux/namei.h>
71 #include <linux/ctype.h>
72 #include <linux/migrate.h>
73 #include <linux/highmem.h>
74 #include <linux/seq_file.h>
75 #include <linux/magic.h>
76 #include <linux/syscalls.h>
77 #include <linux/fcntl.h>
78 #include <uapi/linux/memfd.h>
79 #include <linux/userfaultfd_k.h>
80 #include <linux/rmap.h>
81 #include <linux/uuid.h>
82
83 #include <linux/uaccess.h>
84 #include <asm/pgtable.h>
85
86 #include "internal.h"
87
88 #define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
89 #define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)
90
91 /* Pretend that each entry is of this size in directory's i_size */
92 #define BOGO_DIRENT_SIZE 20
93
94 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */
95 #define SHORT_SYMLINK_LEN 128
96
97 /*
98  * shmem_fallocate communicates with shmem_fault or shmem_writepage via
99  * inode->i_private (with i_mutex making sure that it has only one user at
100  * a time): we would prefer not to enlarge the shmem inode just for that.
101  */
102 struct shmem_falloc {
103         wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
104         pgoff_t start;          /* start of range currently being fallocated */
105         pgoff_t next;           /* the next page offset to be fallocated */
106         pgoff_t nr_falloced;    /* how many new pages have been fallocated */
107         pgoff_t nr_unswapped;   /* how often writepage refused to swap out */
108 };
109
110 #ifdef CONFIG_TMPFS
/* Default block limit: half of the page count given by totalram_pages(). */
static unsigned long shmem_default_max_blocks(void)
{
	unsigned long ram_pages = totalram_pages();

	return ram_pages / 2;
}
115
/*
 * Default inode limit: the smaller of (total pages minus highmem pages)
 * and half of the total pages.
 */
static unsigned long shmem_default_max_inodes(void)
{
	unsigned long total = totalram_pages();
	unsigned long non_high = total - totalhigh_pages();
	unsigned long half = total / 2;

	return non_high < half ? non_high : half;
}
122 #endif
123
124 static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
125 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
126                                 struct shmem_inode_info *info, pgoff_t index);
127 static int shmem_swapin_page(struct inode *inode, pgoff_t index,
128                              struct page **pagep, enum sgp_type sgp,
129                              gfp_t gfp, struct vm_area_struct *vma,
130                              vm_fault_t *fault_type);
131 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
132                 struct page **pagep, enum sgp_type sgp,
133                 gfp_t gfp, struct vm_area_struct *vma,
134                 struct vm_fault *vmf, vm_fault_t *fault_type);
135
136 int shmem_getpage(struct inode *inode, pgoff_t index,
137                 struct page **pagep, enum sgp_type sgp)
138 {
139         return shmem_getpage_gfp(inode, index, pagep, sgp,
140                 mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
141 }
142
143 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
144 {
145         return sb->s_fs_info;
146 }
147
148 /*
149  * shmem_file_setup pre-accounts the whole fixed size of a VM object,
150  * for shared memory and for shared anonymous (/dev/zero) mappings
151  * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
152  * consistent with the pre-accounting of private mappings ...
153  */
154 static inline int shmem_acct_size(unsigned long flags, loff_t size)
155 {
156         return (flags & VM_NORESERVE) ?
157                 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
158 }
159
160 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
161 {
162         if (!(flags & VM_NORESERVE))
163                 vm_unacct_memory(VM_ACCT(size));
164 }
165
166 static inline int shmem_reacct_size(unsigned long flags,
167                 loff_t oldsize, loff_t newsize)
168 {
169         if (!(flags & VM_NORESERVE)) {
170                 if (VM_ACCT(newsize) > VM_ACCT(oldsize))
171                         return security_vm_enough_memory_mm(current->mm,
172                                         VM_ACCT(newsize) - VM_ACCT(oldsize));
173                 else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
174                         vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
175         }
176         return 0;
177 }
178
179 /*
180  * ... whereas tmpfs objects are accounted incrementally as
181  * pages are allocated, in order to allow large sparse files.
182  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
183  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
184  */
185 static inline int shmem_acct_block(unsigned long flags, long pages)
186 {
187         if (!(flags & VM_NORESERVE))
188                 return 0;
189
190         return security_vm_enough_memory_mm(current->mm,
191                         pages * VM_ACCT(PAGE_SIZE));
192 }
193
194 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
195 {
196         if (flags & VM_NORESERVE)
197                 vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
198 }
199
200 static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
201 {
202         struct shmem_inode_info *info = SHMEM_I(inode);
203         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
204
205         if (shmem_acct_block(info->flags, pages))
206                 return false;
207
208         if (sbinfo->max_blocks) {
209                 if (percpu_counter_compare(&sbinfo->used_blocks,
210                                            sbinfo->max_blocks - pages) > 0)
211                         goto unacct;
212                 percpu_counter_add(&sbinfo->used_blocks, pages);
213         }
214
215         return true;
216
217 unacct:
218         shmem_unacct_blocks(info->flags, pages);
219         return false;
220 }
221
222 static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
223 {
224         struct shmem_inode_info *info = SHMEM_I(inode);
225         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
226
227         if (sbinfo->max_blocks)
228                 percpu_counter_sub(&sbinfo->used_blocks, pages);
229         shmem_unacct_blocks(info->flags, pages);
230 }
231
232 static const struct super_operations shmem_ops;
233 static const struct address_space_operations shmem_aops;
234 static const struct file_operations shmem_file_operations;
235 static const struct inode_operations shmem_inode_operations;
236 static const struct inode_operations shmem_dir_inode_operations;
237 static const struct inode_operations shmem_special_inode_operations;
238 static const struct vm_operations_struct shmem_vm_ops;
239 static struct file_system_type shmem_fs_type;
240
241 bool vma_is_shmem(struct vm_area_struct *vma)
242 {
243         return vma->vm_ops == &shmem_vm_ops;
244 }
245
246 static LIST_HEAD(shmem_swaplist);
247 static DEFINE_MUTEX(shmem_swaplist_mutex);
248
249 static int shmem_reserve_inode(struct super_block *sb)
250 {
251         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
252         if (sbinfo->max_inodes) {
253                 spin_lock(&sbinfo->stat_lock);
254                 if (!sbinfo->free_inodes) {
255                         spin_unlock(&sbinfo->stat_lock);
256                         return -ENOSPC;
257                 }
258                 sbinfo->free_inodes--;
259                 spin_unlock(&sbinfo->stat_lock);
260         }
261         return 0;
262 }
263
264 static void shmem_free_inode(struct super_block *sb)
265 {
266         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
267         if (sbinfo->max_inodes) {
268                 spin_lock(&sbinfo->stat_lock);
269                 sbinfo->free_inodes++;
270                 spin_unlock(&sbinfo->stat_lock);
271         }
272 }
273
274 /**
275  * shmem_recalc_inode - recalculate the block usage of an inode
276  * @inode: inode to recalc
277  *
278  * We have to calculate the free blocks since the mm can drop
279  * undirtied hole pages behind our back.
280  *
281  * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
282  * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
283  *
284  * It has to be called with the spinlock held.
285  */
286 static void shmem_recalc_inode(struct inode *inode)
287 {
288         struct shmem_inode_info *info = SHMEM_I(inode);
289         long freed;
290
291         freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
292         if (freed > 0) {
293                 info->alloced -= freed;
294                 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
295                 shmem_inode_unacct_blocks(inode, freed);
296         }
297 }
298
299 bool shmem_charge(struct inode *inode, long pages)
300 {
301         struct shmem_inode_info *info = SHMEM_I(inode);
302         unsigned long flags;
303
304         if (!shmem_inode_acct_block(inode, pages))
305                 return false;
306
307         /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
308         inode->i_mapping->nrpages += pages;
309
310         spin_lock_irqsave(&info->lock, flags);
311         info->alloced += pages;
312         inode->i_blocks += pages * BLOCKS_PER_PAGE;
313         shmem_recalc_inode(inode);
314         spin_unlock_irqrestore(&info->lock, flags);
315
316         return true;
317 }
318
319 void shmem_uncharge(struct inode *inode, long pages)
320 {
321         struct shmem_inode_info *info = SHMEM_I(inode);
322         unsigned long flags;
323
324         /* nrpages adjustment done by __delete_from_page_cache() or caller */
325
326         spin_lock_irqsave(&info->lock, flags);
327         info->alloced -= pages;
328         inode->i_blocks -= pages * BLOCKS_PER_PAGE;
329         shmem_recalc_inode(inode);
330         spin_unlock_irqrestore(&info->lock, flags);
331
332         shmem_inode_unacct_blocks(inode, pages);
333 }
334
335 /*
336  * Replace item expected in xarray by a new item, while holding xa_lock.
337  */
338 static int shmem_replace_entry(struct address_space *mapping,
339                         pgoff_t index, void *expected, void *replacement)
340 {
341         XA_STATE(xas, &mapping->i_pages, index);
342         void *item;
343
344         VM_BUG_ON(!expected);
345         VM_BUG_ON(!replacement);
346         item = xas_load(&xas);
347         if (item != expected)
348                 return -ENOENT;
349         xas_store(&xas, replacement);
350         return 0;
351 }
352
353 /*
354  * Sometimes, before we decide whether to proceed or to fail, we must check
355  * that an entry was not already brought back from swap by a racing thread.
356  *
357  * Checking page is not enough: by the time a SwapCache page is locked, it
358  * might be reused, and again be SwapCache, using the same swap as before.
359  */
360 static bool shmem_confirm_swap(struct address_space *mapping,
361                                pgoff_t index, swp_entry_t swap)
362 {
363         return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
364 }
365
366 /*
367  * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
368  *
369  * SHMEM_HUGE_NEVER:
370  *      disables huge pages for the mount;
371  * SHMEM_HUGE_ALWAYS:
372  *      enables huge pages for the mount;
373  * SHMEM_HUGE_WITHIN_SIZE:
374  *      only allocate huge pages if the page will be fully within i_size,
375  *      also respect fadvise()/madvise() hints;
376  * SHMEM_HUGE_ADVISE:
377  *      only allocate huge pages if requested with fadvise()/madvise();
378  */
379
380 #define SHMEM_HUGE_NEVER        0
381 #define SHMEM_HUGE_ALWAYS       1
382 #define SHMEM_HUGE_WITHIN_SIZE  2
383 #define SHMEM_HUGE_ADVISE       3
384
385 /*
386  * Special values.
387  * Can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
388  *
389  * SHMEM_HUGE_DENY:
390  *      disables huge on shm_mnt and all mounts, for emergency use;
391  * SHMEM_HUGE_FORCE:
392  *      enables huge on shm_mnt and all mounts, w/o needing option, for testing;
393  *
394  */
395 #define SHMEM_HUGE_DENY         (-1)
396 #define SHMEM_HUGE_FORCE        (-2)
397
398 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
399 /* ifdef here to avoid bloating shmem.o when not necessary */
400
401 static int shmem_huge __read_mostly;
402
403 #if defined(CONFIG_SYSFS)
404 static int shmem_parse_huge(const char *str)
405 {
406         if (!strcmp(str, "never"))
407                 return SHMEM_HUGE_NEVER;
408         if (!strcmp(str, "always"))
409                 return SHMEM_HUGE_ALWAYS;
410         if (!strcmp(str, "within_size"))
411                 return SHMEM_HUGE_WITHIN_SIZE;
412         if (!strcmp(str, "advise"))
413                 return SHMEM_HUGE_ADVISE;
414         if (!strcmp(str, "deny"))
415                 return SHMEM_HUGE_DENY;
416         if (!strcmp(str, "force"))
417                 return SHMEM_HUGE_FORCE;
418         return -EINVAL;
419 }
420 #endif
421
422 #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
423 static const char *shmem_format_huge(int huge)
424 {
425         switch (huge) {
426         case SHMEM_HUGE_NEVER:
427                 return "never";
428         case SHMEM_HUGE_ALWAYS:
429                 return "always";
430         case SHMEM_HUGE_WITHIN_SIZE:
431                 return "within_size";
432         case SHMEM_HUGE_ADVISE:
433                 return "advise";
434         case SHMEM_HUGE_DENY:
435                 return "deny";
436         case SHMEM_HUGE_FORCE:
437                 return "force";
438         default:
439                 VM_BUG_ON(1);
440                 return "bad_val";
441         }
442 }
443 #endif
444
445 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
446                 struct shrink_control *sc, unsigned long nr_to_split)
447 {
448         LIST_HEAD(list), *pos, *next;
449         LIST_HEAD(to_remove);
450         struct inode *inode;
451         struct shmem_inode_info *info;
452         struct page *page;
453         unsigned long batch = sc ? sc->nr_to_scan : 128;
454         int removed = 0, split = 0;
455
456         if (list_empty(&sbinfo->shrinklist))
457                 return SHRINK_STOP;
458
459         spin_lock(&sbinfo->shrinklist_lock);
460         list_for_each_safe(pos, next, &sbinfo->shrinklist) {
461                 info = list_entry(pos, struct shmem_inode_info, shrinklist);
462
463                 /* pin the inode */
464                 inode = igrab(&info->vfs_inode);
465
466                 /* inode is about to be evicted */
467                 if (!inode) {
468                         list_del_init(&info->shrinklist);
469                         removed++;
470                         goto next;
471                 }
472
473                 /* Check if there's anything to gain */
474                 if (round_up(inode->i_size, PAGE_SIZE) ==
475                                 round_up(inode->i_size, HPAGE_PMD_SIZE)) {
476                         list_move(&info->shrinklist, &to_remove);
477                         removed++;
478                         goto next;
479                 }
480
481                 list_move(&info->shrinklist, &list);
482 next:
483                 if (!--batch)
484                         break;
485         }
486         spin_unlock(&sbinfo->shrinklist_lock);
487
488         list_for_each_safe(pos, next, &to_remove) {
489                 info = list_entry(pos, struct shmem_inode_info, shrinklist);
490                 inode = &info->vfs_inode;
491                 list_del_init(&info->shrinklist);
492                 iput(inode);
493         }
494
495         list_for_each_safe(pos, next, &list) {
496                 int ret;
497
498                 info = list_entry(pos, struct shmem_inode_info, shrinklist);
499                 inode = &info->vfs_inode;
500
501                 if (nr_to_split && split >= nr_to_split)
502                         goto leave;
503
504                 page = find_get_page(inode->i_mapping,
505                                 (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
506                 if (!page)
507                         goto drop;
508
509                 /* No huge page at the end of the file: nothing to split */
510                 if (!PageTransHuge(page)) {
511                         put_page(page);
512                         goto drop;
513                 }
514
515                 /*
516                  * Leave the inode on the list if we failed to lock
517                  * the page at this time.
518                  *
519                  * Waiting for the lock may lead to deadlock in the
520                  * reclaim path.
521                  */
522                 if (!trylock_page(page)) {
523                         put_page(page);
524                         goto leave;
525                 }
526
527                 ret = split_huge_page(page);
528                 unlock_page(page);
529                 put_page(page);
530
531                 /* If split failed leave the inode on the list */
532                 if (ret)
533                         goto leave;
534
535                 split++;
536 drop:
537                 list_del_init(&info->shrinklist);
538                 removed++;
539 leave:
540                 iput(inode);
541         }
542
543         spin_lock(&sbinfo->shrinklist_lock);
544         list_splice_tail(&list, &sbinfo->shrinklist);
545         sbinfo->shrinklist_len -= removed;
546         spin_unlock(&sbinfo->shrinklist_lock);
547
548         return split;
549 }
550
551 static long shmem_unused_huge_scan(struct super_block *sb,
552                 struct shrink_control *sc)
553 {
554         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
555
556         if (!READ_ONCE(sbinfo->shrinklist_len))
557                 return SHRINK_STOP;
558
559         return shmem_unused_huge_shrink(sbinfo, sc, 0);
560 }
561
562 static long shmem_unused_huge_count(struct super_block *sb,
563                 struct shrink_control *sc)
564 {
565         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
566         return READ_ONCE(sbinfo->shrinklist_len);
567 }
568 #else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
569
570 #define shmem_huge SHMEM_HUGE_DENY
571
/*
 * Stub used when CONFIG_TRANSPARENT_HUGE_PAGECACHE is not set: does
 * nothing with its arguments and reports zero pages split.
 */
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	return 0UL;
}
577 #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
578
579 static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
580 {
581         if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
582             (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
583             shmem_huge != SHMEM_HUGE_DENY)
584                 return true;
585         return false;
586 }
587
588 /*
589  * Like add_to_page_cache_locked, but error if expected item has gone.
590  */
591 static int shmem_add_to_page_cache(struct page *page,
592                                    struct address_space *mapping,
593                                    pgoff_t index, void *expected, gfp_t gfp)
594 {
595         XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
596         unsigned long i = 0;
597         unsigned long nr = 1UL << compound_order(page);
598
599         VM_BUG_ON_PAGE(PageTail(page), page);
600         VM_BUG_ON_PAGE(index != round_down(index, nr), page);
601         VM_BUG_ON_PAGE(!PageLocked(page), page);
602         VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
603         VM_BUG_ON(expected && PageTransHuge(page));
604
605         page_ref_add(page, nr);
606         page->mapping = mapping;
607         page->index = index;
608
609         do {
610                 void *entry;
611                 xas_lock_irq(&xas);
612                 entry = xas_find_conflict(&xas);
613                 if (entry != expected)
614                         xas_set_err(&xas, -EEXIST);
615                 xas_create_range(&xas);
616                 if (xas_error(&xas))
617                         goto unlock;
618 next:
619                 xas_store(&xas, page + i);
620                 if (++i < nr) {
621                         xas_next(&xas);
622                         goto next;
623                 }
624                 if (PageTransHuge(page)) {
625                         count_vm_event(THP_FILE_ALLOC);
626                         __inc_node_page_state(page, NR_SHMEM_THPS);
627                 }
628                 mapping->nrpages += nr;
629                 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
630                 __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
631 unlock:
632                 xas_unlock_irq(&xas);
633         } while (xas_nomem(&xas, gfp));
634
635         if (xas_error(&xas)) {
636                 page->mapping = NULL;
637                 page_ref_sub(page, nr);
638                 return xas_error(&xas);
639         }
640
641         return 0;
642 }
643
644 /*
645  * Like delete_from_page_cache, but substitutes swap for page.
646  */
647 static void shmem_delete_from_page_cache(struct page *page, void *radswap)
648 {
649         struct address_space *mapping = page->mapping;
650         int error;
651
652         VM_BUG_ON_PAGE(PageCompound(page), page);
653
654         xa_lock_irq(&mapping->i_pages);
655         error = shmem_replace_entry(mapping, page->index, page, radswap);
656         page->mapping = NULL;
657         mapping->nrpages--;
658         __dec_node_page_state(page, NR_FILE_PAGES);
659         __dec_node_page_state(page, NR_SHMEM);
660         xa_unlock_irq(&mapping->i_pages);
661         put_page(page);
662         BUG_ON(error);
663 }
664
665 /*
666  * Remove swap entry from page cache, free the swap and its page cache.
667  */
668 static int shmem_free_swap(struct address_space *mapping,
669                            pgoff_t index, void *radswap)
670 {
671         void *old;
672
673         old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
674         if (old != radswap)
675                 return -ENOENT;
676         free_swap_and_cache(radix_to_swp_entry(radswap));
677         return 0;
678 }
679
680 /*
681  * Determine (in bytes) how many of the shmem object's pages mapped by the
682  * given offsets are swapped out.
683  *
684  * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
685  * as long as the inode doesn't go away and racy results are not a problem.
686  */
687 unsigned long shmem_partial_swap_usage(struct address_space *mapping,
688                                                 pgoff_t start, pgoff_t end)
689 {
690         XA_STATE(xas, &mapping->i_pages, start);
691         struct page *page;
692         unsigned long swapped = 0;
693
694         rcu_read_lock();
695         xas_for_each(&xas, page, end - 1) {
696                 if (xas_retry(&xas, page))
697                         continue;
698                 if (xa_is_value(page))
699                         swapped++;
700
701                 if (need_resched()) {
702                         xas_pause(&xas);
703                         cond_resched_rcu();
704                 }
705         }
706
707         rcu_read_unlock();
708
709         return swapped << PAGE_SHIFT;
710 }
711
712 /*
713  * Determine (in bytes) how many of the shmem object's pages mapped by the
714  * given vma is swapped out.
715  *
716  * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
717  * as long as the inode doesn't go away and racy results are not a problem.
718  */
719 unsigned long shmem_swap_usage(struct vm_area_struct *vma)
720 {
721         struct inode *inode = file_inode(vma->vm_file);
722         struct shmem_inode_info *info = SHMEM_I(inode);
723         struct address_space *mapping = inode->i_mapping;
724         unsigned long swapped;
725
726         /* Be careful as we don't hold info->lock */
727         swapped = READ_ONCE(info->swapped);
728
729         /*
730          * The easier cases are when the shmem object has nothing in swap, or
731          * the vma maps it whole. Then we can simply use the stats that we
732          * already track.
733          */
734         if (!swapped)
735                 return 0;
736
737         if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
738                 return swapped << PAGE_SHIFT;
739
740         /* Here comes the more involved part */
741         return shmem_partial_swap_usage(mapping,
742                         linear_page_index(vma, vma->vm_start),
743                         linear_page_index(vma, vma->vm_end));
744 }
745
746 /*
747  * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
748  */
749 void shmem_unlock_mapping(struct address_space *mapping)
750 {
751         struct pagevec pvec;
752         pgoff_t indices[PAGEVEC_SIZE];
753         pgoff_t index = 0;
754
755         pagevec_init(&pvec);
756         /*
757          * Minor point, but we might as well stop if someone else SHM_LOCKs it.
758          */
759         while (!mapping_unevictable(mapping)) {
760                 /*
761                  * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
762                  * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
763                  */
764                 pvec.nr = find_get_entries(mapping, index,
765                                            PAGEVEC_SIZE, pvec.pages, indices);
766                 if (!pvec.nr)
767                         break;
768                 index = indices[pvec.nr - 1] + 1;
769                 pagevec_remove_exceptionals(&pvec);
770                 check_move_unevictable_pages(&pvec);
771                 pagevec_release(&pvec);
772                 cond_resched();
773         }
774 }
775
776 /*
777  * Remove range of pages and swap entries from page cache, and free them.
778  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
779  */
780 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
781                                                                  bool unfalloc)
782 {
783         struct address_space *mapping = inode->i_mapping;
784         struct shmem_inode_info *info = SHMEM_I(inode);
785         pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
786         pgoff_t end = (lend + 1) >> PAGE_SHIFT;
787         unsigned int partial_start = lstart & (PAGE_SIZE - 1);
788         unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
789         struct pagevec pvec;
790         pgoff_t indices[PAGEVEC_SIZE];
791         long nr_swaps_freed = 0;
792         pgoff_t index;
793         int i;
794
795         if (lend == -1)
796                 end = -1;       /* unsigned, so actually very big */
797
798         pagevec_init(&pvec);
799         index = start;
800         while (index < end) {
801                 pvec.nr = find_get_entries(mapping, index,
802                         min(end - index, (pgoff_t)PAGEVEC_SIZE),
803                         pvec.pages, indices);
804                 if (!pvec.nr)
805                         break;
806                 for (i = 0; i < pagevec_count(&pvec); i++) {
807                         struct page *page = pvec.pages[i];
808
809                         index = indices[i];
810                         if (index >= end)
811                                 break;
812
813                         if (xa_is_value(page)) {
814                                 if (unfalloc)
815                                         continue;
816                                 nr_swaps_freed += !shmem_free_swap(mapping,
817                                                                 index, page);
818                                 continue;
819                         }
820
821                         VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);
822
823                         if (!trylock_page(page))
824                                 continue;
825
826                         if (PageTransTail(page)) {
827                                 /* Middle of THP: zero out the page */
828                                 clear_highpage(page);
829                                 unlock_page(page);
830                                 continue;
831                         } else if (PageTransHuge(page)) {
832                                 if (index == round_down(end, HPAGE_PMD_NR)) {
833                                         /*
834                                          * Range ends in the middle of THP:
835                                          * zero out the page
836                                          */
837                                         clear_highpage(page);
838                                         unlock_page(page);
839                                         continue;
840                                 }
841                                 index += HPAGE_PMD_NR - 1;
842                                 i += HPAGE_PMD_NR - 1;
843                         }
844
845                         if (!unfalloc || !PageUptodate(page)) {
846                                 VM_BUG_ON_PAGE(PageTail(page), page);
847                                 if (page_mapping(page) == mapping) {
848                                         VM_BUG_ON_PAGE(PageWriteback(page), page);
849                                         truncate_inode_page(mapping, page);
850                                 }
851                         }
852                         unlock_page(page);
853                 }
854                 pagevec_remove_exceptionals(&pvec);
855                 pagevec_release(&pvec);
856                 cond_resched();
857                 index++;
858         }
859
860         if (partial_start) {
861                 struct page *page = NULL;
862                 shmem_getpage(inode, start - 1, &page, SGP_READ);
863                 if (page) {
864                         unsigned int top = PAGE_SIZE;
865                         if (start > end) {
866                                 top = partial_end;
867                                 partial_end = 0;
868                         }
869                         zero_user_segment(page, partial_start, top);
870                         set_page_dirty(page);
871                         unlock_page(page);
872                         put_page(page);
873                 }
874         }
875         if (partial_end) {
876                 struct page *page = NULL;
877                 shmem_getpage(inode, end, &page, SGP_READ);
878                 if (page) {
879                         zero_user_segment(page, 0, partial_end);
880                         set_page_dirty(page);
881                         unlock_page(page);
882                         put_page(page);
883                 }
884         }
885         if (start >= end)
886                 return;
887
888         index = start;
889         while (index < end) {
890                 cond_resched();
891
892                 pvec.nr = find_get_entries(mapping, index,
893                                 min(end - index, (pgoff_t)PAGEVEC_SIZE),
894                                 pvec.pages, indices);
895                 if (!pvec.nr) {
896                         /* If all gone or hole-punch or unfalloc, we're done */
897                         if (index == start || end != -1)
898                                 break;
899                         /* But if truncating, restart to make sure all gone */
900                         index = start;
901                         continue;
902                 }
903                 for (i = 0; i < pagevec_count(&pvec); i++) {
904                         struct page *page = pvec.pages[i];
905
906                         index = indices[i];
907                         if (index >= end)
908                                 break;
909
910                         if (xa_is_value(page)) {
911                                 if (unfalloc)
912                                         continue;
913                                 if (shmem_free_swap(mapping, index, page)) {
914                                         /* Swap was replaced by page: retry */
915                                         index--;
916                                         break;
917                                 }
918                                 nr_swaps_freed++;
919                                 continue;
920                         }
921
922                         lock_page(page);
923
924                         if (PageTransTail(page)) {
925                                 /* Middle of THP: zero out the page */
926                                 clear_highpage(page);
927                                 unlock_page(page);
928                                 /*
929                                  * Partial thp truncate due 'start' in middle
930                                  * of THP: don't need to look on these pages
931                                  * again on !pvec.nr restart.
932                                  */
933                                 if (index != round_down(end, HPAGE_PMD_NR))
934                                         start++;
935                                 continue;
936                         } else if (PageTransHuge(page)) {
937                                 if (index == round_down(end, HPAGE_PMD_NR)) {
938                                         /*
939                                          * Range ends in the middle of THP:
940                                          * zero out the page
941                                          */
942                                         clear_highpage(page);
943                                         unlock_page(page);
944                                         continue;
945                                 }
946                                 index += HPAGE_PMD_NR - 1;
947                                 i += HPAGE_PMD_NR - 1;
948                         }
949
950                         if (!unfalloc || !PageUptodate(page)) {
951                                 VM_BUG_ON_PAGE(PageTail(page), page);
952                                 if (page_mapping(page) == mapping) {
953                                         VM_BUG_ON_PAGE(PageWriteback(page), page);
954                                         truncate_inode_page(mapping, page);
955                                 } else {
956                                         /* Page was replaced by swap: retry */
957                                         unlock_page(page);
958                                         index--;
959                                         break;
960                                 }
961                         }
962                         unlock_page(page);
963                 }
964                 pagevec_remove_exceptionals(&pvec);
965                 pagevec_release(&pvec);
966                 index++;
967         }
968
969         spin_lock_irq(&info->lock);
970         info->swapped -= nr_swaps_freed;
971         shmem_recalc_inode(inode);
972         spin_unlock_irq(&info->lock);
973 }
974
975 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
976 {
977         shmem_undo_range(inode, lstart, lend, false);
978         inode->i_ctime = inode->i_mtime = current_time(inode);
979 }
980 EXPORT_SYMBOL_GPL(shmem_truncate_range);
981
982 static int shmem_getattr(const struct path *path, struct kstat *stat,
983                          u32 request_mask, unsigned int query_flags)
984 {
985         struct inode *inode = path->dentry->d_inode;
986         struct shmem_inode_info *info = SHMEM_I(inode);
987         struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);
988
989         if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
990                 spin_lock_irq(&info->lock);
991                 shmem_recalc_inode(inode);
992                 spin_unlock_irq(&info->lock);
993         }
994         generic_fillattr(inode, stat);
995
996         if (is_huge_enabled(sb_info))
997                 stat->blksize = HPAGE_PMD_SIZE;
998
999         return 0;
1000 }
1001
1002 static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
1003 {
1004         struct inode *inode = d_inode(dentry);
1005         struct shmem_inode_info *info = SHMEM_I(inode);
1006         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1007         int error;
1008
1009         error = setattr_prepare(dentry, attr);
1010         if (error)
1011                 return error;
1012
1013         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1014                 loff_t oldsize = inode->i_size;
1015                 loff_t newsize = attr->ia_size;
1016
1017                 /* protected by i_mutex */
1018                 if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
1019                     (newsize > oldsize && (info->seals & F_SEAL_GROW)))
1020                         return -EPERM;
1021
1022                 if (newsize != oldsize) {
1023                         error = shmem_reacct_size(SHMEM_I(inode)->flags,
1024                                         oldsize, newsize);
1025                         if (error)
1026                                 return error;
1027                         i_size_write(inode, newsize);
1028                         inode->i_ctime = inode->i_mtime = current_time(inode);
1029                 }
1030                 if (newsize <= oldsize) {
1031                         loff_t holebegin = round_up(newsize, PAGE_SIZE);
1032                         if (oldsize > holebegin)
1033                                 unmap_mapping_range(inode->i_mapping,
1034                                                         holebegin, 0, 1);
1035                         if (info->alloced)
1036                                 shmem_truncate_range(inode,
1037                                                         newsize, (loff_t)-1);
1038                         /* unmap again to remove racily COWed private pages */
1039                         if (oldsize > holebegin)
1040                                 unmap_mapping_range(inode->i_mapping,
1041                                                         holebegin, 0, 1);
1042
1043                         /*
1044                          * Part of the huge page can be beyond i_size: subject
1045                          * to shrink under memory pressure.
1046                          */
1047                         if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
1048                                 spin_lock(&sbinfo->shrinklist_lock);
1049                                 /*
1050                                  * _careful to defend against unlocked access to
1051                                  * ->shrink_list in shmem_unused_huge_shrink()
1052                                  */
1053                                 if (list_empty_careful(&info->shrinklist)) {
1054                                         list_add_tail(&info->shrinklist,
1055                                                         &sbinfo->shrinklist);
1056                                         sbinfo->shrinklist_len++;
1057                                 }
1058                                 spin_unlock(&sbinfo->shrinklist_lock);
1059                         }
1060                 }
1061         }
1062
1063         setattr_copy(inode, attr);
1064         if (attr->ia_valid & ATTR_MODE)
1065                 error = posix_acl_chmod(inode, inode->i_mode);
1066         return error;
1067 }
1068
/*
 * Tear down a shmem inode that is being evicted.
 *
 * When the inode's mapping uses shmem_aops, its size is un-accounted and
 * zeroed, the whole file range is truncated, and the inode is unlinked from
 * the superblock's shrinklist and from the swaplist.  In every case the
 * xattrs are freed, the inode is released with shmem_free_inode() and
 * clear_inode() is called.
 */
1069 static void shmem_evict_inode(struct inode *inode)
1070 {
1071         struct shmem_inode_info *info = SHMEM_I(inode);
1072         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1073
1074         if (inode->i_mapping->a_ops == &shmem_aops) {
1075                 shmem_unacct_size(info->flags, inode->i_size);
1076                 inode->i_size = 0;
1077                 shmem_truncate_range(inode, 0, (loff_t)-1);
                     /*
                      * Unlocked peek first; the test is repeated under
                      * shrinklist_lock before the entry is unlinked and
                      * shrinklist_len is decremented.
                      */
1078                 if (!list_empty(&info->shrinklist)) {
1079                         spin_lock(&sbinfo->shrinklist_lock);
1080                         if (!list_empty(&info->shrinklist)) {
1081                                 list_del_init(&info->shrinklist);
1082                                 sbinfo->shrinklist_len--;
1083                         }
1084                         spin_unlock(&sbinfo->shrinklist_lock);
1085                 }
                     /*
                      * Loop until the inode is off the swaplist: removal is
                      * only done, under shmem_swaplist_mutex, while
                      * stop_eviction reads as zero.
                      */
1086                 while (!list_empty(&info->swaplist)) {
1087                         /* Wait while shmem_unuse() is scanning this inode... */
1088                         wait_var_event(&info->stop_eviction,
1089                                        !atomic_read(&info->stop_eviction));
1090                         mutex_lock(&shmem_swaplist_mutex);
1091                         /* ...but beware of the race if we peeked too early */
1092                         if (!atomic_read(&info->stop_eviction))
1093                                 list_del_init(&info->swaplist);
1094                         mutex_unlock(&shmem_swaplist_mutex);
1095                 }
1096         }
1097
1098         simple_xattrs_free(&info->xattrs);
             /* Warn if the inode still reports blocks at this point */
1099         WARN_ON(inode->i_blocks);
1100         shmem_free_inode(inode->i_sb);
1101         clear_inode(inode);
1102 }
1103
1104 extern struct swap_info_struct *swap_info[];
1105
1106 static int shmem_find_swap_entries(struct address_space *mapping,
1107                                    pgoff_t start, unsigned int nr_entries,
1108                                    struct page **entries, pgoff_t *indices,
1109                                    unsigned int type, bool frontswap)
1110 {
1111         XA_STATE(xas, &mapping->i_pages, start);
1112         struct page *page;
1113         swp_entry_t entry;
1114         unsigned int ret = 0;
1115
1116         if (!nr_entries)
1117                 return 0;
1118
1119         rcu_read_lock();
1120         xas_for_each(&xas, page, ULONG_MAX) {
1121                 if (xas_retry(&xas, page))
1122                         continue;
1123
1124                 if (!xa_is_value(page))
1125                         continue;
1126
1127                 entry = radix_to_swp_entry(page);
1128                 if (swp_type(entry) != type)
1129                         continue;
1130                 if (frontswap &&
1131                     !frontswap_test(swap_info[type], swp_offset(entry)))
1132                         continue;
1133
1134                 indices[ret] = xas.xa_index;
1135                 entries[ret] = page;
1136
1137                 if (need_resched()) {
1138                         xas_pause(&xas);
1139                         cond_resched_rcu();
1140                 }
1141                 if (++ret == nr_entries)
1142                         break;
1143         }
1144         rcu_read_unlock();
1145
1146         return ret;
1147 }
1148
1149 /*
1150  * Move the swapped pages for an inode to page cache. Returns the count
1151  * of pages swapped in, or the error in case of failure.
1152  */
1153 static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
1154                                     pgoff_t *indices)
1155 {
1156         int i = 0;
1157         int ret = 0;
1158         int error = 0;
1159         struct address_space *mapping = inode->i_mapping;
1160
1161         for (i = 0; i < pvec.nr; i++) {
1162                 struct page *page = pvec.pages[i];
1163
1164                 if (!xa_is_value(page))
1165                         continue;
1166                 error = shmem_swapin_page(inode, indices[i],
1167                                           &page, SGP_CACHE,
1168                                           mapping_gfp_mask(mapping),
1169                                           NULL, NULL);
1170                 if (error == 0) {
1171                         unlock_page(page);
1172                         put_page(page);
1173                         ret++;
1174                 }
1175                 if (error == -ENOMEM)
1176                         break;
1177                 error = 0;
1178         }
1179         return error ? error : ret;
1180 }
1181
1182 /*
1183  * If swap found in inode, free it and move page from swapcache to filecache.
1184  */
1185 static int shmem_unuse_inode(struct inode *inode, unsigned int type,
1186                              bool frontswap, unsigned long *fs_pages_to_unuse)
1187 {
1188         struct address_space *mapping = inode->i_mapping;
1189         pgoff_t start = 0;
1190         struct pagevec pvec;
1191         pgoff_t indices[PAGEVEC_SIZE];
1192         bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
1193         int ret = 0;
1194
1195         pagevec_init(&pvec);
1196         do {
1197                 unsigned int nr_entries = PAGEVEC_SIZE;
1198
1199                 if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
1200                         nr_entries = *fs_pages_to_unuse;
1201
1202                 pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
1203                                                   pvec.pages, indices,
1204                                                   type, frontswap);
1205                 if (pvec.nr == 0) {
1206                         ret = 0;
1207                         break;
1208                 }
1209
1210                 ret = shmem_unuse_swap_entries(inode, pvec, indices);
1211                 if (ret < 0)
1212                         break;
1213
1214                 if (frontswap_partial) {
1215                         *fs_pages_to_unuse -= ret;
1216                         if (*fs_pages_to_unuse == 0) {
1217                                 ret = FRONTSWAP_PAGES_UNUSED;
1218                                 break;
1219                         }
1220                 }
1221
1222                 start = indices[pvec.nr - 1];
1223         } while (true);
1224
1225         return ret;
1226 }
1227
1228 /*
1229  * Read all the shared memory data that resides in the swap
1230  * device 'type' back into memory, so the swap device can be
1231  * unused.
 *
 * Walks shmem_swaplist and runs shmem_unuse_inode() on every inode that
 * still has swapped pages; inodes with none are dropped from the list.
 * 'frontswap' and 'fs_pages_to_unuse' are passed through unchanged.
 *
 * Returns 0, or the first non-zero value returned by shmem_unuse_inode(),
 * which also ends the walk.
1232  */
1233 int shmem_unuse(unsigned int type, bool frontswap,
1234                 unsigned long *fs_pages_to_unuse)
1235 {
1236         struct shmem_inode_info *info, *next;
1237         int error = 0;
1238
             /* Unlocked check: an empty swaplist means there is nothing to do */
1239         if (list_empty(&shmem_swaplist))
1240                 return 0;
1241
1242         mutex_lock(&shmem_swaplist_mutex);
1243         list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
                     /* Nothing of this inode is in swap: drop it from the list */
1244                 if (!info->swapped) {
1245                         list_del_init(&info->swaplist);
1246                         continue;
1247                 }
1248                 /*
1249                  * Drop the swaplist mutex while searching the inode for swap;
1250                  * but before doing so, make sure shmem_evict_inode() will not
1251                  * remove placeholder inode from swaplist, nor let it be freed
1252                  * (igrab() would protect from unlink, but not from unmount).
1253                  */
1254                 atomic_inc(&info->stop_eviction);
1255                 mutex_unlock(&shmem_swaplist_mutex);
1256
1257                 error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
1258                                           fs_pages_to_unuse);
1259                 cond_resched();
1260
1261                 mutex_lock(&shmem_swaplist_mutex);
                     /*
                      * Re-read the successor with the mutex held again: the
                      * 'next' cached by the iterator predates the unlock.
                      */
1262                 next = list_next_entry(info, swaplist);
1263                 if (!info->swapped)
1264                         list_del_init(&info->swaplist);
                     /*
                      * When the count drops to zero, wake anyone waiting on
                      * stop_eviction (shmem_evict_inode() waits on it).
                      */
1265                 if (atomic_dec_and_test(&info->stop_eviction))
1266                         wake_up_var(&info->stop_eviction);
1267                 if (error)
1268                         break;
1269         }
1270         mutex_unlock(&shmem_swaplist_mutex);
1271
1272         return error;
1273 }
1274
1275 /*
1276  * Move the page from the page cache to the swap cache.
 *
 * The page must be locked and must not be a compound page (both are
 * asserted below).
 *
 * Returns 0 after the page has been handed to swap_writepage(), or after it
 * has been redirtied and unlocked because wbc->for_reclaim is not set.
 * Returns AOP_WRITEPAGE_ACTIVATE, with the page redirtied and still locked,
 * when called for reclaim and the page was not moved to swap.
1277  */
1278 static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1279 {
1280         struct shmem_inode_info *info;
1281         struct address_space *mapping;
1282         struct inode *inode;
1283         swp_entry_t swap;
1284         pgoff_t index;
1285
1286         VM_BUG_ON_PAGE(PageCompound(page), page);
1287         BUG_ON(!PageLocked(page));
1288         mapping = page->mapping;
1289         index = page->index;
1290         inode = mapping->host;
1291         info = SHMEM_I(inode);
             /* Inodes flagged VM_LOCKED are never written to swap */
1292         if (info->flags & VM_LOCKED)
1293                 goto redirty;
             /* total_swap_pages is zero: give up before trying to allocate */
1294         if (!total_swap_pages)
1295                 goto redirty;
1296
1297         /*
1298          * Our capabilities prevent regular writeback or sync from ever calling
1299          * shmem_writepage; but a stacking filesystem might use ->writepage of
1300          * its underlying filesystem, in which case tmpfs should write out to
1301          * swap only in response to memory pressure, and not for the writeback
1302          * threads or sync.
1303          */
1304         if (!wbc->for_reclaim) {
1305                 WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
1306                 goto redirty;
1307         }
1308
1309         /*
1310          * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
1311          * value into swapfile.c, the only way we can correctly account for a
1312          * fallocated page arriving here is now to initialize it and write it.
1313          *
1314          * That's okay for a page already fallocated earlier, but if we have
1315          * not yet completed the fallocation, then (a) we want to keep track
1316          * of this page in case we have to undo it, and (b) it may not be a
1317          * good idea to continue anyway, once we're pushing into swap.  So
1318          * reactivate the page, and let shmem_fallocate() quit when too many.
1319          */
1320         if (!PageUptodate(page)) {
1321                 if (inode->i_private) {
1322                         struct shmem_falloc *shmem_falloc;
                             /*
                              * i_private is re-read under i_lock.  The page
                              * counts as "unswapped" only if the record has
                              * no waitq and index lies in [start, next).
                              */
1323                         spin_lock(&inode->i_lock);
1324                         shmem_falloc = inode->i_private;
1325                         if (shmem_falloc &&
1326                             !shmem_falloc->waitq &&
1327                             index >= shmem_falloc->start &&
1328                             index < shmem_falloc->next)
1329                                 shmem_falloc->nr_unswapped++;
1330                         else
1331                                 shmem_falloc = NULL;
1332                         spin_unlock(&inode->i_lock);
1333                         if (shmem_falloc)
1334                                 goto redirty;
1335                 }
                     /* Not uptodate: zero the page and mark it uptodate first */
1336                 clear_highpage(page);
1337                 flush_dcache_page(page);
1338                 SetPageUptodate(page);
1339         }
1340
             /* A zero swap.val means no swap entry was obtained */
1341         swap = get_swap_page(page);
1342         if (!swap.val)
1343                 goto redirty;
1344
1345         /*
1346          * Add inode to shmem_unuse()'s list of swapped-out inodes,
1347          * if it's not already there.  Do it now before the page is
1348          * moved to swap cache, when its pagelock no longer protects
1349          * the inode from eviction.  But don't unlock the mutex until
1350          * we've incremented swapped, because shmem_unuse_inode() will
1351          * prune a !swapped inode from the swaplist under this mutex.
1352          */
1353         mutex_lock(&shmem_swaplist_mutex);
1354         if (list_empty(&info->swaplist))
1355                 list_add(&info->swaplist, &shmem_swaplist);
1356
1357         if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1358                 spin_lock_irq(&info->lock);
1359                 shmem_recalc_inode(inode);
1360                 info->swapped++;
1361                 spin_unlock_irq(&info->lock);
1362
1363                 swap_shmem_alloc(swap);
                     /* The page's slot in the mapping now holds the swap entry */
1364                 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
1365
1366                 mutex_unlock(&shmem_swaplist_mutex);
1367                 BUG_ON(page_mapped(page));
1368                 swap_writepage(page, wbc);
1369                 return 0;
1370         }
1371
             /* add_to_swap_cache() failed: release the swap entry, redirty */
1372         mutex_unlock(&shmem_swaplist_mutex);
1373         put_swap_page(page, swap);
1374 redirty:
1375         set_page_dirty(page);
1376         if (wbc->for_reclaim)
1377                 return AOP_WRITEPAGE_ACTIVATE;  /* Return with page locked */
1378         unlock_page(page);
1379         return 0;
1380 }
1381
1382 #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1383 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1384 {
1385         char buffer[64];
1386
1387         if (!mpol || mpol->mode == MPOL_DEFAULT)
1388                 return;         /* show nothing */
1389
1390         mpol_to_str(buffer, sizeof(buffer), mpol);
1391
1392         seq_printf(seq, ",mpol=%s", buffer);
1393 }
1394
1395 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1396 {
1397         struct mempolicy *mpol = NULL;
1398         if (sbinfo->mpol) {
1399                 spin_lock(&sbinfo->stat_lock);  /* prevent replace/use races */
1400                 mpol = sbinfo->mpol;
1401                 mpol_get(mpol);
1402                 spin_unlock(&sbinfo->stat_lock);
1403         }
1404         return mpol;
1405 }
1406 #else /* !CONFIG_NUMA || !CONFIG_TMPFS */
/* Stub for !CONFIG_NUMA || !CONFIG_TMPFS: prints nothing. */
1407 static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1408 {
1409 }
/* Stub for !CONFIG_NUMA || !CONFIG_TMPFS: always returns NULL. */
1410 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1411 {
1412         return NULL;
1413 }
1414 #endif /* CONFIG_NUMA && CONFIG_TMPFS */
/*
 * Without CONFIG_NUMA, alias the name vm_policy to vm_private_data so that
 * code in this file which accesses vma->vm_policy still compiles.
 * NOTE(review): presumably struct vm_area_struct only has a vm_policy
 * member under CONFIG_NUMA -- confirm against the struct definition.
 */
1415 #ifndef CONFIG_NUMA
1416 #define vm_policy vm_private_data
1417 #endif
1418
1419 static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
1420                 struct shmem_inode_info *info, pgoff_t index)
1421 {
1422         /* Create a pseudo vma that just contains the policy */
1423         vma_init(vma, NULL);
1424         /* Bias interleave by inode number to distribute better across nodes */
1425         vma->vm_pgoff = index + info->vfs_inode.i_ino;
1426         vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1427 }
1428
1429 static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
1430 {
1431         /* Drop reference taken by mpol_shared_policy_lookup() */
1432         mpol_cond_put(vma->vm_policy);
1433 }
1434
1435 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1436                         struct shmem_inode_info *info, pgoff_t index)
1437 {
1438         struct vm_area_struct pvma;
1439         struct page *page;
1440         struct vm_fault vmf;
1441
1442         shmem_pseudo_vma_init(&pvma, info, index);
1443         vmf.vma = &pvma;
1444         vmf.address = 0;
1445         page = swap_cluster_readahead(swap, gfp, &vmf);
1446         shmem_pseudo_vma_destroy(&pvma);
1447
1448         return page;
1449 }
1450
1451 static struct page *shmem_alloc_hugepage(gfp_t gfp,
1452                 struct shmem_inode_info *info, pgoff_t index)
1453 {
1454         struct vm_area_struct pvma;
1455         struct address_space *mapping = info->vfs_inode.i_mapping;
1456         pgoff_t hindex;
1457         struct page *page;
1458
1459         if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1460                 return NULL;
1461
1462         hindex = round_down(index, HPAGE_PMD_NR);
1463         if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
1464                                                                 XA_PRESENT))
1465                 return NULL;
1466
1467         shmem_pseudo_vma_init(&pvma, info, hindex);
1468         page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
1469                         HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
1470         shmem_pseudo_vma_destroy(&pvma);
1471         if (page)
1472                 prep_transhuge_page(page);
1473         return page;
1474 }
1475
1476 static struct page *shmem_alloc_page(gfp_t gfp,
1477                         struct shmem_inode_info *info, pgoff_t index)
1478 {
1479         struct vm_area_struct pvma;
1480         struct page *page;
1481
1482         shmem_pseudo_vma_init(&pvma, info, index);
1483         page = alloc_page_vma(gfp, &pvma, 0);
1484         shmem_pseudo_vma_destroy(&pvma);
1485
1486         return page;
1487 }
1488
1489 static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
1490                 struct inode *inode,
1491                 pgoff_t index, bool huge)
1492 {
1493         struct shmem_inode_info *info = SHMEM_I(inode);
1494         struct page *page;
1495         int nr;
1496         int err = -ENOSPC;
1497
1498         if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1499                 huge = false;
1500         nr = huge ? HPAGE_PMD_NR : 1;
1501
1502         if (!shmem_inode_acct_block(inode, nr))
1503                 goto failed;
1504
1505         if (huge)
1506                 page = shmem_alloc_hugepage(gfp, info, index);
1507         else
1508                 page = shmem_alloc_page(gfp, info, index);
1509         if (page) {
1510                 __SetPageLocked(page);
1511                 __SetPageSwapBacked(page);
1512                 return page;
1513         }
1514
1515         err = -ENOMEM;
1516         shmem_inode_unacct_blocks(inode, nr);
1517 failed:
1518         return ERR_PTR(err);
1519 }
1520
1521 /*
1522  * When a page is moved from swapcache to shmem filecache (either by the
1523  * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
1524  * shmem_unuse_inode()), it may have been read in earlier from swap, in
1525  * ignorance of the mapping it belongs to.  If that mapping has special
1526  * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
1527  * we may need to copy to a suitable page before moving to filecache.
1528  *
1529  * In a future release, this may well be extended to respect cpuset and
1530  * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
1531  * but for now it is a simple matter of zone.
1532  */
1533 static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
1534 {
1535         return page_zonenum(page) > gfp_zone(gfp);
1536 }
1537
/*
 * Substitute a newly allocated page for *pagep in the old page's mapping.
 *
 * The new page is allocated with gfp minus GFP_CONSTRAINT_MASK, receives a
 * copy of the old page's contents and page_private value, and is marked
 * Locked, SwapBacked, Uptodate and SwapCache before shmem_replace_entry()
 * swaps the two.
 *
 * Returns 0 with *pagep pointing at the new page; -ENOMEM if no page could
 * be allocated; or the error from shmem_replace_entry().  On either error
 * *pagep is left untouched.  Except on -ENOMEM, the page being discarded
 * (the old one on success, the new one on failure) has SwapCache and
 * page_private cleared, is unlocked, and has put_page() called on it twice.
 */
1538 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1539                                 struct shmem_inode_info *info, pgoff_t index)
1540 {
1541         struct page *oldpage, *newpage;
1542         struct address_space *swap_mapping;
1543         swp_entry_t entry;
1544         pgoff_t swap_index;
1545         int error;
1546
             /* Recover the swap entry and mapping from the old page itself */
1547         oldpage = *pagep;
1548         entry.val = page_private(oldpage);
1549         swap_index = swp_offset(entry);
1550         swap_mapping = page_mapping(oldpage);
1551
1552         /*
1553          * We have arrived here because our zones are constrained, so don't
1554          * limit chance of success by further cpuset and node constraints.
1555          */
1556         gfp &= ~GFP_CONSTRAINT_MASK;
1557         newpage = shmem_alloc_page(gfp, info, index);
1558         if (!newpage)
1559                 return -ENOMEM;
1560
             /* Extra reference on top of the one from the allocation */
1561         get_page(newpage);
1562         copy_highpage(newpage, oldpage);
1563         flush_dcache_page(newpage);
1564
1565         __SetPageLocked(newpage);
1566         __SetPageSwapBacked(newpage);
1567         SetPageUptodate(newpage);
1568         set_page_private(newpage, entry.val);
1569         SetPageSwapCache(newpage);
1570
1571         /*
1572          * Our caller will very soon move newpage out of swapcache, but it's
1573          * a nice clean interface for us to replace oldpage by newpage there.
1574          */
1575         xa_lock_irq(&swap_mapping->i_pages);
1576         error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
1577         if (!error) {
                     /* Move the NR_FILE_PAGES count from the old page to the new */
1578                 __inc_node_page_state(newpage, NR_FILE_PAGES);
1579                 __dec_node_page_state(oldpage, NR_FILE_PAGES);
1580         }
1581         xa_unlock_irq(&swap_mapping->i_pages);
1582
1583         if (unlikely(error)) {
1584                 /*
1585                  * Is this possible?  I think not, now that our callers check
1586                  * both PageSwapCache and page_private after getting page lock;
1587                  * but be defensive.  Reverse old to newpage for clear and free.
1588                  */
1589                 oldpage = newpage;
1590         } else {
1591                 mem_cgroup_migrate(oldpage, newpage);
1592                 lru_cache_add_anon(newpage);
1593                 *pagep = newpage;
1594         }
1595
             /*
              * From here on "oldpage" names the page being discarded: the
              * original page on success, the unused new page on failure.
              */
1596         ClearPageSwapCache(oldpage);
1597         set_page_private(oldpage, 0);
1598
1599         unlock_page(oldpage);
1600         put_page(oldpage);
1601         put_page(oldpage);
1602         return error;
1603 }
1604
1605 /*
1606  * Swap in the page pointed to by *pagep.
1607  * Caller has to make sure that *pagep contains a valid swapped page.
1608  * Returns 0 and the page in *pagep on success. On failure, returns the
1609  * error code and NULL in *pagep.
1610  */
1611 static int shmem_swapin_page(struct inode *inode, pgoff_t index,
1612                              struct page **pagep, enum sgp_type sgp,
1613                              gfp_t gfp, struct vm_area_struct *vma,
1614                              vm_fault_t *fault_type)
1615 {
1616         struct address_space *mapping = inode->i_mapping;
1617         struct shmem_inode_info *info = SHMEM_I(inode);
1618         struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
1619         struct mem_cgroup *memcg;
1620         struct page *page;
1621         swp_entry_t swap;
1622         int error;
1623
1624         VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
1625         swap = radix_to_swp_entry(*pagep);
1626         *pagep = NULL;
1627
1628         /* Look it up and read it in.. */
1629         page = lookup_swap_cache(swap, NULL, 0);
1630         if (!page) {
1631                 /* Or update major stats only when swapin succeeds?? */
1632                 if (fault_type) {
1633                         *fault_type |= VM_FAULT_MAJOR;
1634                         count_vm_event(PGMAJFAULT);
1635                         count_memcg_event_mm(charge_mm, PGMAJFAULT);
1636                 }
1637                 /* Here we actually start the io */
1638                 page = shmem_swapin(swap, gfp, info, index);
1639                 if (!page) {
1640                         error = -ENOMEM;
1641                         goto failed;
1642                 }
1643         }
1644
1645         /* We have to do this with page locked to prevent races */
1646         lock_page(page);
1647         if (!PageSwapCache(page) || page_private(page) != swap.val ||
1648             !shmem_confirm_swap(mapping, index, swap)) {
1649                 error = -EEXIST;
1650                 goto unlock;
1651         }
1652         if (!PageUptodate(page)) {
1653                 error = -EIO;
1654                 goto failed;
1655         }
1656         wait_on_page_writeback(page);
1657
1658         if (shmem_should_replace_page(page, gfp)) {
1659                 error = shmem_replace_page(&page, gfp, info, index);
1660                 if (error)
1661                         goto failed;
1662         }
1663
1664         error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1665                                             false);
1666         if (!error) {
1667                 error = shmem_add_to_page_cache(page, mapping, index,
1668                                                 swp_to_radix_entry(swap), gfp);
1669                 /*
1670                  * We already confirmed swap under page lock, and make
1671                  * no memory allocation here, so usually no possibility
1672                  * of error; but free_swap_and_cache() only trylocks a
1673                  * page, so it is just possible that the entry has been
1674                  * truncated or holepunched since swap was confirmed.
1675                  * shmem_undo_range() will have done some of the
1676                  * unaccounting, now delete_from_swap_cache() will do
1677                  * the rest.
1678                  */
1679                 if (error) {
1680                         mem_cgroup_cancel_charge(page, memcg, false);
1681                         delete_from_swap_cache(page);
1682                 }
1683         }
1684         if (error)
1685                 goto failed;
1686
1687         mem_cgroup_commit_charge(page, memcg, true, false);
1688
1689         spin_lock_irq(&info->lock);
1690         info->swapped--;
1691         shmem_recalc_inode(inode);
1692         spin_unlock_irq(&info->lock);
1693
1694         if (sgp == SGP_WRITE)
1695                 mark_page_accessed(page);
1696
1697         delete_from_swap_cache(page);
1698         set_page_dirty(page);
1699         swap_free(swap);
1700
1701         *pagep = page;
1702         return 0;
1703 failed:
1704         if (!shmem_confirm_swap(mapping, index, swap))
1705                 error = -EEXIST;
1706 unlock:
1707         if (page) {
1708                 unlock_page(page);
1709                 put_page(page);
1710         }
1711
1712         return error;
1713 }
1714
1715 /*
1716  * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1717  *
1718  * If we allocate a new one we do not mark it dirty. That's up to the
1719  * vm. If we swap it in we mark it dirty since we also free the swap
1720  * entry since a page cannot live in both the swap and page cache.
1721  *
1722  * fault_mm and fault_type are only supplied by shmem_fault:
1723  * otherwise they are NULL.
1724  */
1725 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1726         struct page **pagep, enum sgp_type sgp, gfp_t gfp,
1727         struct vm_area_struct *vma, struct vm_fault *vmf,
1728                         vm_fault_t *fault_type)
1729 {
1730         struct address_space *mapping = inode->i_mapping;
1731         struct shmem_inode_info *info = SHMEM_I(inode);
1732         struct shmem_sb_info *sbinfo;
1733         struct mm_struct *charge_mm;
1734         struct mem_cgroup *memcg;
1735         struct page *page;
1736         enum sgp_type sgp_huge = sgp;
1737         pgoff_t hindex = index;
1738         int error;
1739         int once = 0;
1740         int alloced = 0;
1741
1742         if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
1743                 return -EFBIG;
1744         if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
1745                 sgp = SGP_CACHE;
1746 repeat:
1747         if (sgp <= SGP_CACHE &&
1748             ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1749                 return -EINVAL;
1750         }
1751
1752         sbinfo = SHMEM_SB(inode->i_sb);
1753         charge_mm = vma ? vma->vm_mm : current->mm;
1754
1755         page = find_lock_entry(mapping, index);
1756         if (xa_is_value(page)) {
1757                 error = shmem_swapin_page(inode, index, &page,
1758                                           sgp, gfp, vma, fault_type);
1759                 if (error == -EEXIST)
1760                         goto repeat;
1761
1762                 *pagep = page;
1763                 return error;
1764         }
1765
1766         if (page && sgp == SGP_WRITE)
1767                 mark_page_accessed(page);
1768
1769         /* fallocated page? */
1770         if (page && !PageUptodate(page)) {
1771                 if (sgp != SGP_READ)
1772                         goto clear;
1773                 unlock_page(page);
1774                 put_page(page);
1775                 page = NULL;
1776         }
1777         if (page || sgp == SGP_READ) {
1778                 *pagep = page;
1779                 return 0;
1780         }
1781
1782         /*
1783          * Fast cache lookup did not find it:
1784          * bring it back from swap or allocate.
1785          */
1786
1787         if (vma && userfaultfd_missing(vma)) {
1788                 *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1789                 return 0;
1790         }
1791
1792         /* shmem_symlink() */
1793         if (mapping->a_ops != &shmem_aops)
1794                 goto alloc_nohuge;
1795         if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
1796                 goto alloc_nohuge;
1797         if (shmem_huge == SHMEM_HUGE_FORCE)
1798                 goto alloc_huge;
1799         switch (sbinfo->huge) {
1800                 loff_t i_size;
1801                 pgoff_t off;
1802         case SHMEM_HUGE_NEVER:
1803                 goto alloc_nohuge;
1804         case SHMEM_HUGE_WITHIN_SIZE:
1805                 off = round_up(index, HPAGE_PMD_NR);
1806                 i_size = round_up(i_size_read(inode), PAGE_SIZE);
1807                 if (i_size >= HPAGE_PMD_SIZE &&
1808                     i_size >> PAGE_SHIFT >= off)
1809                         goto alloc_huge;
1810                 /* fallthrough */
1811         case SHMEM_HUGE_ADVISE:
1812                 if (sgp_huge == SGP_HUGE)
1813                         goto alloc_huge;
1814                 /* TODO: implement fadvise() hints */
1815                 goto alloc_nohuge;
1816         }
1817
1818 alloc_huge:
1819         page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1820         if (IS_ERR(page)) {
1821 alloc_nohuge:
1822                 page = shmem_alloc_and_acct_page(gfp, inode,
1823                                                  index, false);
1824         }
1825         if (IS_ERR(page)) {
1826                 int retry = 5;
1827
1828                 error = PTR_ERR(page);
1829                 page = NULL;
1830                 if (error != -ENOSPC)
1831                         goto unlock;
1832                 /*
1833                  * Try to reclaim some space by splitting a huge page
1834                  * beyond i_size on the filesystem.
1835                  */
1836                 while (retry--) {
1837                         int ret;
1838
1839                         ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1840                         if (ret == SHRINK_STOP)
1841                                 break;
1842                         if (ret)
1843                                 goto alloc_nohuge;
1844                 }
1845                 goto unlock;
1846         }
1847
1848         if (PageTransHuge(page))
1849                 hindex = round_down(index, HPAGE_PMD_NR);
1850         else
1851                 hindex = index;
1852
1853         if (sgp == SGP_WRITE)
1854                 __SetPageReferenced(page);
1855
1856         error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1857                                             PageTransHuge(page));
1858         if (error)
1859                 goto unacct;
1860         error = shmem_add_to_page_cache(page, mapping, hindex,
1861                                         NULL, gfp & GFP_RECLAIM_MASK);
1862         if (error) {
1863                 mem_cgroup_cancel_charge(page, memcg,
1864                                          PageTransHuge(page));
1865                 goto unacct;
1866         }
1867         mem_cgroup_commit_charge(page, memcg, false,
1868                                  PageTransHuge(page));
1869         lru_cache_add_anon(page);
1870
1871         spin_lock_irq(&info->lock);
1872         info->alloced += 1 << compound_order(page);
1873         inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1874         shmem_recalc_inode(inode);
1875         spin_unlock_irq(&info->lock);
1876         alloced = true;
1877
1878         if (PageTransHuge(page) &&
1879             DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1880                         hindex + HPAGE_PMD_NR - 1) {
1881                 /*
1882                  * Part of the huge page is beyond i_size: subject
1883                  * to shrink under memory pressure.
1884                  */
1885                 spin_lock(&sbinfo->shrinklist_lock);
1886                 /*
1887                  * _careful to defend against unlocked access to
1888                  * ->shrink_list in shmem_unused_huge_shrink()
1889                  */
1890                 if (list_empty_careful(&info->shrinklist)) {
1891                         list_add_tail(&info->shrinklist,
1892                                       &sbinfo->shrinklist);
1893                         sbinfo->shrinklist_len++;
1894                 }
1895                 spin_unlock(&sbinfo->shrinklist_lock);
1896         }
1897
1898         /*
1899          * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1900          */
1901         if (sgp == SGP_FALLOC)
1902                 sgp = SGP_WRITE;
1903 clear:
1904         /*
1905          * Let SGP_WRITE caller clear ends if write does not fill page;
1906          * but SGP_FALLOC on a page fallocated earlier must initialize
1907          * it now, lest undo on failure cancel our earlier guarantee.
1908          */
1909         if (sgp != SGP_WRITE && !PageUptodate(page)) {
1910                 struct page *head = compound_head(page);
1911                 int i;
1912
1913                 for (i = 0; i < (1 << compound_order(head)); i++) {
1914                         clear_highpage(head + i);
1915                         flush_dcache_page(head + i);
1916                 }
1917                 SetPageUptodate(head);
1918         }
1919
1920         /* Perhaps the file has been truncated since we checked */
1921         if (sgp <= SGP_CACHE &&
1922             ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1923                 if (alloced) {
1924                         ClearPageDirty(page);
1925                         delete_from_page_cache(page);
1926                         spin_lock_irq(&info->lock);
1927                         shmem_recalc_inode(inode);
1928                         spin_unlock_irq(&info->lock);
1929                 }
1930                 error = -EINVAL;
1931                 goto unlock;
1932         }
1933         *pagep = page + index - hindex;
1934         return 0;
1935
1936         /*
1937          * Error recovery.
1938          */
1939 unacct:
1940         shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
1941
1942         if (PageTransHuge(page)) {
1943                 unlock_page(page);
1944                 put_page(page);
1945                 goto alloc_nohuge;
1946         }
1947 unlock:
1948         if (page) {
1949                 unlock_page(page);
1950                 put_page(page);
1951         }
1952         if (error == -ENOSPC && !once++) {
1953                 spin_lock_irq(&info->lock);
1954                 shmem_recalc_inode(inode);
1955                 spin_unlock_irq(&info->lock);
1956                 goto repeat;
1957         }
1958         if (error == -EEXIST)
1959                 goto repeat;
1960         return error;
1961 }
1962
1963 /*
1964  * This is like autoremove_wake_function, but it removes the wait queue
1965  * entry unconditionally - even if something else had already woken the
1966  * target.
1967  */
1968 static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
1969 {
1970         int ret = default_wake_function(wait, mode, sync, key);
1971         list_del_init(&wait->entry);
1972         return ret;
1973 }
1974
1975 static vm_fault_t shmem_fault(struct vm_fault *vmf)
1976 {
1977         struct vm_area_struct *vma = vmf->vma;
1978         struct inode *inode = file_inode(vma->vm_file);
1979         gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
1980         enum sgp_type sgp;
1981         int err;
1982         vm_fault_t ret = VM_FAULT_LOCKED;
1983
1984         /*
1985          * Trinity finds that probing a hole which tmpfs is punching can
1986          * prevent the hole-punch from ever completing: which in turn
1987          * locks writers out with its hold on i_mutex.  So refrain from
1988          * faulting pages into the hole while it's being punched.  Although
1989          * shmem_undo_range() does remove the additions, it may be unable to
1990          * keep up, as each new page needs its own unmap_mapping_range() call,
1991          * and the i_mmap tree grows ever slower to scan if new vmas are added.
1992          *
1993          * It does not matter if we sometimes reach this check just before the
1994          * hole-punch begins, so that one fault then races with the punch:
1995          * we just need to make racing faults a rare case.
1996          *
1997          * The implementation below would be much simpler if we just used a
1998          * standard mutex or completion: but we cannot take i_mutex in fault,
1999          * and bloating every shmem inode for this unlikely case would be sad.
2000          */
2001         if (unlikely(inode->i_private)) {
2002                 struct shmem_falloc *shmem_falloc;
2003
2004                 spin_lock(&inode->i_lock);
2005                 shmem_falloc = inode->i_private;
2006                 if (shmem_falloc &&
2007                     shmem_falloc->waitq &&
2008                     vmf->pgoff >= shmem_falloc->start &&
2009                     vmf->pgoff < shmem_falloc->next) {
2010                         wait_queue_head_t *shmem_falloc_waitq;
2011                         DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
2012
2013                         ret = VM_FAULT_NOPAGE;
2014                         if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
2015                            !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
2016                                 /* It's polite to up mmap_sem if we can */
2017                                 up_read(&vma->vm_mm->mmap_sem);
2018                                 ret = VM_FAULT_RETRY;
2019                         }
2020
2021                         shmem_falloc_waitq = shmem_falloc->waitq;
2022                         prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
2023                                         TASK_UNINTERRUPTIBLE);
2024                         spin_unlock(&inode->i_lock);
2025                         schedule();
2026
2027                         /*
2028                          * shmem_falloc_waitq points into the shmem_fallocate()
2029                          * stack of the hole-punching task: shmem_falloc_waitq
2030                          * is usually invalid by the time we reach here, but
2031                          * finish_wait() does not dereference it in that case;
2032                          * though i_lock needed lest racing with wake_up_all().
2033                          */
2034                         spin_lock(&inode->i_lock);
2035                         finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
2036                         spin_unlock(&inode->i_lock);
2037                         return ret;
2038                 }
2039                 spin_unlock(&inode->i_lock);
2040         }
2041
2042         sgp = SGP_CACHE;
2043
2044         if ((vma->vm_flags & VM_NOHUGEPAGE) ||
2045             test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
2046                 sgp = SGP_NOHUGE;
2047         else if (vma->vm_flags & VM_HUGEPAGE)
2048                 sgp = SGP_HUGE;
2049
2050         err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
2051                                   gfp, vma, vmf, &ret);
2052         if (err)
2053                 return vmf_error(err);
2054         return ret;
2055 }
2056
2057 unsigned long shmem_get_unmapped_area(struct file *file,
2058                                       unsigned long uaddr, unsigned long len,
2059                                       unsigned long pgoff, unsigned long flags)
2060 {
2061         unsigned long (*get_area)(struct file *,
2062                 unsigned long, unsigned long, unsigned long, unsigned long);
2063         unsigned long addr;
2064         unsigned long offset;
2065         unsigned long inflated_len;
2066         unsigned long inflated_addr;
2067         unsigned long inflated_offset;
2068
2069         if (len > TASK_SIZE)
2070                 return -ENOMEM;
2071
2072         get_area = current->mm->get_unmapped_area;
2073         addr = get_area(file, uaddr, len, pgoff, flags);
2074
2075         if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
2076                 return addr;
2077         if (IS_ERR_VALUE(addr))
2078                 return addr;
2079         if (addr & ~PAGE_MASK)
2080                 return addr;
2081         if (addr > TASK_SIZE - len)
2082                 return addr;
2083
2084         if (shmem_huge == SHMEM_HUGE_DENY)
2085                 return addr;
2086         if (len < HPAGE_PMD_SIZE)
2087                 return addr;
2088         if (flags & MAP_FIXED)
2089                 return addr;
2090         /*
2091          * Our priority is to support MAP_SHARED mapped hugely;
2092          * and support MAP_PRIVATE mapped hugely too, until it is COWed.
2093          * But if caller specified an address hint, respect that as before.
2094          */
2095         if (uaddr)
2096                 return addr;
2097
2098         if (shmem_huge != SHMEM_HUGE_FORCE) {
2099                 struct super_block *sb;
2100
2101                 if (file) {
2102                         VM_BUG_ON(file->f_op != &shmem_file_operations);
2103                         sb = file_inode(file)->i_sb;
2104                 } else {
2105                         /*
2106                          * Called directly from mm/mmap.c, or drivers/char/mem.c
2107                          * for "/dev/zero", to create a shared anonymous object.
2108                          */
2109                         if (IS_ERR(shm_mnt))
2110                                 return addr;
2111                         sb = shm_mnt->mnt_sb;
2112                 }
2113                 if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
2114                         return addr;
2115         }
2116
2117         offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
2118         if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
2119                 return addr;
2120         if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
2121                 return addr;
2122
2123         inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
2124         if (inflated_len > TASK_SIZE)
2125                 return addr;
2126         if (inflated_len < len)
2127                 return addr;
2128
2129         inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
2130         if (IS_ERR_VALUE(inflated_addr))
2131                 return addr;
2132         if (inflated_addr & ~PAGE_MASK)
2133                 return addr;
2134
2135         inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
2136         inflated_addr += offset - inflated_offset;
2137         if (inflated_offset > offset)
2138                 inflated_addr += HPAGE_PMD_SIZE;
2139
2140         if (inflated_addr > TASK_SIZE - len)
2141                 return addr;
2142         return inflated_addr;
2143 }
2144
2145 #ifdef CONFIG_NUMA
2146 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2147 {
2148         struct inode *inode = file_inode(vma->vm_file);
2149         return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2150 }
2151
2152 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2153                                           unsigned long addr)
2154 {
2155         struct inode *inode = file_inode(vma->vm_file);
2156         pgoff_t index;
2157
2158         index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2159         return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2160 }
2161 #endif
2162
2163 int shmem_lock(struct file *file, int lock, struct user_struct *user)
2164 {
2165         struct inode *inode = file_inode(file);
2166         struct shmem_inode_info *info = SHMEM_I(inode);
2167         int retval = -ENOMEM;
2168
2169         spin_lock_irq(&info->lock);
2170         if (lock && !(info->flags & VM_LOCKED)) {
2171                 if (!user_shm_lock(inode->i_size, user))
2172                         goto out_nomem;
2173                 info->flags |= VM_LOCKED;
2174                 mapping_set_unevictable(file->f_mapping);
2175         }
2176         if (!lock && (info->flags & VM_LOCKED) && user) {
2177                 user_shm_unlock(inode->i_size, user);
2178                 info->flags &= ~VM_LOCKED;
2179                 mapping_clear_unevictable(file->f_mapping);
2180         }
2181         retval = 0;
2182
2183 out_nomem:
2184         spin_unlock_irq(&info->lock);
2185         return retval;
2186 }
2187
2188 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
2189 {
2190         struct shmem_inode_info *info = SHMEM_I(file_inode(file));
2191
2192         if (info->seals & F_SEAL_FUTURE_WRITE) {
2193                 /*
2194                  * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
2195                  * "future write" seal active.
2196                  */
2197                 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
2198                         return -EPERM;
2199
2200                 /*
2201                  * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
2202                  * read-only mapping, take care to not allow mprotect to revert
2203                  * protections.
2204                  */
2205                 vma->vm_flags &= ~(VM_MAYWRITE);
2206         }
2207
2208         file_accessed(file);
2209         vma->vm_ops = &shmem_vm_ops;
2210         if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
2211                         ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
2212                         (vma->vm_end & HPAGE_PMD_MASK)) {
2213                 khugepaged_enter(vma, vma->vm_flags);
2214         }
2215         return 0;
2216 }
2217
2218 static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
2219                                      umode_t mode, dev_t dev, unsigned long flags)
2220 {
2221         struct inode *inode;
2222         struct shmem_inode_info *info;
2223         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2224
2225         if (shmem_reserve_inode(sb))
2226                 return NULL;
2227
2228         inode = new_inode(sb);
2229         if (inode) {
2230                 inode->i_ino = get_next_ino();
2231                 inode_init_owner(inode, dir, mode);
2232                 inode->i_blocks = 0;
2233                 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
2234                 inode->i_generation = prandom_u32();
2235                 info = SHMEM_I(inode);
2236                 memset(info, 0, (char *)inode - (char *)info);
2237                 spin_lock_init(&info->lock);
2238                 atomic_set(&info->stop_eviction, 0);
2239                 info->seals = F_SEAL_SEAL;
2240                 info->flags = flags & VM_NORESERVE;
2241                 INIT_LIST_HEAD(&info->shrinklist);
2242                 INIT_LIST_HEAD(&info->swaplist);
2243                 simple_xattrs_init(&info->xattrs);
2244                 cache_no_acl(inode);
2245
2246                 switch (mode & S_IFMT) {
2247                 default:
2248                         inode->i_op = &shmem_special_inode_operations;
2249                         init_special_inode(inode, mode, dev);
2250                         break;
2251                 case S_IFREG:
2252                         inode->i_mapping->a_ops = &shmem_aops;
2253                         inode->i_op = &shmem_inode_operations;
2254                         inode->i_fop = &shmem_file_operations;
2255                         mpol_shared_policy_init(&info->policy,
2256                                                  shmem_get_sbmpol(sbinfo));
2257                         break;
2258                 case S_IFDIR:
2259                         inc_nlink(inode);
2260                         /* Some things misbehave if size == 0 on a directory */
2261                         inode->i_size = 2 * BOGO_DIRENT_SIZE;
2262                         inode->i_op = &shmem_dir_inode_operations;
2263                         inode->i_fop = &simple_dir_operations;
2264                         break;
2265                 case S_IFLNK:
2266                         /*
2267                          * Must not load anything in the rbtree,
2268                          * mpol_free_shared_policy will not be called.
2269                          */
2270                         mpol_shared_policy_init(&info->policy, NULL);
2271                         break;
2272                 }
2273
2274                 lockdep_annotate_inode_mutex_key(inode);
2275         } else
2276                 shmem_free_inode(sb);
2277         return inode;
2278 }
2279
2280 bool shmem_mapping(struct address_space *mapping)
2281 {
2282         return mapping->a_ops == &shmem_aops;
2283 }
2284
2285 static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2286                                   pmd_t *dst_pmd,
2287                                   struct vm_area_struct *dst_vma,
2288                                   unsigned long dst_addr,
2289                                   unsigned long src_addr,
2290                                   bool zeropage,
2291                                   struct page **pagep)
2292 {
2293         struct inode *inode = file_inode(dst_vma->vm_file);
2294         struct shmem_inode_info *info = SHMEM_I(inode);
2295         struct address_space *mapping = inode->i_mapping;
2296         gfp_t gfp = mapping_gfp_mask(mapping);
2297         pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
2298         struct mem_cgroup *memcg;
2299         spinlock_t *ptl;
2300         void *page_kaddr;
2301         struct page *page;
2302         pte_t _dst_pte, *dst_pte;
2303         int ret;
2304         pgoff_t offset, max_off;
2305
2306         ret = -ENOMEM;
2307         if (!shmem_inode_acct_block(inode, 1))
2308                 goto out;
2309
2310         if (!*pagep) {
2311                 page = shmem_alloc_page(gfp, info, pgoff);
2312                 if (!page)
2313                         goto out_unacct_blocks;
2314
2315                 if (!zeropage) {        /* mcopy_atomic */
2316                         page_kaddr = kmap_atomic(page);
2317                         ret = copy_from_user(page_kaddr,
2318                                              (const void __user *)src_addr,
2319                                              PAGE_SIZE);
2320                         kunmap_atomic(page_kaddr);
2321
2322                         /* fallback to copy_from_user outside mmap_sem */
2323                         if (unlikely(ret)) {
2324                                 *pagep = page;
2325                                 shmem_inode_unacct_blocks(inode, 1);
2326                                 /* don't free the page */
2327                                 return -ENOENT;
2328                         }
2329                 } else {                /* mfill_zeropage_atomic */
2330                         clear_highpage(page);
2331                 }
2332         } else {
2333                 page = *pagep;
2334                 *pagep = NULL;
2335         }
2336
2337         VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
2338         __SetPageLocked(page);
2339         __SetPageSwapBacked(page);
2340         __SetPageUptodate(page);
2341
2342         ret = -EFAULT;
2343         offset = linear_page_index(dst_vma, dst_addr);
2344         max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2345         if (unlikely(offset >= max_off))
2346                 goto out_release;
2347
2348         ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
2349         if (ret)
2350                 goto out_release;
2351
2352         ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
2353                                                 gfp & GFP_RECLAIM_MASK);
2354         if (ret)
2355                 goto out_release_uncharge;
2356
2357         mem_cgroup_commit_charge(page, memcg, false, false);
2358
2359         _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
2360         if (dst_vma->vm_flags & VM_WRITE)
2361                 _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
2362         else {
2363                 /*
2364                  * We don't set the pte dirty if the vma has no
2365                  * VM_WRITE permission, so mark the page dirty or it
2366                  * could be freed from under us. We could do it
2367                  * unconditionally before unlock_page(), but doing it
2368                  * only if VM_WRITE is not set is faster.
2369                  */
2370                 set_page_dirty(page);
2371         }
2372
2373         dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
2374
2375         ret = -EFAULT;
2376         max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2377         if (unlikely(offset >= max_off))
2378                 goto out_release_uncharge_unlock;
2379
2380         ret = -EEXIST;
2381         if (!pte_none(*dst_pte))
2382                 goto out_release_uncharge_unlock;
2383
2384         lru_cache_add_anon(page);
2385
2386         spin_lock(&info->lock);
2387         info->alloced++;
2388         inode->i_blocks += BLOCKS_PER_PAGE;
2389         shmem_recalc_inode(inode);
2390         spin_unlock(&info->lock);
2391
2392         inc_mm_counter(dst_mm, mm_counter_file(page));
2393         page_add_file_rmap(page, false);
2394         set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
2395
2396         /* No need to invalidate - it was non-present before */
2397         update_mmu_cache(dst_vma, dst_addr, dst_pte);
2398         pte_unmap_unlock(dst_pte, ptl);
2399         unlock_page(page);
2400         ret = 0;
2401 out:
2402         return ret;
2403 out_release_uncharge_unlock:
2404         pte_unmap_unlock(dst_pte, ptl);
2405         ClearPageDirty(page);
2406         delete_from_page_cache(page);
2407 out_release_uncharge:
2408         mem_cgroup_cancel_charge(page, memcg, false);
2409 out_release:
2410         unlock_page(page);
2411         put_page(page);
2412 out_unacct_blocks:
2413         shmem_inode_unacct_blocks(inode, 1);
2414         goto out;
2415 }
2416
2417 int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
2418                            pmd_t *dst_pmd,
2419                            struct vm_area_struct *dst_vma,
2420                            unsigned long dst_addr,
2421                            unsigned long src_addr,
2422                            struct page **pagep)
2423 {
2424         return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2425                                       dst_addr, src_addr, false, pagep);
2426 }
2427
2428 int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
2429                              pmd_t *dst_pmd,
2430                              struct vm_area_struct *dst_vma,
2431                              unsigned long dst_addr)
2432 {
2433         struct page *page = NULL;
2434
2435         return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2436                                       dst_addr, 0, true, &page);
2437 }
2438
2439 #ifdef CONFIG_TMPFS
2440 static const struct inode_operations shmem_symlink_inode_operations;
2441 static const struct inode_operations shmem_short_symlink_operations;
2442
2443 #ifdef CONFIG_TMPFS_XATTR
2444 static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2445 #else
2446 #define shmem_initxattrs NULL
2447 #endif
2448
2449 static int
2450 shmem_write_begin(struct file *file, struct address_space *mapping,
2451                         loff_t pos, unsigned len, unsigned flags,
2452                         struct page **pagep, void **fsdata)
2453 {
2454         struct inode *inode = mapping->host;
2455         struct shmem_inode_info *info = SHMEM_I(inode);
2456         pgoff_t index = pos >> PAGE_SHIFT;
2457
2458         /* i_mutex is held by caller */
2459         if (unlikely(info->seals & (F_SEAL_GROW |
2460                                    F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
2461                 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
2462                         return -EPERM;
2463                 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
2464                         return -EPERM;
2465         }
2466
2467         return shmem_getpage(inode, index, pagep, SGP_WRITE);
2468 }
2469
2470 static int
2471 shmem_write_end(struct file *file, struct address_space *mapping,
2472                         loff_t pos, unsigned len, unsigned copied,
2473                         struct page *page, void *fsdata)
2474 {
2475         struct inode *inode = mapping->host;
2476
2477         if (pos + copied > inode->i_size)
2478                 i_size_write(inode, pos + copied);
2479
2480         if (!PageUptodate(page)) {
2481                 struct page *head = compound_head(page);
2482                 if (PageTransCompound(page)) {
2483                         int i;
2484
2485                         for (i = 0; i < HPAGE_PMD_NR; i++) {
2486                                 if (head + i == page)
2487                                         continue;
2488                                 clear_highpage(head + i);
2489                                 flush_dcache_page(head + i);
2490                         }
2491                 }
2492                 if (copied < PAGE_SIZE) {
2493                         unsigned from = pos & (PAGE_SIZE - 1);
2494                         zero_user_segments(page, 0, from,
2495                                         from + copied, PAGE_SIZE);
2496                 }
2497                 SetPageUptodate(head);
2498         }
2499         set_page_dirty(page);
2500         unlock_page(page);
2501         put_page(page);
2502
2503         return copied;
2504 }
2505
2506 static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
2507 {
2508         struct file *file = iocb->ki_filp;
2509         struct inode *inode = file_inode(file);
2510         struct address_space *mapping = inode->i_mapping;
2511         pgoff_t index;
2512         unsigned long offset;
2513         enum sgp_type sgp = SGP_READ;
2514         int error = 0;
2515         ssize_t retval = 0;
2516         loff_t *ppos = &iocb->ki_pos;
2517
2518         /*
2519          * Might this read be for a stacking filesystem?  Then when reading
2520          * holes of a sparse file, we actually need to allocate those pages,
2521          * and even mark them dirty, so it cannot exceed the max_blocks limit.
2522          */
2523         if (!iter_is_iovec(to))
2524                 sgp = SGP_CACHE;
2525
2526         index = *ppos >> PAGE_SHIFT;
2527         offset = *ppos & ~PAGE_MASK;
2528
2529         for (;;) {
2530                 struct page *page = NULL;
2531                 pgoff_t end_index;
2532                 unsigned long nr, ret;
2533                 loff_t i_size = i_size_read(inode);
2534
2535                 end_index = i_size >> PAGE_SHIFT;
2536                 if (index > end_index)
2537                         break;
2538                 if (index == end_index) {
2539                         nr = i_size & ~PAGE_MASK;
2540                         if (nr <= offset)
2541                                 break;
2542                 }
2543
2544                 error = shmem_getpage(inode, index, &page, sgp);
2545                 if (error) {
2546                         if (error == -EINVAL)
2547                                 error = 0;
2548                         break;
2549                 }
2550                 if (page) {
2551                         if (sgp == SGP_CACHE)
2552                                 set_page_dirty(page);
2553                         unlock_page(page);
2554                 }
2555
2556                 /*
2557                  * We must evaluate after, since reads (unlike writes)
2558                  * are called without i_mutex protection against truncate
2559                  */
2560                 nr = PAGE_SIZE;
2561                 i_size = i_size_read(inode);
2562                 end_index = i_size >> PAGE_SHIFT;
2563                 if (index == end_index) {
2564                         nr = i_size & ~PAGE_MASK;
2565                         if (nr <= offset) {
2566                                 if (page)
2567                                         put_page(page);
2568                                 break;
2569                         }
2570                 }
2571                 nr -= offset;
2572
2573                 if (page) {
2574                         /*
2575                          * If users can be writing to this page using arbitrary
2576                          * virtual addresses, take care about potential aliasing
2577                          * before reading the page on the kernel side.
2578                          */
2579                         if (mapping_writably_mapped(mapping))
2580                                 flush_dcache_page(page);
2581                         /*
2582                          * Mark the page accessed if we read the beginning.
2583                          */
2584                         if (!offset)
2585                                 mark_page_accessed(page);
2586                 } else {
2587                         page = ZERO_PAGE(0);
2588                         get_page(page);
2589                 }
2590
2591                 /*
2592                  * Ok, we have the page, and it's up-to-date, so
2593                  * now we can copy it to user space...
2594                  */
2595                 ret = copy_page_to_iter(page, offset, nr, to);
2596                 retval += ret;
2597                 offset += ret;
2598                 index += offset >> PAGE_SHIFT;
2599                 offset &= ~PAGE_MASK;
2600
2601                 put_page(page);
2602                 if (!iov_iter_count(to))
2603                         break;
2604                 if (ret < nr) {
2605                         error = -EFAULT;
2606                         break;
2607                 }
2608                 cond_resched();
2609         }
2610
2611         *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
2612         file_accessed(file);
2613         return retval ? retval : error;
2614 }
2615
2616 /*
2617  * llseek SEEK_DATA or SEEK_HOLE through the page cache.
2618  */
2619 static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
2620                                     pgoff_t index, pgoff_t end, int whence)
2621 {
2622         struct page *page;
2623         struct pagevec pvec;
2624         pgoff_t indices[PAGEVEC_SIZE];
2625         bool done = false;
2626         int i;
2627
2628         pagevec_init(&pvec);
2629         pvec.nr = 1;            /* start small: we may be there already */
2630         while (!done) {
2631                 pvec.nr = find_get_entries(mapping, index,
2632                                         pvec.nr, pvec.pages, indices);
2633                 if (!pvec.nr) {
2634                         if (whence == SEEK_DATA)
2635                                 index = end;
2636                         break;
2637                 }
2638                 for (i = 0; i < pvec.nr; i++, index++) {
2639                         if (index < indices[i]) {
2640                                 if (whence == SEEK_HOLE) {
2641                                         done = true;
2642                                         break;
2643                                 }
2644                                 index = indices[i];
2645                         }
2646                         page = pvec.pages[i];
2647                         if (page && !xa_is_value(page)) {
2648                                 if (!PageUptodate(page))
2649                                         page = NULL;
2650                         }
2651                         if (index >= end ||
2652                             (page && whence == SEEK_DATA) ||
2653                             (!page && whence == SEEK_HOLE)) {
2654                                 done = true;
2655                                 break;
2656                         }
2657                 }
2658                 pagevec_remove_exceptionals(&pvec);
2659                 pagevec_release(&pvec);
2660                 pvec.nr = PAGEVEC_SIZE;
2661                 cond_resched();
2662         }
2663         return index;
2664 }
2665
2666 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
2667 {
2668         struct address_space *mapping = file->f_mapping;
2669         struct inode *inode = mapping->host;
2670         pgoff_t start, end;
2671         loff_t new_offset;
2672
2673         if (whence != SEEK_DATA && whence != SEEK_HOLE)
2674                 return generic_file_llseek_size(file, offset, whence,
2675                                         MAX_LFS_FILESIZE, i_size_read(inode));
2676         inode_lock(inode);
2677         /* We're holding i_mutex so we can access i_size directly */
2678
2679         if (offset < 0 || offset >= inode->i_size)
2680                 offset = -ENXIO;
2681         else {
2682                 start = offset >> PAGE_SHIFT;
2683                 end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
2684                 new_offset = shmem_seek_hole_data(mapping, start, end, whence);
2685                 new_offset <<= PAGE_SHIFT;
2686                 if (new_offset > offset) {
2687                         if (new_offset < inode->i_size)
2688                                 offset = new_offset;
2689                         else if (whence == SEEK_DATA)
2690                                 offset = -ENXIO;
2691                         else
2692                                 offset = inode->i_size;
2693                 }
2694         }
2695
2696         if (offset >= 0)
2697                 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
2698         inode_unlock(inode);
2699         return offset;
2700 }
2701
2702 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
2703                                                          loff_t len)
2704 {
2705         struct inode *inode = file_inode(file);
2706         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2707         struct shmem_inode_info *info = SHMEM_I(inode);
2708         struct shmem_falloc shmem_falloc;
2709         pgoff_t start, index, end;
2710         int error;
2711
2712         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2713                 return -EOPNOTSUPP;
2714
2715         inode_lock(inode);
2716
2717         if (mode & FALLOC_FL_PUNCH_HOLE) {
2718                 struct address_space *mapping = file->f_mapping;
2719                 loff_t unmap_start = round_up(offset, PAGE_SIZE);
2720                 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
2721                 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
2722
2723                 /* protected by i_mutex */
2724                 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
2725                         error = -EPERM;
2726                         goto out;
2727                 }
2728
2729                 shmem_falloc.waitq = &shmem_falloc_waitq;
2730                 shmem_falloc.start = unmap_start >> PAGE_SHIFT;
2731                 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
2732                 spin_lock(&inode->i_lock);
2733                 inode->i_private = &shmem_falloc;
2734                 spin_unlock(&inode->i_lock);
2735
2736                 if ((u64)unmap_end > (u64)unmap_start)
2737                         unmap_mapping_range(mapping, unmap_start,
2738                                             1 + unmap_end - unmap_start, 0);
2739                 shmem_truncate_range(inode, offset, offset + len - 1);
2740                 /* No need to unmap again: hole-punching leaves COWed pages */
2741
2742                 spin_lock(&inode->i_lock);
2743                 inode->i_private = NULL;
2744                 wake_up_all(&shmem_falloc_waitq);
2745                 WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
2746                 spin_unlock(&inode->i_lock);
2747                 error = 0;
2748                 goto out;
2749         }
2750
2751         /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
2752         error = inode_newsize_ok(inode, offset + len);
2753         if (error)
2754                 goto out;
2755
2756         if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
2757                 error = -EPERM;
2758                 goto out;
2759         }
2760
2761         start = offset >> PAGE_SHIFT;
2762         end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
2763         /* Try to avoid a swapstorm if len is impossible to satisfy */
2764         if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
2765                 error = -ENOSPC;
2766                 goto out;
2767         }
2768
2769         shmem_falloc.waitq = NULL;
2770         shmem_falloc.start = start;
2771         shmem_falloc.next  = start;
2772         shmem_falloc.nr_falloced = 0;
2773         shmem_falloc.nr_unswapped = 0;
2774         spin_lock(&inode->i_lock);
2775         inode->i_private = &shmem_falloc;
2776         spin_unlock(&inode->i_lock);
2777
2778         for (index = start; index < end; index++) {
2779                 struct page *page;
2780
2781                 /*
2782                  * Good, the fallocate(2) manpage permits EINTR: we may have
2783                  * been interrupted because we are using up too much memory.
2784                  */
2785                 if (signal_pending(current))
2786                         error = -EINTR;
2787                 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
2788                         error = -ENOMEM;
2789                 else
2790                         error = shmem_getpage(inode, index, &page, SGP_FALLOC);
2791                 if (error) {
2792                         /* Remove the !PageUptodate pages we added */
2793                         if (index > start) {
2794                                 shmem_undo_range(inode,
2795                                     (loff_t)start << PAGE_SHIFT,
2796                                     ((loff_t)index << PAGE_SHIFT) - 1, true);
2797                         }
2798                         goto undone;
2799                 }
2800
2801                 /*
2802                  * Inform shmem_writepage() how far we have reached.
2803                  * No need for lock or barrier: we have the page lock.
2804                  */
2805                 shmem_falloc.next++;
2806                 if (!PageUptodate(page))
2807                         shmem_falloc.nr_falloced++;
2808
2809                 /*
2810                  * If !PageUptodate, leave it that way so that freeable pages
2811                  * can be recognized if we need to rollback on error later.
2812                  * But set_page_dirty so that memory pressure will swap rather
2813                  * than free the pages we are allocating (and SGP_CACHE pages
2814                  * might still be clean: we now need to mark those dirty too).
2815                  */
2816                 set_page_dirty(page);
2817                 unlock_page(page);
2818                 put_page(page);
2819                 cond_resched();
2820         }
2821
2822         if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
2823                 i_size_write(inode, offset + len);
2824         inode->i_ctime = current_time(inode);
2825 undone:
2826         spin_lock(&inode->i_lock);
2827         inode->i_private = NULL;
2828         spin_unlock(&inode->i_lock);
2829 out:
2830         inode_unlock(inode);
2831         return error;
2832 }
2833
2834 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
2835 {
2836         struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
2837
2838         buf->f_type = TMPFS_MAGIC;
2839         buf->f_bsize = PAGE_SIZE;
2840         buf->f_namelen = NAME_MAX;
2841         if (sbinfo->max_blocks) {
2842                 buf->f_blocks = sbinfo->max_blocks;
2843                 buf->f_bavail =
2844                 buf->f_bfree  = sbinfo->max_blocks -
2845                                 percpu_counter_sum(&sbinfo->used_blocks);
2846         }
2847         if (sbinfo->max_inodes) {
2848                 buf->f_files = sbinfo->max_inodes;
2849                 buf->f_ffree = sbinfo->free_inodes;
2850         }
2851         /* else leave those fields 0 like simple_statfs */
2852         return 0;
2853 }
2854
2855 /*
2856  * File creation. Allocate an inode, and we're done..
2857  */
2858 static int
2859 shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
2860 {
2861         struct inode *inode;
2862         int error = -ENOSPC;
2863
2864         inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
2865         if (inode) {
2866                 error = simple_acl_create(dir, inode);
2867                 if (error)
2868                         goto out_iput;
2869                 error = security_inode_init_security(inode, dir,
2870                                                      &dentry->d_name,
2871                                                      shmem_initxattrs, NULL);
2872                 if (error && error != -EOPNOTSUPP)
2873                         goto out_iput;
2874
2875                 error = 0;
2876                 dir->i_size += BOGO_DIRENT_SIZE;
2877                 dir->i_ctime = dir->i_mtime = current_time(dir);
2878                 d_instantiate(dentry, inode);
2879                 dget(dentry); /* Extra count - pin the dentry in core */
2880         }
2881         return error;
2882 out_iput:
2883         iput(inode);
2884         return error;
2885 }
2886
2887 static int
2888 shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
2889 {
2890         struct inode *inode;
2891         int error = -ENOSPC;
2892
2893         inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
2894         if (inode) {
2895                 error = security_inode_init_security(inode, dir,
2896                                                      NULL,
2897                                                      shmem_initxattrs, NULL);
2898                 if (error && error != -EOPNOTSUPP)
2899                         goto out_iput;
2900                 error = simple_acl_create(dir, inode);
2901                 if (error)
2902                         goto out_iput;
2903                 d_tmpfile(dentry, inode);
2904         }
2905         return error;
2906 out_iput:
2907         iput(inode);
2908         return error;
2909 }
2910
2911 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2912 {
2913         int error;
2914
2915         if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
2916                 return error;
2917         inc_nlink(dir);
2918         return 0;
2919 }
2920
2921 static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2922                 bool excl)
2923 {
2924         return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
2925 }
2926
2927 /*
2928  * Link a file..
2929  */
2930 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
2931 {
2932         struct inode *inode = d_inode(old_dentry);
2933         int ret = 0;
2934
2935         /*
2936          * No ordinary (disk based) filesystem counts links as inodes;
2937          * but each new link needs a new dentry, pinning lowmem, and
2938          * tmpfs dentries cannot be pruned until they are unlinked.
2939          * But if an O_TMPFILE file is linked into the tmpfs, the
2940          * first link must skip that, to get the accounting right.
2941          */
2942         if (inode->i_nlink) {
2943                 ret = shmem_reserve_inode(inode->i_sb);
2944                 if (ret)
2945                         goto out;
2946         }
2947
2948         dir->i_size += BOGO_DIRENT_SIZE;
2949         inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
2950         inc_nlink(inode);
2951         ihold(inode);   /* New dentry reference */
2952         dget(dentry);           /* Extra pinning count for the created dentry */
2953         d_instantiate(dentry, inode);
2954 out:
2955         return ret;
2956 }
2957
2958 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
2959 {
2960         struct inode *inode = d_inode(dentry);
2961
2962         if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
2963                 shmem_free_inode(inode->i_sb);
2964
2965         dir->i_size -= BOGO_DIRENT_SIZE;
2966         inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
2967         drop_nlink(inode);
2968         dput(dentry);   /* Undo the count from "create" - this does all the work */
2969         return 0;
2970 }
2971
2972 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
2973 {
2974         if (!simple_empty(dentry))
2975                 return -ENOTEMPTY;
2976
2977         drop_nlink(d_inode(dentry));
2978         drop_nlink(dir);
2979         return shmem_unlink(dir, dentry);
2980 }
2981
2982 static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
2983 {
2984         bool old_is_dir = d_is_dir(old_dentry);
2985         bool new_is_dir = d_is_dir(new_dentry);
2986
2987         if (old_dir != new_dir && old_is_dir != new_is_dir) {
2988                 if (old_is_dir) {
2989                         drop_nlink(old_dir);
2990                         inc_nlink(new_dir);
2991                 } else {
2992                         drop_nlink(new_dir);
2993                         inc_nlink(old_dir);
2994                 }
2995         }
2996         old_dir->i_ctime = old_dir->i_mtime =
2997         new_dir->i_ctime = new_dir->i_mtime =
2998         d_inode(old_dentry)->i_ctime =
2999         d_inode(new_dentry)->i_ctime = current_time(old_dir);
3000
3001         return 0;
3002 }
3003
3004 static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
3005 {
3006         struct dentry *whiteout;
3007         int error;
3008
3009         whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
3010         if (!whiteout)
3011                 return -ENOMEM;
3012
3013         error = shmem_mknod(old_dir, whiteout,
3014                             S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
3015         dput(whiteout);
3016         if (error)
3017                 return error;
3018
3019         /*
3020          * Cheat and hash the whiteout while the old dentry is still in
3021          * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
3022          *
3023          * d_lookup() will consistently find one of them at this point,
3024          * not sure which one, but that isn't even important.
3025          */
3026         d_rehash(whiteout);
3027         return 0;
3028 }
3029
3030 /*
3031  * The VFS layer already does all the dentry stuff for rename,
3032  * we just have to decrement the usage count for the target if
3033  * it exists so that the VFS layer correctly free's it when it
3034  * gets overwritten.
3035  */
3036 static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
3037 {
3038         struct inode *inode = d_inode(old_dentry);
3039         int they_are_dirs = S_ISDIR(inode->i_mode);
3040
3041         if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
3042                 return -EINVAL;
3043
3044         if (flags & RENAME_EXCHANGE)
3045                 return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
3046
3047         if (!simple_empty(new_dentry))
3048                 return -ENOTEMPTY;
3049
3050         if (flags & RENAME_WHITEOUT) {
3051                 int error;
3052
3053                 error = shmem_whiteout(old_dir, old_dentry);
3054                 if (error)
3055                         return error;
3056         }
3057
3058         if (d_really_is_positive(new_dentry)) {
3059                 (void) shmem_unlink(new_dir, new_dentry);
3060                 if (they_are_dirs) {
3061                         drop_nlink(d_inode(new_dentry));
3062                         drop_nlink(old_dir);
3063                 }
3064         } else if (they_are_dirs) {
3065                 drop_nlink(old_dir);
3066                 inc_nlink(new_dir);
3067         }
3068
3069         old_dir->i_size -= BOGO_DIRENT_SIZE;
3070         new_dir->i_size += BOGO_DIRENT_SIZE;
3071         old_dir->i_ctime = old_dir->i_mtime =
3072         new_dir->i_ctime = new_dir->i_mtime =
3073         inode->i_ctime = current_time(old_dir);
3074         return 0;
3075 }
3076
3077 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
3078 {
3079         int error;
3080         int len;
3081         struct inode *inode;
3082         struct page *page;
3083
3084         len = strlen(symname) + 1;
3085         if (len > PAGE_SIZE)
3086                 return -ENAMETOOLONG;
3087
3088         inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
3089                                 VM_NORESERVE);
3090         if (!inode)
3091                 return -ENOSPC;
3092
3093         error = security_inode_init_security(inode, dir, &dentry->d_name,
3094                                              shmem_initxattrs, NULL);
3095         if (error) {
3096                 if (error != -EOPNOTSUPP) {
3097                         iput(inode);
3098                         return error;
3099                 }
3100                 error = 0;
3101         }
3102
3103         inode->i_size = len-1;
3104         if (len <= SHORT_SYMLINK_LEN) {
3105                 inode->i_link = kmemdup(symname, len, GFP_KERNEL);
3106                 if (!inode->i_link) {
3107                         iput(inode);
3108                         return -ENOMEM;
3109                 }
3110                 inode->i_op = &shmem_short_symlink_operations;
3111         } else {
3112                 inode_nohighmem(inode);
3113                 error = shmem_getpage(inode, 0, &page, SGP_WRITE);
3114                 if (error) {
3115                         iput(inode);
3116                         return error;
3117                 }
3118                 inode->i_mapping->a_ops = &shmem_aops;
3119                 inode->i_op = &shmem_symlink_inode_operations;
3120                 memcpy(page_address(page), symname, len);
3121                 SetPageUptodate(page);
3122                 set_page_dirty(page);
3123                 unlock_page(page);
3124                 put_page(page);
3125         }
3126         dir->i_size += BOGO_DIRENT_SIZE;
3127         dir->i_ctime = dir->i_mtime = current_time(dir);
3128         d_instantiate(dentry, inode);
3129         dget(dentry);
3130         return 0;
3131 }
3132
/*
 * Delayed-call callback registered by shmem_get_link(): @arg is the page
 * holding the link body.  Marks it accessed and drops the reference.
 */
static void shmem_put_link(void *arg)
{
        struct page *link_page = arg;

        mark_page_accessed(link_page);
        put_page(link_page);
}
3138
3139 static const char *shmem_get_link(struct dentry *dentry,
3140                                   struct inode *inode,
3141                                   struct delayed_call *done)
3142 {
3143         struct page *page = NULL;
3144         int error;
3145         if (!dentry) {
3146                 page = find_get_page(inode->i_mapping, 0);
3147                 if (!page)
3148                         return ERR_PTR(-ECHILD);
3149                 if (!PageUptodate(page)) {
3150                         put_page(page);
3151                         return ERR_PTR(-ECHILD);
3152                 }
3153         } else {
3154                 error = shmem_getpage(inode, 0, &page, SGP_READ);
3155                 if (error)
3156                         return ERR_PTR(error);
3157                 unlock_page(page);
3158         }
3159         set_delayed_call(done, shmem_put_link, page);
3160         return page_address(page);
3161 }
3162
3163 #ifdef CONFIG_TMPFS_XATTR
3164 /*
3165  * Superblocks without xattr inode operations may get some security.* xattr
3166  * support from the LSM "for free". As soon as we have any other xattrs
3167  * like ACLs, we also need to implement the security.* handlers at
3168  * filesystem level, though.
3169  */
3170
3171 /*
3172  * Callback for security_inode_init_security() for acquiring xattrs.
3173  */
3174 static int shmem_initxattrs(struct inode *inode,
3175                             const struct xattr *xattr_array,
3176                             void *fs_info)
3177 {
3178         struct shmem_inode_info *info = SHMEM_I(inode);
3179         const struct xattr *xattr;
3180         struct simple_xattr *new_xattr;
3181         size_t len;
3182
3183         for (xattr = xattr_array; xattr->name != NULL; xattr++) {
3184                 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
3185                 if (!new_xattr)
3186                         return -ENOMEM;
3187
3188                 len = strlen(xattr->name) + 1;
3189                 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
3190                                           GFP_KERNEL);
3191                 if (!new_xattr->name) {
3192                         kfree(new_xattr);
3193                         return -ENOMEM;
3194                 }
3195
3196                 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
3197                        XATTR_SECURITY_PREFIX_LEN);
3198                 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
3199                        xattr->name, len);
3200
3201                 simple_xattr_list_add(&info->xattrs, new_xattr);
3202         }
3203
3204         return 0;
3205 }
3206
3207 static int shmem_xattr_handler_get(const struct xattr_handler *handler,
3208                                    struct dentry *unused, struct inode *inode,
3209                                    const char *name, void *buffer, size_t size)
3210 {
3211         struct shmem_inode_info *info = SHMEM_I(inode);
3212
3213         name = xattr_full_name(handler, name);
3214         return simple_xattr_get(&info->xattrs, name, buffer, size);
3215 }
3216
3217 static int shmem_xattr_handler_set(const struct xattr_handler *handler,
3218                                    struct dentry *unused, struct inode *inode,
3219                                    const char *name, const void *value,
3220                                    size_t size, int flags)
3221 {
3222         struct shmem_inode_info *info = SHMEM_I(inode);
3223
3224         name = xattr_full_name(handler, name);
3225         return simple_xattr_set(&info->xattrs, name, value, size, flags);
3226 }
3227
3228 static const struct xattr_handler shmem_security_xattr_handler = {
3229         .prefix = XATTR_SECURITY_PREFIX,
3230         .get = shmem_xattr_handler_get,
3231         .set = shmem_xattr_handler_set,
3232 };
3233
3234 static const struct xattr_handler shmem_trusted_xattr_handler = {
3235         .prefix = XATTR_TRUSTED_PREFIX,
3236         .get = shmem_xattr_handler_get,
3237         .set = shmem_xattr_handler_set,
3238 };
3239
3240 static const struct xattr_handler *shmem_xattr_handlers[] = {
3241 #ifdef CONFIG_TMPFS_POSIX_ACL
3242         &posix_acl_access_xattr_handler,
3243         &posix_acl_default_xattr_handler,
3244 #endif
3245         &shmem_security_xattr_handler,
3246         &shmem_trusted_xattr_handler,
3247         NULL
3248 };
3249
3250 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
3251 {
3252         struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3253         return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
3254 }
3255 #endif /* CONFIG_TMPFS_XATTR */
3256
3257 static const struct inode_operations shmem_short_symlink_operations = {
3258         .get_link       = simple_get_link,
3259 #ifdef CONFIG_TMPFS_XATTR
3260         .listxattr      = shmem_listxattr,
3261 #endif
3262 };
3263
3264 static const struct inode_operations shmem_symlink_inode_operations = {
3265         .get_link       = shmem_get_link,
3266 #ifdef CONFIG_TMPFS_XATTR
3267         .listxattr      = shmem_listxattr,
3268 #endif
3269 };
3270
3271 static struct dentry *shmem_get_parent(struct dentry *child)
3272 {
3273         return ERR_PTR(-ESTALE);
3274 }
3275
3276 static int shmem_match(struct inode *ino, void *vfh)
3277 {
3278         __u32 *fh = vfh;
3279         __u64 inum = fh[2];
3280         inum = (inum << 32) | fh[1];
3281         return ino->i_ino == inum && fh[0] == ino->i_generation;
3282 }
3283
/*
 * Return a dentry alias of @inode: whatever d_find_alias() yields if that
 * is non-NULL, otherwise the result of d_find_any_alias().
 */
static struct dentry *shmem_find_alias(struct inode *inode)
{
        struct dentry *preferred = d_find_alias(inode);

        if (preferred)
                return preferred;
        return d_find_any_alias(inode);
}
3291
3292
3293 static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
3294                 struct fid *fid, int fh_len, int fh_type)
3295 {
3296         struct inode *inode;
3297         struct dentry *dentry = NULL;
3298         u64 inum;
3299
3300         if (fh_len < 3)
3301                 return NULL;
3302
3303         inum = fid->raw[2];
3304         inum = (inum << 32) | fid->raw[1];
3305
3306         inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
3307                         shmem_match, fid->raw);
3308         if (inode) {
3309                 dentry = shmem_find_alias(inode);
3310                 iput(inode);
3311         }
3312
3313         return dentry;
3314 }
3315
3316 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
3317                                 struct inode *parent)
3318 {
3319         if (*len < 3) {
3320                 *len = 3;
3321                 return FILEID_INVALID;
3322         }
3323
3324         if (inode_unhashed(inode)) {
3325                 /* Unfortunately insert_inode_hash is not idempotent,
3326                  * so as we hash inodes here rather than at creation
3327                  * time, we need a lock to ensure we only try
3328                  * to do it once
3329                  */
3330                 static DEFINE_SPINLOCK(lock);
3331                 spin_lock(&lock);
3332                 if (inode_unhashed(inode))
3333                         __insert_inode_hash(inode,
3334                                             inode->i_ino + inode->i_generation);
3335                 spin_unlock(&lock);
3336         }
3337
3338         fh[0] = inode->i_generation;
3339         fh[1] = inode->i_ino;
3340         fh[2] = ((__u64)inode->i_ino) >> 32;
3341
3342         *len = 3;
3343         return 1;
3344 }
3345
3346 static const struct export_operations shmem_export_ops = {
3347         .get_parent     = shmem_get_parent,
3348         .encode_fh      = shmem_encode_fh,
3349         .fh_to_dentry   = shmem_fh_to_dentry,
3350 };
3351
/*
 * Parse a tmpfs mount-option string into @sbinfo.
 *
 * @options: comma-separated "name=value" list; it is modified in place
 *           (separators are overwritten with NUL).  May be NULL.
 * @sbinfo:  receives max_blocks, max_inodes, mode, uid, gid, huge and mpol.
 * @remount: when true, the "mode", "uid" and "gid" options are skipped.
 *
 * Returns 0 on success and 1 on any error (after logging via pr_err()).
 * On success sbinfo->mpol is overwritten with the policy parsed from the
 * last "mpol" option, or with NULL if none was given.  On failure the
 * locally held policy reference is dropped, but other @sbinfo fields may
 * already have been updated.
 */
static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
                               bool remount)
{
        char *this_char, *value, *rest;
        struct mempolicy *mpol = NULL;
        uid_t uid;
        gid_t gid;

        while (options != NULL) {
                this_char = options;
                for (;;) {
                        /*
                         * NUL-terminate this option: unfortunately,
                         * mount options form a comma-separated list,
                         * but mpol's nodelist may also contain commas.
                         */
                        options = strchr(options, ',');
                        if (options == NULL)
                                break;
                        options++;
                        /* A comma followed by a digit stays part of this option. */
                        if (!isdigit(*options)) {
                                options[-1] = '\0';
                                break;
                        }
                }
                /* Skip empty options (e.g. two consecutive commas). */
                if (!*this_char)
                        continue;
                /* Split "name=value"; an option without '=' is an error. */
                if ((value = strchr(this_char,'=')) != NULL) {
                        *value++ = 0;
                } else {
                        pr_err("tmpfs: No value for mount option '%s'\n",
                               this_char);
                        goto error;
                }

                if (!strcmp(this_char,"size")) {
                        unsigned long long size;
                        size = memparse(value,&rest);
                        /* "N%": N percent of total RAM, expressed in bytes. */
                        if (*rest == '%') {
                                size <<= PAGE_SHIFT;
                                size *= totalram_pages();
                                do_div(size, 100);
                                rest++;
                        }
                        if (*rest)
                                goto bad_val;
                        /* The limit is stored in pages, rounded up. */
                        sbinfo->max_blocks =
                                DIV_ROUND_UP(size, PAGE_SIZE);
                } else if (!strcmp(this_char,"nr_blocks")) {
                        sbinfo->max_blocks = memparse(value, &rest);
                        if (*rest)
                                goto bad_val;
                } else if (!strcmp(this_char,"nr_inodes")) {
                        sbinfo->max_inodes = memparse(value, &rest);
                        if (*rest)
                                goto bad_val;
                } else if (!strcmp(this_char,"mode")) {
                        if (remount)
                                continue;
                        /* Octal; only the low 12 permission bits are kept. */
                        sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
                        if (*rest)
                                goto bad_val;
                } else if (!strcmp(this_char,"uid")) {
                        if (remount)
                                continue;
                        uid = simple_strtoul(value, &rest, 0);
                        if (*rest)
                                goto bad_val;
                        /* The id is interpreted in the caller's user namespace. */
                        sbinfo->uid = make_kuid(current_user_ns(), uid);
                        if (!uid_valid(sbinfo->uid))
                                goto bad_val;
                } else if (!strcmp(this_char,"gid")) {
                        if (remount)
                                continue;
                        gid = simple_strtoul(value, &rest, 0);
                        if (*rest)
                                goto bad_val;
                        sbinfo->gid = make_kgid(current_user_ns(), gid);
                        if (!gid_valid(sbinfo->gid))
                                goto bad_val;
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
                } else if (!strcmp(this_char, "huge")) {
                        int huge;
                        huge = shmem_parse_huge(value);
                        if (huge < 0)
                                goto bad_val;
                        /* Without THP support only SHMEM_HUGE_NEVER is accepted. */
                        if (!has_transparent_hugepage() &&
                                        huge != SHMEM_HUGE_NEVER)
                                goto bad_val;
                        sbinfo->huge = huge;
#endif
#ifdef CONFIG_NUMA
                } else if (!strcmp(this_char,"mpol")) {
                        /* A later mpol= replaces an earlier one: drop the old ref. */
                        mpol_put(mpol);
                        mpol = NULL;
                        if (mpol_parse_str(value, &mpol))
                                goto bad_val;
#endif
                } else {
                        pr_err("tmpfs: Bad mount option %s\n", this_char);
                        goto error;
                }
        }
        /* Hand the policy reference (possibly NULL) over to @sbinfo. */
        sbinfo->mpol = mpol;
        return 0;

bad_val:
        pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
               value, this_char);
error:
        mpol_put(mpol);
        return 1;

}
3466
3467 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
3468 {
3469         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3470         struct shmem_sb_info config = *sbinfo;
3471         unsigned long inodes;
3472         int error = -EINVAL;
3473
3474         config.mpol = NULL;
3475         if (shmem_parse_options(data, &config, true))
3476                 return error;
3477
3478         spin_lock(&sbinfo->stat_lock);
3479         inodes = sbinfo->max_inodes - sbinfo->free_inodes;
3480         if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
3481                 goto out;
3482         if (config.max_inodes < inodes)
3483                 goto out;
3484         /*
3485          * Those tests disallow limited->unlimited while any are in use;
3486          * but we must separately disallow unlimited->limited, because
3487          * in that case we have no record of how much is already in use.
3488          */
3489         if (config.max_blocks && !sbinfo->max_blocks)
3490                 goto out;
3491         if (config.max_inodes && !sbinfo->max_inodes)
3492                 goto out;
3493
3494         error = 0;
3495         sbinfo->huge = config.huge;
3496         sbinfo->max_blocks  = config.max_blocks;
3497         sbinfo->max_inodes  = config.max_inodes;
3498         sbinfo->free_inodes = config.max_inodes - inodes;
3499
3500         /*
3501          * Preserve previous mempolicy unless mpol remount option was specified.
3502          */
3503         if (config.mpol) {
3504                 mpol_put(sbinfo->mpol);
3505                 sbinfo->mpol = config.mpol;     /* transfers initial ref */
3506         }
3507 out:
3508         spin_unlock(&sbinfo->stat_lock);
3509         return error;
3510 }
3511
3512 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
3513 {
3514         struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
3515
3516         if (sbinfo->max_blocks != shmem_default_max_blocks())
3517                 seq_printf(seq, ",size=%luk",
3518                         sbinfo->max_blocks << (PAGE_SHIFT - 10));
3519         if (sbinfo->max_inodes != shmem_default_max_inodes())
3520                 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
3521         if (sbinfo->mode != (0777 | S_ISVTX))
3522                 seq_printf(seq, ",mode=%03ho", sbinfo->mode);
3523         if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
3524                 seq_printf(seq, ",uid=%u",
3525                                 from_kuid_munged(&init_user_ns, sbinfo->uid));
3526         if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
3527                 seq_printf(seq, ",gid=%u",
3528                                 from_kgid_munged(&init_user_ns, sbinfo->gid));
3529 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3530         /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
3531         if (sbinfo->huge)
3532                 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
3533 #endif
3534         shmem_show_mpol(seq, sbinfo->mpol);
3535         return 0;
3536 }
3537
3538 #endif /* CONFIG_TMPFS */
3539
3540 static void shmem_put_super(struct super_block *sb)
3541 {
3542         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3543
3544         percpu_counter_destroy(&sbinfo->used_blocks);
3545         mpol_put(sbinfo->mpol);
3546         kfree(sbinfo);
3547         sb->s_fs_info = NULL;
3548 }
3549
/*
 * Fill in a freshly allocated tmpfs superblock (passed to mount_nodev()
 * by shmem_mount()).
 *
 * @sb:     superblock to initialise
 * @data:   mount-option string handed to shmem_parse_options()
 * @silent: not used here
 *
 * Allocates the shmem_sb_info, applies defaults and mount options, sets
 * up the superblock fields and creates the root directory.  Returns 0 on
 * success, -EINVAL when the options are rejected and -ENOMEM for every
 * other failure; on failure the sb_info is released via shmem_put_super().
 */
static int shmem_fill_super(struct super_block *sb, void *data, int silent)
{
        struct inode *inode;
        struct shmem_sb_info *sbinfo;
        int err = -ENOMEM;

        /* Round up to L1_CACHE_BYTES to resist false sharing */
        sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
                                L1_CACHE_BYTES), GFP_KERNEL);
        if (!sbinfo)
                return -ENOMEM;

        /* Defaults: mode 01777, owned by the mounting task's fsuid/fsgid. */
        sbinfo->mode = 0777 | S_ISVTX;
        sbinfo->uid = current_fsuid();
        sbinfo->gid = current_fsgid();
        /* Set before any "goto failed": shmem_put_super() reads s_fs_info. */
        sb->s_fs_info = sbinfo;

#ifdef CONFIG_TMPFS
        /*
         * Per default we only allow half of the physical ram per
         * tmpfs instance, limiting inodes to one per page of lowmem;
         * but the internal instance is left unlimited.
         */
        if (!(sb->s_flags & SB_KERNMOUNT)) {
                sbinfo->max_blocks = shmem_default_max_blocks();
                sbinfo->max_inodes = shmem_default_max_inodes();
                if (shmem_parse_options(data, sbinfo, false)) {
                        err = -EINVAL;
                        goto failed;
                }
        } else {
                sb->s_flags |= SB_NOUSER;
        }
        sb->s_export_op = &shmem_export_ops;
        sb->s_flags |= SB_NOSEC;
#else
        sb->s_flags |= SB_NOUSER;
#endif

        spin_lock_init(&sbinfo->stat_lock);
        if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
                goto failed;
        /* No inodes are in use yet, so the whole quota is free. */
        sbinfo->free_inodes = sbinfo->max_inodes;
        spin_lock_init(&sbinfo->shrinklist_lock);
        INIT_LIST_HEAD(&sbinfo->shrinklist);

        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize = PAGE_SIZE;
        sb->s_blocksize_bits = PAGE_SHIFT;
        sb->s_magic = TMPFS_MAGIC;
        sb->s_op = &shmem_ops;
        sb->s_time_gran = 1;
#ifdef CONFIG_TMPFS_XATTR
        sb->s_xattr = shmem_xattr_handlers;
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
        sb->s_flags |= SB_POSIXACL;
#endif
        uuid_gen(&sb->s_uuid);

        /* Create the root directory with the configured mode and owner. */
        inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
        if (!inode)
                goto failed;
        inode->i_uid = sbinfo->uid;
        inode->i_gid = sbinfo->gid;
        sb->s_root = d_make_root(inode);
        if (!sb->s_root)
                goto failed;
        return 0;

failed:
        shmem_put_super(sb);
        return err;
}
3624
3625 static struct kmem_cache *shmem_inode_cachep;
3626
3627 static struct inode *shmem_alloc_inode(struct super_block *sb)
3628 {
3629         struct shmem_inode_info *info;
3630         info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
3631         if (!info)
3632                 return NULL;
3633         return &info->vfs_inode;
3634 }
3635
3636 static void shmem_free_in_core_inode(struct inode *inode)
3637 {
3638         if (S_ISLNK(inode->i_mode))
3639                 kfree(inode->i_link);
3640         kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
3641 }
3642
3643 static void shmem_destroy_inode(struct inode *inode)
3644 {
3645         if (S_ISREG(inode->i_mode))
3646                 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
3647 }
3648
3649 static void shmem_init_inode(void *foo)
3650 {
3651         struct shmem_inode_info *info = foo;
3652         inode_init_once(&info->vfs_inode);
3653 }
3654
3655 static void shmem_init_inodecache(void)
3656 {
3657         shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
3658                                 sizeof(struct shmem_inode_info),
3659                                 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
3660 }
3661
3662 static void shmem_destroy_inodecache(void)
3663 {
3664         kmem_cache_destroy(shmem_inode_cachep);
3665 }
3666
3667 static const struct address_space_operations shmem_aops = {
3668         .writepage      = shmem_writepage,
3669         .set_page_dirty = __set_page_dirty_no_writeback,
3670 #ifdef CONFIG_TMPFS
3671         .write_begin    = shmem_write_begin,
3672         .write_end      = shmem_write_end,
3673 #endif
3674 #ifdef CONFIG_MIGRATION
3675         .migratepage    = migrate_page,
3676 #endif
3677         .error_remove_page = generic_error_remove_page,
3678 };
3679
3680 static const struct file_operations shmem_file_operations = {
3681         .mmap           = shmem_mmap,
3682         .get_unmapped_area = shmem_get_unmapped_area,
3683 #ifdef CONFIG_TMPFS
3684         .llseek         = shmem_file_llseek,
3685         .read_iter      = shmem_file_read_iter,
3686         .write_iter     = generic_file_write_iter,
3687         .fsync          = noop_fsync,
3688         .splice_read    = generic_file_splice_read,
3689         .splice_write   = iter_file_splice_write,
3690         .fallocate      = shmem_fallocate,
3691 #endif
3692 };
3693
3694 static const struct inode_operations shmem_inode_operations = {
3695         .getattr        = shmem_getattr,
3696         .setattr        = shmem_setattr,
3697 #ifdef CONFIG_TMPFS_XATTR
3698         .listxattr      = shmem_listxattr,
3699         .set_acl        = simple_set_acl,
3700 #endif
3701 };
3702
3703 static const struct inode_operations shmem_dir_inode_operations = {
3704 #ifdef CONFIG_TMPFS
3705         .create         = shmem_create,
3706         .lookup         = simple_lookup,
3707         .link           = shmem_link,
3708         .unlink         = shmem_unlink,
3709         .symlink        = shmem_symlink,
3710         .mkdir          = shmem_mkdir,
3711         .rmdir          = shmem_rmdir,
3712         .mknod          = shmem_mknod,
3713         .rename         = shmem_rename2,
3714         .tmpfile        = shmem_tmpfile,
3715 #endif
3716 #ifdef CONFIG_TMPFS_XATTR
3717         .listxattr      = shmem_listxattr,
3718 #endif
3719 #ifdef CONFIG_TMPFS_POSIX_ACL
3720         .setattr        = shmem_setattr,
3721         .set_acl        = simple_set_acl,
3722 #endif
3723 };
3724
3725 static const struct inode_operations shmem_special_inode_operations = {
3726 #ifdef CONFIG_TMPFS_XATTR
3727         .listxattr      = shmem_listxattr,
3728 #endif
3729 #ifdef CONFIG_TMPFS_POSIX_ACL
3730         .setattr        = shmem_setattr,
3731         .set_acl        = simple_set_acl,
3732 #endif
3733 };
3734
3735 static const struct super_operations shmem_ops = {
3736         .alloc_inode    = shmem_alloc_inode,
3737         .free_inode     = shmem_free_in_core_inode,
3738         .destroy_inode  = shmem_destroy_inode,
3739 #ifdef CONFIG_TMPFS
3740         .statfs         = shmem_statfs,
3741         .remount_fs     = shmem_remount_fs,
3742         .show_options   = shmem_show_options,
3743 #endif
3744         .evict_inode    = shmem_evict_inode,
3745         .drop_inode     = generic_delete_inode,
3746         .put_super      = shmem_put_super,
3747 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3748         .nr_cached_objects      = shmem_unused_huge_count,
3749         .free_cached_objects    = shmem_unused_huge_scan,
3750 #endif
3751 };
3752
3753 static const struct vm_operations_struct shmem_vm_ops = {
3754         .fault          = shmem_fault,
3755         .map_pages      = filemap_map_pages,
3756 #ifdef CONFIG_NUMA
3757         .set_policy     = shmem_set_policy,
3758         .get_policy     = shmem_get_policy,
3759 #endif
3760 };
3761
3762 struct dentry *shmem_mount(struct file_system_type *fs_type,
3763         int flags, const char *dev_name, void *data)
3764 {
3765         return mount_nodev(fs_type, flags, data, shmem_fill_super);
3766 }
3767
3768 static struct file_system_type shmem_fs_type = {
3769         .owner          = THIS_MODULE,
3770         .name           = "tmpfs",
3771         .mount          = shmem_mount,
3772         .kill_sb        = kill_litter_super,
3773         .fs_flags       = FS_USERNS_MOUNT,
3774 };
3775
3776 int __init shmem_init(void)
3777 {
3778         int error;
3779
3780         shmem_init_inodecache();
3781
3782         error = register_filesystem(&shmem_fs_type);
3783         if (error) {
3784                 pr_err("Could not register tmpfs\n");
3785                 goto out2;
3786         }
3787
3788         shm_mnt = kern_mount(&shmem_fs_type);
3789         if (IS_ERR(shm_mnt)) {
3790                 error = PTR_ERR(shm_mnt);
3791                 pr_err("Could not kern_mount tmpfs\n");
3792                 goto out1;
3793         }
3794
3795 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3796         if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
3797                 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
3798         else
3799                 shmem_huge = 0; /* just in case it was patched */
3800 #endif
3801         return 0;
3802
3803 out1:
3804         unregister_filesystem(&shmem_fs_type);
3805 out2:
3806         shmem_destroy_inodecache();
3807         shm_mnt = ERR_PTR(error);
3808         return error;
3809 }
3810
3811 #if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
3812 static ssize_t shmem_enabled_show(struct kobject *kobj,
3813                 struct kobj_attribute *attr, char *buf)
3814 {
3815         int values[] = {
3816                 SHMEM_HUGE_ALWAYS,
3817                 SHMEM_HUGE_WITHIN_SIZE,
3818                 SHMEM_HUGE_ADVISE,
3819                 SHMEM_HUGE_NEVER,
3820                 SHMEM_HUGE_DENY,
3821                 SHMEM_HUGE_FORCE,
3822         };
3823         int i, count;
3824
3825         for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) {
3826                 const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s ";
3827
3828                 count += sprintf(buf + count, fmt,
3829                                 shmem_format_huge(values[i]));
3830         }
3831         buf[count - 1] = '\n';
3832         return count;
3833 }
3834
3835 static ssize_t shmem_enabled_store(struct kobject *kobj,
3836                 struct kobj_attribute *attr, const char *buf, size_t count)
3837 {
3838         char tmp[16];
3839         int huge;
3840
3841         if (count + 1 > sizeof(tmp))
3842                 return -EINVAL;
3843         memcpy(tmp, buf, count);
3844         tmp[count] = '\0';
3845         if (count && tmp[count - 1] == '\n')
3846                 tmp[count - 1] = '\0';
3847
3848         huge = shmem_parse_huge(tmp);
3849         if (huge == -EINVAL)
3850                 return -EINVAL;
3851         if (!has_transparent_hugepage() &&
3852                         huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
3853                 return -EINVAL;
3854
3855         shmem_huge = huge;
3856         if (shmem_huge > SHMEM_HUGE_DENY)
3857                 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
3858         return count;
3859 }
3860
3861 struct kobj_attribute shmem_enabled_attr =
3862         __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
3863 #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
3864
3865 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3866 bool shmem_huge_enabled(struct vm_area_struct *vma)
3867 {
3868         struct inode *inode = file_inode(vma->vm_file);
3869         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
3870         loff_t i_size;
3871         pgoff_t off;
3872
3873         if ((vma->vm_flags & VM_NOHUGEPAGE) ||
3874             test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
3875                 return false;
3876         if (shmem_huge == SHMEM_HUGE_FORCE)
3877                 return true;
3878         if (shmem_huge == SHMEM_HUGE_DENY)
3879                 return false;
3880         switch (sbinfo->huge) {
3881                 case SHMEM_HUGE_NEVER:
3882                         return false;
3883                 case SHMEM_HUGE_ALWAYS:
3884                         return true;
3885                 case SHMEM_HUGE_WITHIN_SIZE:
3886                         off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
3887                         i_size = round_up(i_size_read(inode), PAGE_SIZE);
3888                         if (i_size >= HPAGE_PMD_SIZE &&
3889                                         i_size >> PAGE_SHIFT >= off)
3890                                 return true;
3891                         /* fall through */
3892                 case SHMEM_HUGE_ADVISE:
3893                         /* TODO: implement fadvise() hints */
3894                         return (vma->vm_flags & VM_HUGEPAGE);
3895                 default:
3896                         VM_BUG_ON(1);
3897                         return false;
3898         }
3899 }
3900 #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
3901
3902 #else /* !CONFIG_SHMEM */
3903
3904 /*
3905  * tiny-shmem: simple shmemfs and tmpfs using ramfs code
3906  *
3907  * This is intended for small system where the benefits of the full
3908  * shmem code (swap-backed and resource-limited) are outweighed by
3909  * their complexity. On systems without swap this code should be
3910  * effectively equivalent, but much lighter weight.
3911  */
3912
3913 static struct file_system_type shmem_fs_type = {
3914         .name           = "tmpfs",
3915         .mount          = ramfs_mount,
3916         .kill_sb        = kill_litter_super,
3917         .fs_flags       = FS_USERNS_MOUNT,
3918 };
3919
3920 int __init shmem_init(void)
3921 {
3922         BUG_ON(register_filesystem(&shmem_fs_type) != 0);
3923
3924         shm_mnt = kern_mount(&shmem_fs_type);
3925         BUG_ON(IS_ERR(shm_mnt));
3926
3927         return 0;
3928 }
3929
3930 int shmem_unuse(unsigned int type, bool frontswap,
3931                 unsigned long *fs_pages_to_unuse)
3932 {
3933         return 0;
3934 }
3935
3936 int shmem_lock(struct file *file, int lock, struct user_struct *user)
3937 {
3938         return 0;
3939 }
3940
3941 void shmem_unlock_mapping(struct address_space *mapping)
3942 {
3943 }
3944
3945 #ifdef CONFIG_MMU
3946 unsigned long shmem_get_unmapped_area(struct file *file,
3947                                       unsigned long addr, unsigned long len,
3948                                       unsigned long pgoff, unsigned long flags)
3949 {
3950         return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
3951 }
3952 #endif
3953
3954 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
3955 {
3956         truncate_inode_pages_range(inode->i_mapping, lstart, lend);
3957 }
3958 EXPORT_SYMBOL_GPL(shmem_truncate_range);
3959
/*
 * tiny-shmem: map the names used by the common code below onto their
 * generic/ramfs counterparts; size accounting becomes a no-op.
 */
#define shmem_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
3965
3966 #endif /* CONFIG_SHMEM */
3967
3968 /* common code */
3969
3970 static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
3971                                        unsigned long flags, unsigned int i_flags)
3972 {
3973         struct inode *inode;
3974         struct file *res;
3975
3976         if (IS_ERR(mnt))
3977                 return ERR_CAST(mnt);
3978
3979         if (size < 0 || size > MAX_LFS_FILESIZE)
3980                 return ERR_PTR(-EINVAL);
3981
3982         if (shmem_acct_size(flags, size))
3983                 return ERR_PTR(-ENOMEM);
3984
3985         inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
3986                                 flags);
3987         if (unlikely(!inode)) {
3988                 shmem_unacct_size(flags, size);
3989                 return ERR_PTR(-ENOSPC);
3990         }
3991         inode->i_flags |= i_flags;
3992         inode->i_size = size;
3993         clear_nlink(inode);     /* It is unlinked */
3994         res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
3995         if (!IS_ERR(res))
3996                 res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
3997                                 &shmem_file_operations);
3998         if (IS_ERR(res))
3999                 iput(inode);
4000         return res;
4001 }
4002
4003 /**
4004  * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
4005  *      kernel internal.  There will be NO LSM permission checks against the
4006  *      underlying inode.  So users of this interface must do LSM checks at a
4007  *      higher layer.  The users are the big_key and shm implementations.  LSM
4008  *      checks are provided at the key or shm level rather than the inode.
4009  * @name: name for dentry (to be seen in /proc/<pid>/maps
4010  * @size: size to be set for the file
4011  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4012  */
4013 struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
4014 {
4015         return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
4016 }
4017
4018 /**
4019  * shmem_file_setup - get an unlinked file living in tmpfs
4020  * @name: name for dentry (to be seen in /proc/<pid>/maps
4021  * @size: size to be set for the file
4022  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4023  */
4024 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
4025 {
4026         return __shmem_file_setup(shm_mnt, name, size, flags, 0);
4027 }
4028 EXPORT_SYMBOL_GPL(shmem_file_setup);
4029
4030 /**
4031  * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
4032  * @mnt: the tmpfs mount where the file will be created
4033  * @name: name for dentry (to be seen in /proc/<pid>/maps
4034  * @size: size to be set for the file
4035  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4036  */
4037 struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
4038                                        loff_t size, unsigned long flags)
4039 {
4040         return __shmem_file_setup(mnt, name, size, flags, 0);
4041 }
4042 EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
4043
4044 /**
4045  * shmem_zero_setup - setup a shared anonymous mapping
4046  * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
4047  */
4048 int shmem_zero_setup(struct vm_area_struct *vma)
4049 {
4050         struct file *file;
4051         loff_t size = vma->vm_end - vma->vm_start;
4052
4053         /*
4054          * Cloning a new file under mmap_sem leads to a lock ordering conflict
4055          * between XFS directory reading and selinux: since this file is only
4056          * accessible to the user through its mapping, use S_PRIVATE flag to
4057          * bypass file security, in the same way as shmem_kernel_file_setup().
4058          */
4059         file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
4060         if (IS_ERR(file))
4061                 return PTR_ERR(file);
4062
4063         if (vma->vm_file)
4064                 fput(vma->vm_file);
4065         vma->vm_file = file;
4066         vma->vm_ops = &shmem_vm_ops;
4067
4068         if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
4069                         ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
4070                         (vma->vm_end & HPAGE_PMD_MASK)) {
4071                 khugepaged_enter(vma, vma->vm_flags);
4072         }
4073
4074         return 0;
4075 }
4076
4077 /**
4078  * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
4079  * @mapping:    the page's address_space
4080  * @index:      the page index
4081  * @gfp:        the page allocator flags to use if allocating
4082  *
4083  * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
4084  * with any new page allocations done using the specified allocation flags.
4085  * But read_cache_page_gfp() uses the ->readpage() method: which does not
4086  * suit tmpfs, since it may have pages in swapcache, and needs to find those
4087  * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
4088  *
4089  * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
4090  * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
4091  */
4092 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
4093                                          pgoff_t index, gfp_t gfp)
4094 {
4095 #ifdef CONFIG_SHMEM
4096         struct inode *inode = mapping->host;
4097         struct page *page;
4098         int error;
4099
4100         BUG_ON(mapping->a_ops != &shmem_aops);
4101         error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
4102                                   gfp, NULL, NULL, NULL);
4103         if (error)
4104                 page = ERR_PTR(error);
4105         else
4106                 unlock_page(page);
4107         return page;
4108 #else
4109         /*
4110          * The tiny !SHMEM case uses ramfs without swap
4111          */
4112         return read_cache_page_gfp(mapping, index, gfp);
4113 #endif
4114 }
4115 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);