fs/namespace.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  *  linux/fs/namespace.c
   4  *
   5  * (C) Copyright Al Viro 2000, 2001
   6  *
   7  * Based on code from fs/super.c, copyright Linus Torvalds and others.
   8  * Heavily rewritten.
   9  */
  10
  11 #include <linux/syscalls.h>
  12 #include <linux/export.h>
  13 #include <linux/capability.h>
  14 #include <linux/mnt_namespace.h>
  15 #include <linux/user_namespace.h>
  16 #include <linux/namei.h>
  17 #include <linux/security.h>
  18 #include <linux/cred.h>
  19 #include <linux/idr.h>
  20 #include <linux/init.h>         /* init_rootfs */
  21 #include <linux/fs_struct.h>    /* get_fs_root et.al. */
  22 #include <linux/fsnotify.h>     /* fsnotify_vfsmount_delete */
  23 #include <linux/file.h>
  24 #include <linux/uaccess.h>
  25 #include <linux/proc_ns.h>
  26 #include <linux/magic.h>
  27 #include <linux/memblock.h>
  28 #include <linux/proc_fs.h>
  29 #include <linux/task_work.h>
  30 #include <linux/sched/task.h>
  31 #include <uapi/linux/mount.h>
  32 #include <linux/fs_context.h>
  33 #include <linux/shmem_fs.h>
  34 #include <linux/mnt_idmapping.h>
  35
  36 #include "pnode.h"
  37 #include "internal.h"
  38
  39 /* Maximum number of mounts in a mount namespace */
  40 static unsigned int sysctl_mount_max __read_mostly = 100000;
  41
  42 static unsigned int m_hash_mask __read_mostly;
  43 static unsigned int m_hash_shift __read_mostly;
  44 static unsigned int mp_hash_mask __read_mostly;
  45 static unsigned int mp_hash_shift __read_mostly;
  46
  47 static __initdata unsigned long mhash_entries;
  48 static int __init set_mhash_entries(char *str)
  49 {
  50         if (!str)
  51                 return 0;
  52         mhash_entries = simple_strtoul(str, &str, 0);
  53         return 1;
  54 }
  55 __setup("mhash_entries=", set_mhash_entries);
  56
  57 static __initdata unsigned long mphash_entries;
  58 static int __init set_mphash_entries(char *str)
  59 {
  60         if (!str)
  61                 return 0;
  62         mphash_entries = simple_strtoul(str, &str, 0);
  63         return 1;
  64 }
  65 __setup("mphash_entries=", set_mphash_entries);
  66
  67 static u64 event;
  68 static DEFINE_IDA(mnt_id_ida);
  69 static DEFINE_IDA(mnt_group_ida);
  70
  71 static struct hlist_head *mount_hashtable __read_mostly;
  72 static struct hlist_head *mountpoint_hashtable __read_mostly;
  73 static struct kmem_cache *mnt_cache __read_mostly;
  74 static DECLARE_RWSEM(namespace_sem);
  75 static HLIST_HEAD(unmounted);   /* protected by namespace_sem */
  76 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
  77
  78 struct mnt_idmap {
  79         struct user_namespace *owner;
  80         refcount_t count;
  81 };
  82
  83 /*
  84  * Carries the initial idmapping of 0:0:4294967295 which is an identity
  85  * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is
  86  * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
  87  */
  88 struct mnt_idmap nop_mnt_idmap = {
  89         .owner  = &init_user_ns,
  90         .count  = REFCOUNT_INIT(1),
  91 };
  92 EXPORT_SYMBOL_GPL(nop_mnt_idmap);
  93
  94 struct mount_kattr {
  95         unsigned int attr_set;
  96         unsigned int attr_clr;
  97         unsigned int propagation;
  98         unsigned int lookup_flags;
  99         bool recurse;
 100         struct user_namespace *mnt_userns;
 101         struct mnt_idmap *mnt_idmap;
 102 };
 103
 104 /* /sys/fs */
 105 struct kobject *fs_kobj;
 106 EXPORT_SYMBOL_GPL(fs_kobj);
 107
 108 /*
 109  * vfsmount lock may be taken for read to prevent changes to the
 110  * vfsmount hash, ie. during mountpoint lookups or walking back
 111  * up the tree.
 112  *
 113  * It should be taken for write in all cases where the vfsmount
 114  * tree or hash is modified or when a vfsmount structure is modified.
 115  */
 116 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 117
 118 static inline void lock_mount_hash(void)
 119 {
 120         write_seqlock(&mount_lock);
 121 }
 122
 123 static inline void unlock_mount_hash(void)
 124 {
 125         write_sequnlock(&mount_lock);
 126 }
 127
 128 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
 129 {
 130         unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
 131         tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
 132         tmp = tmp + (tmp >> m_hash_shift);
 133         return &mount_hashtable[tmp & m_hash_mask];
 134 }
 135
 136 static inline struct hlist_head *mp_hash(struct dentry *dentry)
 137 {
 138         unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
 139         tmp = tmp + (tmp >> mp_hash_shift);
 140         return &mountpoint_hashtable[tmp & mp_hash_mask];
 141 }
 142
 143 static int mnt_alloc_id(struct mount *mnt)
 144 {
 145         int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
 146
 147         if (res < 0)
 148                 return res;
 149         mnt->mnt_id = res;
 150         return 0;
 151 }
 152
 153 static void mnt_free_id(struct mount *mnt)
 154 {
 155         ida_free(&mnt_id_ida, mnt->mnt_id);
 156 }
 157
 158 /*
 159  * Allocate a new peer group ID
 160  */
 161 static int mnt_alloc_group_id(struct mount *mnt)
 162 {
 163         int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
 164
 165         if (res < 0)
 166                 return res;
 167         mnt->mnt_group_id = res;
 168         return 0;
 169 }
 170
 171 /*
 172  * Release a peer group ID
 173  */
 174 void mnt_release_group_id(struct mount *mnt)
 175 {
 176         ida_free(&mnt_group_ida, mnt->mnt_group_id);
 177         mnt->mnt_group_id = 0;
 178 }
 179
 180 /*
 181  * vfsmount lock must be held for read
 182  */
 183 static inline void mnt_add_count(struct mount *mnt, int n)
 184 {
 185 #ifdef CONFIG_SMP
 186         this_cpu_add(mnt->mnt_pcp->mnt_count, n);
 187 #else
 188         preempt_disable();
 189         mnt->mnt_count += n;
 190         preempt_enable();
 191 #endif
 192 }
 193
 194 /*
 195  * vfsmount lock must be held for write
 196  */
 197 int mnt_get_count(struct mount *mnt)
 198 {
 199 #ifdef CONFIG_SMP
 200         int count = 0;
 201         int cpu;
 202
 203         for_each_possible_cpu(cpu) {
 204                 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
 205         }
 206
 207         return count;
 208 #else
 209         return mnt->mnt_count;
 210 #endif
 211 }
 212
 213 /**
 214  * mnt_idmap_owner - retrieve owner of the mount's idmapping
 215  * @idmap: mount idmapping
 216  *
 217  * This helper will go away once the conversion to use struct mnt_idmap
 218  * everywhere has finished at which point the helper will be unexported.
 219  *
 220  * Only code that needs to perform permission checks based on the owner of the
 221  * idmapping will get access to it. All other code will solely rely on
 222  * idmappings. This will get us type safety so it's impossible to conflate
 223  * filesystems idmappings with mount idmappings.
 224  *
 225  * Return: The owner of the idmapping.
 226  */
 227 struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap)
 228 {
 229         return idmap->owner;
 230 }
 231 EXPORT_SYMBOL_GPL(mnt_idmap_owner);
 232
 233 /**
 234  * mnt_user_ns - retrieve owner of an idmapped mount
 235  * @mnt: the relevant vfsmount
 236  *
 237  * This helper will go away once the conversion to use struct mnt_idmap
 238  * everywhere has finished at which point the helper will be unexported.
 239  *
 240  * Only code that needs to perform permission checks based on the owner of the
 241  * idmapping will get access to it. All other code will solely rely on
 242  * idmappings. This will get us type safety so it's impossible to conflate
 243  * filesystems idmappings with mount idmappings.
 244  *
 245  * Return: The owner of the idmapped.
 246  */
 247 struct user_namespace *mnt_user_ns(const struct vfsmount *mnt)
 248 {
 249         struct mnt_idmap *idmap = mnt_idmap(mnt);
 250
 251         /* Return the actual owner of the filesystem instead of the nop. */
 252         if (idmap == &nop_mnt_idmap &&
 253             !initial_idmapping(mnt->mnt_sb->s_user_ns))
 254                 return mnt->mnt_sb->s_user_ns;
 255         return mnt_idmap_owner(idmap);
 256 }
 257 EXPORT_SYMBOL_GPL(mnt_user_ns);
 258
 259 /**
 260  * alloc_mnt_idmap - allocate a new idmapping for the mount
 261  * @mnt_userns: owning userns of the idmapping
 262  *
 263  * Allocate a new struct mnt_idmap which carries the idmapping of the mount.
 264  *
 265  * Return: On success a new idmap, on error an error pointer is returned.
 266  */
 267 static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
 268 {
 269         struct mnt_idmap *idmap;
 270
 271         idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT);
 272         if (!idmap)
 273                 return ERR_PTR(-ENOMEM);
 274
 275         idmap->owner = get_user_ns(mnt_userns);
 276         refcount_set(&idmap->count, 1);
 277         return idmap;
 278 }
 279
 280 /**
 281  * mnt_idmap_get - get a reference to an idmapping
 282  * @idmap: the idmap to bump the reference on
 283  *
 284  * If @idmap is not the @nop_mnt_idmap bump the reference count.
 285  *
 286  * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed.
 287  */
 288 static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
 289 {
 290         if (idmap != &nop_mnt_idmap)
 291                 refcount_inc(&idmap->count);
 292
 293         return idmap;
 294 }
 295
 296 /**
 297  * mnt_idmap_put - put a reference to an idmapping
 298  * @idmap: the idmap to put the reference on
 299  *
 300  * If this is a non-initial idmapping, put the reference count when a mount is
 301  * released and free it if we're the last user.
 302  */
 303 static inline void mnt_idmap_put(struct mnt_idmap *idmap)
 304 {
 305         if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) {
 306                 put_user_ns(idmap->owner);
 307                 kfree(idmap);
 308         }
 309 }
 310
 311 static struct mount *alloc_vfsmnt(const char *name)
 312 {
 313         struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
 314         if (mnt) {
 315                 int err;
 316
 317                 err = mnt_alloc_id(mnt);
 318                 if (err)
 319                         goto out_free_cache;
 320
 321                 if (name) {
 322                         mnt->mnt_devname = kstrdup_const(name,
 323                                                          GFP_KERNEL_ACCOUNT);
 324                         if (!mnt->mnt_devname)
 325                                 goto out_free_id;
 326                 }
 327
 328 #ifdef CONFIG_SMP
 329                 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
 330                 if (!mnt->mnt_pcp)
 331                         goto out_free_devname;
 332
 333                 this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
 334 #else
 335                 mnt->mnt_count = 1;
 336                 mnt->mnt_writers = 0;
 337 #endif
 338
 339                 INIT_HLIST_NODE(&mnt->mnt_hash);
 340                 INIT_LIST_HEAD(&mnt->mnt_child);
 341                 INIT_LIST_HEAD(&mnt->mnt_mounts);
 342                 INIT_LIST_HEAD(&mnt->mnt_list);
 343                 INIT_LIST_HEAD(&mnt->mnt_expire);
 344                 INIT_LIST_HEAD(&mnt->mnt_share);
 345                 INIT_LIST_HEAD(&mnt->mnt_slave_list);
 346                 INIT_LIST_HEAD(&mnt->mnt_slave);
 347                 INIT_HLIST_NODE(&mnt->mnt_mp_list);
 348                 INIT_LIST_HEAD(&mnt->mnt_umounting);
 349                 INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
 350                 mnt->mnt.mnt_idmap = &nop_mnt_idmap;
 351         }
 352         return mnt;
 353
 354 #ifdef CONFIG_SMP
 355 out_free_devname:
 356         kfree_const(mnt->mnt_devname);
 357 #endif
 358 out_free_id:
 359         mnt_free_id(mnt);
 360 out_free_cache:
 361         kmem_cache_free(mnt_cache, mnt);
 362         return NULL;
 363 }
 364
 365 /*
 366  * Most r/o checks on a fs are for operations that take
 367  * discrete amounts of time, like a write() or unlink().
 368  * We must keep track of when those operations start
 369  * (for permission checks) and when they end, so that
 370  * we can determine when writes are able to occur to
 371  * a filesystem.
 372  */
 373 /*
 374  * __mnt_is_readonly: check whether a mount is read-only
 375  * @mnt: the mount to check for its write status
 376  *
 377  * This shouldn't be used directly ouside of the VFS.
 378  * It does not guarantee that the filesystem will stay
 379  * r/w, just that it is right *now*.  This can not and
 380  * should not be used in place of IS_RDONLY(inode).
 381  * mnt_want/drop_write() will _keep_ the filesystem
 382  * r/w.
 383  */
 384 bool __mnt_is_readonly(struct vfsmount *mnt)
 385 {
 386         return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
 387 }
 388 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 389
 390 static inline void mnt_inc_writers(struct mount *mnt)
 391 {
 392 #ifdef CONFIG_SMP
 393         this_cpu_inc(mnt->mnt_pcp->mnt_writers);
 394 #else
 395         mnt->mnt_writers++;
 396 #endif
 397 }
 398
 399 static inline void mnt_dec_writers(struct mount *mnt)
 400 {
 401 #ifdef CONFIG_SMP
 402         this_cpu_dec(mnt->mnt_pcp->mnt_writers);
 403 #else
 404         mnt->mnt_writers--;
 405 #endif
 406 }
 407
 408 static unsigned int mnt_get_writers(struct mount *mnt)
 409 {
 410 #ifdef CONFIG_SMP
 411         unsigned int count = 0;
 412         int cpu;
 413
 414         for_each_possible_cpu(cpu) {
 415                 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
 416         }
 417
 418         return count;
 419 #else
 420         return mnt->mnt_writers;
 421 #endif
 422 }
 423
 424 static int mnt_is_readonly(struct vfsmount *mnt)
 425 {
 426         if (mnt->mnt_sb->s_readonly_remount)
 427                 return 1;
 428         /* Order wrt setting s_flags/s_readonly_remount in do_remount() */
 429         smp_rmb();
 430         return __mnt_is_readonly(mnt);
 431 }
 432
 433 /*
 434  * Most r/o & frozen checks on a fs are for operations that take discrete
 435  * amounts of time, like a write() or unlink().  We must keep track of when
 436  * those operations start (for permission checks) and when they end, so that we
 437  * can determine when writes are able to occur to a filesystem.
 438  */
 439 /**
 440  * __mnt_want_write - get write access to a mount without freeze protection
 441  * @m: the mount on which to take a write
 442  *
 443  * This tells the low-level filesystem that a write is about to be performed to
 444  * it, and makes sure that writes are allowed (mnt it read-write) before
 445  * returning success. This operation does not protect against filesystem being
 446  * frozen. When the write operation is finished, __mnt_drop_write() must be
 447  * called. This is effectively a refcount.
 448  */
 449 int __mnt_want_write(struct vfsmount *m)
 450 {
 451         struct mount *mnt = real_mount(m);
 452         int ret = 0;
 453
 454         preempt_disable();
 455         mnt_inc_writers(mnt);
 456         /*
 457          * The store to mnt_inc_writers must be visible before we pass
 458          * MNT_WRITE_HOLD loop below, so that the slowpath can see our
 459          * incremented count after it has set MNT_WRITE_HOLD.
 460          */
 461         smp_mb();
 462         might_lock(&mount_lock.lock);
 463         while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
 464                 if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
 465                         cpu_relax();
 466                 } else {
 467                         /*
 468                          * This prevents priority inversion, if the task
 469                          * setting MNT_WRITE_HOLD got preempted on a remote
 470                          * CPU, and it prevents life lock if the task setting
 471                          * MNT_WRITE_HOLD has a lower priority and is bound to
 472                          * the same CPU as the task that is spinning here.
 473                          */
 474                         preempt_enable();
 475                         lock_mount_hash();
 476                         unlock_mount_hash();
 477                         preempt_disable();
 478                 }
 479         }
 480         /*
 481          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
 482          * be set to match its requirements. So we must not load that until
 483          * MNT_WRITE_HOLD is cleared.
 484          */
 485         smp_rmb();
 486         if (mnt_is_readonly(m)) {
 487                 mnt_dec_writers(mnt);
 488                 ret = -EROFS;
 489         }
 490         preempt_enable();
 491
 492         return ret;
 493 }
 494
 495 /**
 496  * mnt_want_write - get write access to a mount
 497  * @m: the mount on which to take a write
 498  *
 499  * This tells the low-level filesystem that a write is about to be performed to
 500  * it, and makes sure that writes are allowed (mount is read-write, filesystem
 501  * is not frozen) before returning success.  When the write operation is
 502  * finished, mnt_drop_write() must be called.  This is effectively a refcount.
 503  */
 504 int mnt_want_write(struct vfsmount *m)
 505 {
 506         int ret;
 507
 508         sb_start_write(m->mnt_sb);
 509         ret = __mnt_want_write(m);
 510         if (ret)
 511                 sb_end_write(m->mnt_sb);
 512         return ret;
 513 }
 514 EXPORT_SYMBOL_GPL(mnt_want_write);
 515
 516 /**
 517  * __mnt_want_write_file - get write access to a file's mount
 518  * @file: the file who's mount on which to take a write
 519  *
 520  * This is like __mnt_want_write, but if the file is already open for writing it
 521  * skips incrementing mnt_writers (since the open file already has a reference)
 522  * and instead only does the check for emergency r/o remounts.  This must be
 523  * paired with __mnt_drop_write_file.
 524  */
 525 int __mnt_want_write_file(struct file *file)
 526 {
 527         if (file->f_mode & FMODE_WRITER) {
 528                 /*
 529                  * Superblock may have become readonly while there are still
 530                  * writable fd's, e.g. due to a fs error with errors=remount-ro
 531                  */
 532                 if (__mnt_is_readonly(file->f_path.mnt))
 533                         return -EROFS;
 534                 return 0;
 535         }
 536         return __mnt_want_write(file->f_path.mnt);
 537 }
 538
 539 /**
 540  * mnt_want_write_file - get write access to a file's mount
 541  * @file: the file who's mount on which to take a write
 542  *
 543  * This is like mnt_want_write, but if the file is already open for writing it
 544  * skips incrementing mnt_writers (since the open file already has a reference)
 545  * and instead only does the freeze protection and the check for emergency r/o
 546  * remounts.  This must be paired with mnt_drop_write_file.
 547  */
 548 int mnt_want_write_file(struct file *file)
 549 {
 550         int ret;
 551
 552         sb_start_write(file_inode(file)->i_sb);
 553         ret = __mnt_want_write_file(file);
 554         if (ret)
 555                 sb_end_write(file_inode(file)->i_sb);
 556         return ret;
 557 }
 558 EXPORT_SYMBOL_GPL(mnt_want_write_file);
 559
 560 /**
 561  * __mnt_drop_write - give up write access to a mount
 562  * @mnt: the mount on which to give up write access
 563  *
 564  * Tells the low-level filesystem that we are done
 565  * performing writes to it.  Must be matched with
 566  * __mnt_want_write() call above.
 567  */
 568 void __mnt_drop_write(struct vfsmount *mnt)
 569 {
 570         preempt_disable();
 571         mnt_dec_writers(real_mount(mnt));
 572         preempt_enable();
 573 }
 574
 575 /**
 576  * mnt_drop_write - give up write access to a mount
 577  * @mnt: the mount on which to give up write access
 578  *
 579  * Tells the low-level filesystem that we are done performing writes to it and
 580  * also allows filesystem to be frozen again.  Must be matched with
 581  * mnt_want_write() call above.
 582  */
 583 void mnt_drop_write(struct vfsmount *mnt)
 584 {
 585         __mnt_drop_write(mnt);
 586         sb_end_write(mnt->mnt_sb);
 587 }
 588 EXPORT_SYMBOL_GPL(mnt_drop_write);
 589
 590 void __mnt_drop_write_file(struct file *file)
 591 {
 592         if (!(file->f_mode & FMODE_WRITER))
 593                 __mnt_drop_write(file->f_path.mnt);
 594 }
 595
 596 void mnt_drop_write_file(struct file *file)
 597 {
 598         __mnt_drop_write_file(file);
 599         sb_end_write(file_inode(file)->i_sb);
 600 }
 601 EXPORT_SYMBOL(mnt_drop_write_file);
 602
 603 /**
 604  * mnt_hold_writers - prevent write access to the given mount
 605  * @mnt: mnt to prevent write access to
 606  *
 607  * Prevents write access to @mnt if there are no active writers for @mnt.
 608  * This function needs to be called and return successfully before changing
 609  * properties of @mnt that need to remain stable for callers with write access
 610  * to @mnt.
 611  *
 612  * After this functions has been called successfully callers must pair it with
 613  * a call to mnt_unhold_writers() in order to stop preventing write access to
 614  * @mnt.
 615  *
 616  * Context: This function expects lock_mount_hash() to be held serializing
 617  *          setting MNT_WRITE_HOLD.
 618  * Return: On success 0 is returned.
 619  *         On error, -EBUSY is returned.
 620  */
 621 static inline int mnt_hold_writers(struct mount *mnt)
 622 {
 623         mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
 624         /*
 625          * After storing MNT_WRITE_HOLD, we'll read the counters. This store
 626          * should be visible before we do.
 627          */
 628         smp_mb();
 629
 630         /*
 631          * With writers on hold, if this value is zero, then there are
 632          * definitely no active writers (although held writers may subsequently
 633          * increment the count, they'll have to wait, and decrement it after
 634          * seeing MNT_READONLY).
 635          *
 636          * It is OK to have counter incremented on one CPU and decremented on
 637          * another: the sum will add up correctly. The danger would be when we
 638          * sum up each counter, if we read a counter before it is incremented,
 639          * but then read another CPU's count which it has been subsequently
 640          * decremented from -- we would see more decrements than we should.
 641          * MNT_WRITE_HOLD protects against this scenario, because
 642          * mnt_want_write first increments count, then smp_mb, then spins on
 643          * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
 644          * we're counting up here.
 645          */
 646         if (mnt_get_writers(mnt) > 0)
 647                 return -EBUSY;
 648
 649         return 0;
 650 }
 651
 652 /**
 653  * mnt_unhold_writers - stop preventing write access to the given mount
 654  * @mnt: mnt to stop preventing write access to
 655  *
 656  * Stop preventing write access to @mnt allowing callers to gain write access
 657  * to @mnt again.
 658  *
 659  * This function can only be called after a successful call to
 660  * mnt_hold_writers().
 661  *
 662  * Context: This function expects lock_mount_hash() to be held.
 663  */
 664 static inline void mnt_unhold_writers(struct mount *mnt)
 665 {
 666         /*
 667          * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
 668          * that become unheld will see MNT_READONLY.
 669          */
 670         smp_wmb();
 671         mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
 672 }
 673
 674 static int mnt_make_readonly(struct mount *mnt)
 675 {
 676         int ret;
 677
 678         ret = mnt_hold_writers(mnt);
 679         if (!ret)
 680                 mnt->mnt.mnt_flags |= MNT_READONLY;
 681         mnt_unhold_writers(mnt);
 682         return ret;
 683 }
 684
 685 int sb_prepare_remount_readonly(struct super_block *sb)
 686 {
 687         struct mount *mnt;
 688         int err = 0;
 689
 690         /* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
 691         if (atomic_long_read(&sb->s_remove_count))
 692                 return -EBUSY;
 693
 694         lock_mount_hash();
 695         list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
 696                 if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
 697                         err = mnt_hold_writers(mnt);
 698                         if (err)
 699                                 break;
 700                 }
 701         }
 702         if (!err && atomic_long_read(&sb->s_remove_count))
 703                 err = -EBUSY;
 704
 705         if (!err) {
 706                 sb->s_readonly_remount = 1;
 707                 smp_wmb();
 708         }
 709         list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
 710                 if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
 711                         mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
 712         }
 713         unlock_mount_hash();
 714
 715         return err;
 716 }
 717
 718 static void free_vfsmnt(struct mount *mnt)
 719 {
 720         mnt_idmap_put(mnt_idmap(&mnt->mnt));
 721         kfree_const(mnt->mnt_devname);
 722 #ifdef CONFIG_SMP
 723         free_percpu(mnt->mnt_pcp);
 724 #endif
 725         kmem_cache_free(mnt_cache, mnt);
 726 }
 727
 728 static void delayed_free_vfsmnt(struct rcu_head *head)
 729 {
 730         free_vfsmnt(container_of(head, struct mount, mnt_rcu));
 731 }
 732
 733 /* call under rcu_read_lock */
 734 int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
 735 {
 736         struct mount *mnt;
 737         if (read_seqretry(&mount_lock, seq))
 738                 return 1;
 739         if (bastard == NULL)
 740                 return 0;
 741         mnt = real_mount(bastard);
 742         mnt_add_count(mnt, 1);
 743         smp_mb();                       // see mntput_no_expire()
 744         if (likely(!read_seqretry(&mount_lock, seq)))
 745                 return 0;
 746         if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
 747                 mnt_add_count(mnt, -1);
 748                 return 1;
 749         }
 750         lock_mount_hash();
 751         if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
 752                 mnt_add_count(mnt, -1);
 753                 unlock_mount_hash();
 754                 return 1;
 755         }
 756         unlock_mount_hash();
 757         /* caller will mntput() */
 758         return -1;
 759 }
 760
 761 /* call under rcu_read_lock */
 762 static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
 763 {
 764         int res = __legitimize_mnt(bastard, seq);
 765         if (likely(!res))
 766                 return true;
 767         if (unlikely(res < 0)) {
 768                 rcu_read_unlock();
 769                 mntput(bastard);
 770                 rcu_read_lock();
 771         }
 772         return false;
 773 }
 774
 775 /*
 776  * find the first mount at @dentry on vfsmount @mnt.
 777  * call under rcu_read_lock()
 778  */
 779 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 780 {
 781         struct hlist_head *head = m_hash(mnt, dentry);
 782         struct mount *p;
 783
 784         hlist_for_each_entry_rcu(p, head, mnt_hash)
 785                 if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
 786                         return p;
 787         return NULL;
 788 }
 789
 790 /*
 791  * lookup_mnt - Return the first child mount mounted at path
 792  *
 793  * "First" means first mounted chronologically.  If you create the
 794  * following mounts:
 795  *
 796  * mount /dev/sda1 /mnt
 797  * mount /dev/sda2 /mnt
 798  * mount /dev/sda3 /mnt
 799  *
 800  * Then lookup_mnt() on the base /mnt dentry in the root mount will
 801  * return successively the root dentry and vfsmount of /dev/sda1, then
 802  * /dev/sda2, then /dev/sda3, then NULL.
 803  *
 804  * lookup_mnt takes a reference to the found vfsmount.
 805  */
 806 struct vfsmount *lookup_mnt(const struct path *path)
 807 {
 808         struct mount *child_mnt;
 809         struct vfsmount *m;
 810         unsigned seq;
 811
 812         rcu_read_lock();
 813         do {
 814                 seq = read_seqbegin(&mount_lock);
 815                 child_mnt = __lookup_mnt(path->mnt, path->dentry);
 816                 m = child_mnt ? &child_mnt->mnt : NULL;
 817         } while (!legitimize_mnt(m, seq));
 818         rcu_read_unlock();
 819         return m;
 820 }
 821
 822 static inline void lock_ns_list(struct mnt_namespace *ns)
 823 {
 824         spin_lock(&ns->ns_lock);
 825 }
 826
 827 static inline void unlock_ns_list(struct mnt_namespace *ns)
 828 {
 829         spin_unlock(&ns->ns_lock);
 830 }
 831
 832 static inline bool mnt_is_cursor(struct mount *mnt)
 833 {
 834         return mnt->mnt.mnt_flags & MNT_CURSOR;
 835 }
 836
 837 /*
 838  * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 839  *                         current mount namespace.
 840  *
 841  * The common case is dentries are not mountpoints at all and that
 842  * test is handled inline.  For the slow case when we are actually
 843  * dealing with a mountpoint of some kind, walk through all of the
 844  * mounts in the current mount namespace and test to see if the dentry
 845  * is a mountpoint.
 846  *
 847  * The mount_hashtable is not usable in the context because we
 848  * need to identify all mounts that may be in the current mount
 849  * namespace not just a mount that happens to have some specified
 850  * parent mount.
 851  */
 852 bool __is_local_mountpoint(struct dentry *dentry)
 853 {
 854         struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 855         struct mount *mnt;
 856         bool is_covered = false;
 857
 858         down_read(&namespace_sem);
 859         lock_ns_list(ns);
 860         list_for_each_entry(mnt, &ns->list, mnt_list) {
 861                 if (mnt_is_cursor(mnt))
 862                         continue;
 863                 is_covered = (mnt->mnt_mountpoint == dentry);
 864                 if (is_covered)
 865                         break;
 866         }
 867         unlock_ns_list(ns);
 868         up_read(&namespace_sem);
 869
 870         return is_covered;
 871 }
 872
 873 static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
 874 {
 875         struct hlist_head *chain = mp_hash(dentry);
 876         struct mountpoint *mp;
 877
 878         hlist_for_each_entry(mp, chain, m_hash) {
 879                 if (mp->m_dentry == dentry) {
 880                         mp->m_count++;
 881                         return mp;
 882                 }
 883         }
 884         return NULL;
 885 }
 886
/*
 * Find or create the struct mountpoint for @dentry.
 *
 * If a mountpoint is already hashed for @dentry it is returned with the
 * extra m_count reference taken by lookup_mountpoint().  Otherwise a new
 * one is allocated, the dentry is marked via d_set_mounted(), and the new
 * entry is initialised (m_count == 1, dget() reference on @dentry) and
 * hashed under mount_lock.
 *
 * Returns the mountpoint, or ERR_PTR(-ENOENT) if @dentry is already a
 * mountpoint but has been unlinked, ERR_PTR(-ENOMEM) if the allocation
 * fails, or ERR_PTR() of any other error reported by d_set_mounted().
 */
static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
        struct mountpoint *mp, *new = NULL;
        int ret;

        if (d_mountpoint(dentry)) {
                /* might be worth a WARN_ON() */
                if (d_unlinked(dentry))
                        return ERR_PTR(-ENOENT);
mountpoint:
                /* also reached via goto when we lose the d_set_mounted() race */
                read_seqlock_excl(&mount_lock);
                mp = lookup_mountpoint(dentry);
                read_sequnlock_excl(&mount_lock);
                if (mp)
                        goto done;
        }

        /* allocate at most once; a retry through "mountpoint" reuses it */
        if (!new)
                new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);


        /* Exactly one process may set d_mounted */
        ret = d_set_mounted(dentry);

        /* Someone else set d_mounted? */
        if (ret == -EBUSY)
                goto mountpoint;

        /* The dentry is not available as a mountpoint? */
        mp = ERR_PTR(ret);
        if (ret)
                goto done;

        /* Add the new mountpoint to the hash table */
        read_seqlock_excl(&mount_lock);
        new->m_dentry = dget(dentry);
        new->m_count = 1;
        hlist_add_head(&new->m_hash, mp_hash(dentry));
        INIT_HLIST_HEAD(&new->m_list);
        read_sequnlock_excl(&mount_lock);

        mp = new;
        /* ownership passed to the hash; keep the kfree() below a no-op */
        new = NULL;
done:
        /* frees the preallocated entry if it ended up unused */
        kfree(new);
        return mp;
}
 936
 937 /*
 938  * vfsmount lock must be held.  Additionally, the caller is responsible
 939  * for serializing calls for given disposal list.
 940  */
 941 static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
 942 {
 943         if (!--mp->m_count) {
 944                 struct dentry *dentry = mp->m_dentry;
 945                 BUG_ON(!hlist_empty(&mp->m_list));
 946                 spin_lock(&dentry->d_lock);
 947                 dentry->d_flags &= ~DCACHE_MOUNTED;
 948                 spin_unlock(&dentry->d_lock);
 949                 dput_to_list(dentry, list);
 950                 hlist_del(&mp->m_hash);
 951                 kfree(mp);
 952         }
 953 }
 954
/*
 * Drop a reference to @mp via __put_mountpoint(), using the ex_mountpoints
 * list as the disposal list for the dentry.
 *
 * called with namespace_lock and vfsmount lock
 */
static void put_mountpoint(struct mountpoint *mp)
{
        __put_mountpoint(mp, &ex_mountpoints);
}
 960
 961 static inline int check_mnt(struct mount *mnt)
 962 {
 963         return mnt->mnt_ns == current->nsproxy->mnt_ns;
 964 }
 965
 966 /*
 967  * vfsmount lock must be held for write
 968  */
 969 static void touch_mnt_namespace(struct mnt_namespace *ns)
 970 {
 971         if (ns) {
 972                 ns->event = ++event;
 973                 wake_up_interruptible(&ns->poll);
 974         }
 975 }
 976
 977 /*
 978  * vfsmount lock must be held for write
 979  */
 980 static void __touch_mnt_namespace(struct mnt_namespace *ns)
 981 {
 982         if (ns && ns->event != event) {
 983                 ns->event = event;
 984                 wake_up_interruptible(&ns->poll);
 985         }
 986 }
 987
 988 /*
 989  * vfsmount lock must be held for write
 990  */
 991 static struct mountpoint *unhash_mnt(struct mount *mnt)
 992 {
 993         struct mountpoint *mp;
 994         mnt->mnt_parent = mnt;
 995         mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 996         list_del_init(&mnt->mnt_child);
 997         hlist_del_init_rcu(&mnt->mnt_hash);
 998         hlist_del_init(&mnt->mnt_mp_list);
 999         mp = mnt->mnt_mp;
1000         mnt->mnt_mp = NULL;
1001         return mp;
1002 }
1003
/*
 * Unhash @mnt and drop the mountpoint reference it was holding.
 *
 * vfsmount lock must be held for write
 */
static void umount_mnt(struct mount *mnt)
{
        struct mountpoint *mp = unhash_mnt(mnt);

        put_mountpoint(mp);
}
1011
/*
 * Record @mnt as the parent and @mp as the mountpoint of @child_mnt.
 *
 * Takes a reference on @mp (m_count) and on @mnt, sets the child's
 * mnt_mountpoint/mnt_parent/mnt_mp, and links the child onto @mp's m_list.
 * The child is not added to the mount hash or to the parent's mnt_mounts
 * list here.
 *
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
                        struct mountpoint *mp,
                        struct mount *child_mnt)
{
        mp->m_count++;
        mnt_add_count(mnt, 1);  /* essentially, that's mntget */
        child_mnt->mnt_mountpoint = mp->m_dentry;
        child_mnt->mnt_parent = mnt;
        child_mnt->mnt_mp = mp;
        hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}
1026
1027 static void __attach_mnt(struct mount *mnt, struct mount *parent)
1028 {
1029         hlist_add_head_rcu(&mnt->mnt_hash,
1030                            m_hash(&parent->mnt, mnt->mnt_mountpoint));
1031         list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
1032 }
1033
/*
 * Attach @mnt to @parent at mountpoint @mp: first record the
 * parent/mountpoint linkage (taking the references), then hash @mnt and
 * add it to @parent's children.
 *
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct mount *mnt,
                        struct mount *parent,
                        struct mountpoint *mp)
{
        mnt_set_mountpoint(parent, mp, mnt);
        __attach_mnt(mnt, parent);
}
1044
/*
 * Move @mnt from its current parent/mountpoint to mountpoint @mp on
 * @parent.
 *
 * @mnt is unlinked from its old parent's child list, the old mountpoint's
 * mount list and the mount hash, attached at the new location, and only
 * then are the references on the old mountpoint and old parent dropped.
 *
 * NOTE(review): presumably needs the vfsmount lock held for write, like
 * attach_mnt() which it calls - confirm against callers.
 */
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
        /* remember the old linkage; attach_mnt() below overwrites it */
        struct mountpoint *old_mp = mnt->mnt_mp;
        struct mount *old_parent = mnt->mnt_parent;

        list_del_init(&mnt->mnt_child);
        hlist_del_init(&mnt->mnt_mp_list);
        hlist_del_init_rcu(&mnt->mnt_hash);

        attach_mnt(mnt, parent, mp);

        put_mountpoint(old_mp);
        mnt_add_count(old_parent, -1);
}
1059
/*
 * Make the tree headed by @mnt part of its parent's mount namespace.
 *
 * Every mount linked on @mnt's mnt_list ring gets ->mnt_ns set to the
 * parent's namespace and is moved to the tail of that namespace's list;
 * the namespace's pending_mounts are folded into mounts; @mnt is hashed
 * and added to the parent's children; finally the namespace is touched.
 * @mnt->mnt_parent must already point at the real parent (BUG otherwise).
 *
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt)
{
        struct mount *parent = mnt->mnt_parent;
        struct mount *m;
        LIST_HEAD(head);
        struct mnt_namespace *n = parent->mnt_ns;

        BUG_ON(parent == mnt);

        /* hook a temporary head into the ring so it can be walked/spliced */
        list_add_tail(&head, &mnt->mnt_list);
        list_for_each_entry(m, &head, mnt_list)
                m->mnt_ns = n;

        /* splice after the current last entry, i.e. at the tail of n->list */
        list_splice(&head, n->list.prev);

        n->mounts += n->pending_mounts;
        n->pending_mounts = 0;

        __attach_mnt(mnt, parent);
        touch_mnt_namespace(n);
}
1084
1085 static struct mount *next_mnt(struct mount *p, struct mount *root)
1086 {
1087         struct list_head *next = p->mnt_mounts.next;
1088         if (next == &p->mnt_mounts) {
1089                 while (1) {
1090                         if (p == root)
1091                                 return NULL;
1092                         next = p->mnt_child.next;
1093                         if (next != &p->mnt_parent->mnt_mounts)
1094                                 break;
1095                         p = p->mnt_parent;
1096                 }
1097         }
1098         return list_entry(next, struct mount, mnt_child);
1099 }
1100
1101 static struct mount *skip_mnt_tree(struct mount *p)
1102 {
1103         struct list_head *prev = p->mnt_mounts.prev;
1104         while (prev != &p->mnt_mounts) {
1105                 p = list_entry(prev, struct mount, mnt_child);
1106                 prev = p->mnt_mounts.prev;
1107         }
1108         return p;
1109 }
1110
1111 /**
1112  * vfs_create_mount - Create a mount for a configured superblock
1113  * @fc: The configuration context with the superblock attached
1114  *
1115  * Create a mount to an already configured superblock.  If necessary, the
1116  * caller should invoke vfs_get_tree() before calling this.
1117  *
1118  * Note that this does not attach the mount to anything.
1119  */
1120 struct vfsmount *vfs_create_mount(struct fs_context *fc)
1121 {
1122         struct mount *mnt;
1123
1124         if (!fc->root)
1125                 return ERR_PTR(-EINVAL);
1126
1127         mnt = alloc_vfsmnt(fc->source ?: "none");
1128         if (!mnt)
1129                 return ERR_PTR(-ENOMEM);
1130
1131         if (fc->sb_flags & SB_KERNMOUNT)
1132                 mnt->mnt.mnt_flags = MNT_INTERNAL;
1133
1134         atomic_inc(&fc->root->d_sb->s_active);
1135         mnt->mnt.mnt_sb         = fc->root->d_sb;
1136         mnt->mnt.mnt_root       = dget(fc->root);
1137         mnt->mnt_mountpoint     = mnt->mnt.mnt_root;
1138         mnt->mnt_parent         = mnt;
1139
1140         lock_mount_hash();
1141         list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
1142         unlock_mount_hash();
1143         return &mnt->mnt;
1144 }
1145 EXPORT_SYMBOL(vfs_create_mount);
1146
1147 struct vfsmount *fc_mount(struct fs_context *fc)
1148 {
1149         int err = vfs_get_tree(fc);
1150         if (!err) {
1151                 up_write(&fc->root->d_sb->s_umount);
1152                 return vfs_create_mount(fc);
1153         }
1154         return ERR_PTR(err);
1155 }
1156 EXPORT_SYMBOL(fc_mount);
1157
1158 struct vfsmount *vfs_kern_mount(struct file_system_type *type,
1159                                 int flags, const char *name,
1160                                 void *data)
1161 {
1162         struct fs_context *fc;
1163         struct vfsmount *mnt;
1164         int ret = 0;
1165
1166         if (!type)
1167                 return ERR_PTR(-EINVAL);
1168
1169         fc = fs_context_for_mount(type, flags);
1170         if (IS_ERR(fc))
1171                 return ERR_CAST(fc);
1172
1173         if (name)
1174                 ret = vfs_parse_fs_string(fc, "source",
1175                                           name, strlen(name));
1176         if (!ret)
1177                 ret = parse_monolithic_mount_data(fc, data);
1178         if (!ret)
1179                 mnt = fc_mount(fc);
1180         else
1181                 mnt = ERR_PTR(ret);
1182
1183         put_fs_context(fc);
1184         return mnt;
1185 }
1186 EXPORT_SYMBOL_GPL(vfs_kern_mount);
1187
1188 struct vfsmount *
1189 vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
1190              const char *name, void *data)
1191 {
1192         /* Until it is worked out how to pass the user namespace
1193          * through from the parent mount to the submount don't support
1194          * unprivileged mounts with submounts.
1195          */
1196         if (mountpoint->d_sb->s_user_ns != &init_user_ns)
1197                 return ERR_PTR(-EPERM);
1198
1199         return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
1200 }
1201 EXPORT_SYMBOL_GPL(vfs_submount);
1202
/*
 * Create a new, unattached mount of @old's superblock rooted at @root.
 *
 * @flag is a mask of CL_* bits steering how the copy relates to @old:
 *  - CL_SLAVE, CL_PRIVATE, CL_SHARED_TO_SLAVE: the copy does not inherit
 *    @old's peer group id;
 *  - CL_MAKE_SHARED: the copy is marked shared, allocating a group id if
 *    it has none;
 *  - CL_EXPIRE: the copy joins @old's expiry list if @old is on one.
 *
 * The copy inherits @old's mount flags (minus MNT_WRITE_HOLD, MNT_MARKED
 * and MNT_INTERNAL) and idmap, takes an active reference on the superblock
 * and a reference on @root, and is added to the superblock's mount list.
 *
 * Returns the new mount, ERR_PTR(-ENOMEM) if allocation fails, or
 * ERR_PTR() of the error from mnt_alloc_group_id().
 */
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
                                        int flag)
{
        struct super_block *sb = old->mnt.mnt_sb;
        struct mount *mnt;
        int err;

        mnt = alloc_vfsmnt(old->mnt_devname);
        if (!mnt)
                return ERR_PTR(-ENOMEM);

        if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
                mnt->mnt_group_id = 0; /* not a peer of original */
        else
                mnt->mnt_group_id = old->mnt_group_id;

        if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
                err = mnt_alloc_group_id(mnt);
                if (err)
                        goto out_free;
        }

        mnt->mnt.mnt_flags = old->mnt.mnt_flags;
        mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);

        atomic_inc(&sb->s_active);
        mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));

        mnt->mnt.mnt_sb = sb;
        mnt->mnt.mnt_root = dget(root);
        /* unattached for now: self-parented, mounted on its own root */
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
        mnt->mnt_parent = mnt;
        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
        unlock_mount_hash();

        if ((flag & CL_SLAVE) ||
            ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
                /* the copy becomes a slave of @old */
                list_add(&mnt->mnt_slave, &old->mnt_slave_list);
                mnt->mnt_master = old;
                CLEAR_MNT_SHARED(mnt);
        } else if (!(flag & CL_PRIVATE)) {
                /* the copy mirrors @old's peer/slave relationships */
                if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
                        list_add(&mnt->mnt_share, &old->mnt_share);
                if (IS_MNT_SLAVE(old))
                        list_add(&mnt->mnt_slave, &old->mnt_slave);
                mnt->mnt_master = old->mnt_master;
        } else {
                CLEAR_MNT_SHARED(mnt);
        }
        if (flag & CL_MAKE_SHARED)
                set_mnt_shared(mnt);

        /* stick the duplicate mount on the same expiry list
         * as the original if that was on one */
        if (flag & CL_EXPIRE) {
                if (!list_empty(&old->mnt_expire))
                        list_add(&mnt->mnt_expire, &old->mnt_expire);
        }

        return mnt;

 out_free:
        mnt_free_id(mnt);
        free_vfsmnt(mnt);
        return ERR_PTR(err);
}
1270
/*
 * Tear down @mnt: warn about leftover writers, kill any pins, drop the
 * references on mounts parked on ->mnt_stuck_children, notify fsnotify,
 * release the root dentry and the superblock's active reference, free the
 * mount id, and free the structure itself after an RCU grace period.
 */
static void cleanup_mnt(struct mount *mnt)
{
        struct hlist_node *p;
        struct mount *m;
        /*
         * The warning here probably indicates that somebody messed
         * up a mnt_want/drop_write() pair.  If this happens, the
         * filesystem was probably unable to make r/w->r/o transitions.
         * The locking used to deal with mnt_count decrement provides barriers,
         * so mnt_get_writers() below is safe.
         */
        WARN_ON(mnt_get_writers(mnt));
        if (unlikely(mnt->mnt_pins.first))
                mnt_pin_kill(mnt);
        /* children that were still attached when @mnt lost its last reference */
        hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
                hlist_del(&m->mnt_umount);
                mntput(&m->mnt);
        }
        fsnotify_vfsmount_delete(&mnt->mnt);
        dput(mnt->mnt.mnt_root);
        deactivate_super(mnt->mnt.mnt_sb);
        mnt_free_id(mnt);
        call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}
1295
1296 static void __cleanup_mnt(struct rcu_head *head)
1297 {
1298         cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1299 }
1300
1301 static LLIST_HEAD(delayed_mntput_list);
1302 static void delayed_mntput(struct work_struct *unused)
1303 {
1304         struct llist_node *node = llist_del_all(&delayed_mntput_list);
1305         struct mount *m, *t;
1306
1307         llist_for_each_entry_safe(m, t, node, mnt_llist)
1308                 cleanup_mnt(m);
1309 }
1310 static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1311
/*
 * Drop one reference to @mnt without touching its expiry mark.
 *
 * Fast path: while ->mnt_ns is observed non-NULL under rcu_read_lock() the
 * reference being dropped cannot be the last one, so the count is simply
 * decremented.  Otherwise the count is dropped under the mount hash lock;
 * if it reaches zero (and the mount is not already MNT_DOOMED) the mount
 * is marked MNT_DOOMED, taken off ->mnt_instance, any children still
 * attached are unhashed and parked on ->mnt_stuck_children, and
 * cleanup_mnt() is arranged: directly for MNT_INTERNAL mounts, otherwise
 * via task_work on the current task, falling back to delayed work for
 * kernel threads or if task_work_add() fails.
 */
static void mntput_no_expire(struct mount *mnt)
{
        /* dentries of mountpoints released below; shrunk after unlocking */
        LIST_HEAD(list);
        int count;

        rcu_read_lock();
        if (likely(READ_ONCE(mnt->mnt_ns))) {
                /*
                 * Since we don't do lock_mount_hash() here,
                 * ->mnt_ns can change under us.  However, if it's
                 * non-NULL, then there's a reference that won't
                 * be dropped until after an RCU delay done after
                 * turning ->mnt_ns NULL.  So if we observe it
                 * non-NULL under rcu_read_lock(), the reference
                 * we are dropping is not the final one.
                 */
                mnt_add_count(mnt, -1);
                rcu_read_unlock();
                return;
        }
        lock_mount_hash();
        /*
         * make sure that if __legitimize_mnt() has not seen us grab
         * mount_lock, we'll see their refcount increment here.
         */
        smp_mb();
        mnt_add_count(mnt, -1);
        count = mnt_get_count(mnt);
        if (count != 0) {
                /* not the last reference (a negative count is a bug) */
                WARN_ON(count < 0);
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
                /* someone else is already tearing this mount down */
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        mnt->mnt.mnt_flags |= MNT_DOOMED;
        rcu_read_unlock();

        list_del(&mnt->mnt_instance);

        if (unlikely(!list_empty(&mnt->mnt_mounts))) {
                struct mount *p, *tmp;
                /* detach remaining children; cleanup_mnt() drops them later */
                list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
                        __put_mountpoint(unhash_mnt(p), &list);
                        hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
                }
        }
        unlock_mount_hash();
        shrink_dentry_list(&list);

        if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
                struct task_struct *task = current;
                if (likely(!(task->flags & PF_KTHREAD))) {
                        init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
                        if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
                                return;
                }
                /* schedule the work only if the list was empty before us */
                if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
                        schedule_delayed_work(&delayed_mntput_work, 1);
                return;
        }
        cleanup_mnt(mnt);
}
1379
1380 void mntput(struct vfsmount *mnt)
1381 {
1382         if (mnt) {
1383                 struct mount *m = real_mount(mnt);
1384                 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
1385                 if (unlikely(m->mnt_expiry_mark))
1386                         m->mnt_expiry_mark = 0;
1387                 mntput_no_expire(m);
1388         }
1389 }
1390 EXPORT_SYMBOL(mntput);
1391
1392 struct vfsmount *mntget(struct vfsmount *mnt)
1393 {
1394         if (mnt)
1395                 mnt_add_count(real_mount(mnt), 1);
1396         return mnt;
1397 }
1398 EXPORT_SYMBOL(mntget);
1399
1400 /*
1401  * Make a mount point inaccessible to new lookups.
1402  * Because there may still be current users, the caller MUST WAIT
1403  * for an RCU grace period before destroying the mount point.
1404  */
1405 void mnt_make_shortterm(struct vfsmount *mnt)
1406 {
1407         if (mnt)
1408                 real_mount(mnt)->mnt_ns = NULL;
1409 }
1410
/**
 * path_is_mountpoint() - Check if path is a mount in the current namespace.
 * @path: path to check
 *
 *  d_mountpoint() can only be used reliably to establish if a dentry is
 *  not mounted in any namespace and that common case is handled inline.
 *  d_mountpoint() isn't aware of the possibility there may be multiple
 *  mounts using a given dentry in a different namespace. This function
 *  checks if the passed in path is a mountpoint rather than the dentry
 *  alone.
 *
 *  Return: false if the dentry is not a mountpoint at all; otherwise the
 *  result of __path_is_mountpoint() sampled consistently against
 *  mount_lock.
 */
bool path_is_mountpoint(const struct path *path)
{
        unsigned seq;
        bool res;

        if (!d_mountpoint(path->dentry))
                return false;

        rcu_read_lock();
        /* lockless read: repeat if a writer touched mount_lock meanwhile */
        do {
                seq = read_seqbegin(&mount_lock);
                res = __path_is_mountpoint(path);
        } while (read_seqretry(&mount_lock, seq));
        rcu_read_unlock();

        return res;
}
EXPORT_SYMBOL(path_is_mountpoint);
1440
1441 struct vfsmount *mnt_clone_internal(const struct path *path)
1442 {
1443         struct mount *p;
1444         p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1445         if (IS_ERR(p))
1446                 return ERR_CAST(p);
1447         p->mnt.mnt_flags |= MNT_INTERNAL;
1448         return &p->mnt;
1449 }
1450
1451 #ifdef CONFIG_PROC_FS
1452 static struct mount *mnt_list_next(struct mnt_namespace *ns,
1453                                    struct list_head *p)
1454 {
1455         struct mount *mnt, *ret = NULL;
1456
1457         lock_ns_list(ns);
1458         list_for_each_continue(p, &ns->list) {
1459                 mnt = list_entry(p, typeof(*mnt), mnt_list);
1460                 if (!mnt_is_cursor(mnt)) {
1461                         ret = mnt;
1462                         break;
1463                 }
1464         }
1465         unlock_ns_list(ns);
1466
1467         return ret;
1468 }
1469
1470 /* iterator; we want it to have access to namespace_sem, thus here... */
1471 static void *m_start(struct seq_file *m, loff_t *pos)
1472 {
1473         struct proc_mounts *p = m->private;
1474         struct list_head *prev;
1475
1476         down_read(&namespace_sem);
1477         if (!*pos) {
1478                 prev = &p->ns->list;
1479         } else {
1480                 prev = &p->cursor.mnt_list;
1481
1482                 /* Read after we'd reached the end? */
1483                 if (list_empty(prev))
1484                         return NULL;
1485         }
1486
1487         return mnt_list_next(p->ns, prev);
1488 }
1489
1490 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1491 {
1492         struct proc_mounts *p = m->private;
1493         struct mount *mnt = v;
1494
1495         ++*pos;
1496         return mnt_list_next(p->ns, &mnt->mnt_list);
1497 }
1498
1499 static void m_stop(struct seq_file *m, void *v)
1500 {
1501         struct proc_mounts *p = m->private;
1502         struct mount *mnt = v;
1503
1504         lock_ns_list(p->ns);
1505         if (mnt)
1506                 list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
1507         else
1508                 list_del_init(&p->cursor.mnt_list);
1509         unlock_ns_list(p->ns);
1510         up_read(&namespace_sem);
1511 }
1512
1513 static int m_show(struct seq_file *m, void *v)
1514 {
1515         struct proc_mounts *p = m->private;
1516         struct mount *r = v;
1517         return p->show(m, &r->mnt);
1518 }
1519
/* seq_file iterator over the mounts of a namespace (see m_start() etc.) */
const struct seq_operations mounts_op = {
        .start  = m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = m_show,
};
1526
/*
 * Unlink @cursor from the mount list of @ns.  Takes namespace_sem for
 * reading and the namespace list lock around the removal.
 */
void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
{
        down_read(&namespace_sem);
        lock_ns_list(ns);
        list_del(&cursor->mnt_list);
        unlock_ns_list(ns);
        up_read(&namespace_sem);
}
1535 #endif  /* CONFIG_PROC_FS */
1536
/**
 * may_umount_tree - check if a mount tree is busy
 * @m: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 *
 * The reference counts of all mounts in the tree are summed and compared
 * against a baseline of two references per mount.
 *
 * Return: 1 if the total does not exceed the baseline (tree looks idle),
 * 0 otherwise.
 */
int may_umount_tree(struct vfsmount *m)
{
        struct mount *mnt = real_mount(m);
        struct mount *p;
        int actual_refs = 0;
        int minimum_refs = 0;

        BUG_ON(!m);

        /* write lock needed for mnt_get_count */
        lock_mount_hash();
        p = mnt;
        while (p) {
                actual_refs += mnt_get_count(p);
                minimum_refs += 2;
                p = next_mnt(p, mnt);
        }
        unlock_mount_hash();

        return actual_refs <= minimum_refs;
}

EXPORT_SYMBOL(may_umount_tree);
1568
1569 /**
1570  * may_umount - check if a mount point is busy
1571  * @mnt: root of mount
1572  *
1573  * This is called to check if a mount point has any
1574  * open files, pwds, chroots or sub mounts. If the
1575  * mount has sub mounts this will return busy
1576  * regardless of whether the sub mounts are busy.
1577  *
1578  * Doesn't take quota and stuff into account. IOW, in some cases it will
1579  * give false negatives. The main reason why it's here is that we need
1580  * a non-destructive way to look for easily umountable filesystems.
1581  */
1582 int may_umount(struct vfsmount *mnt)
1583 {
1584         int ret = 1;
1585         down_read(&namespace_sem);
1586         lock_mount_hash();
1587         if (propagate_mount_busy(real_mount(mnt), 2))
1588                 ret = 0;
1589         unlock_mount_hash();
1590         up_read(&namespace_sem);
1591         return ret;
1592 }
1593
1594 EXPORT_SYMBOL(may_umount);
1595
/*
 * Release namespace_sem (held for write) and dispose of what was queued
 * while it was held.
 *
 * The pending "unmounted" mounts and "ex_mountpoints" dentries are moved
 * to local lists before the semaphore is dropped.  The dentries are then
 * shrunk; if any mounts were queued, an expedited RCU grace period is
 * waited for before the reference on each of them is dropped.
 */
static void namespace_unlock(void)
{
        struct hlist_head head;
        struct hlist_node *p;
        struct mount *m;
        LIST_HEAD(list);

        /* steal the global lists while we still hold the semaphore */
        hlist_move_list(&unmounted, &head);
        list_splice_init(&ex_mountpoints, &list);

        up_write(&namespace_sem);

        shrink_dentry_list(&list);

        if (likely(hlist_empty(&head)))
                return;

        synchronize_rcu_expedited();

        hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
                hlist_del(&m->mnt_umount);
                mntput(&m->mnt);
        }
}
1620
/* Take namespace_sem for writing; paired with namespace_unlock(). */
static inline void namespace_lock(void)
{
        down_write(&namespace_sem);
}
1625
/* Bit flags for the "how" argument of umount_tree(). */
enum umount_tree_flags {
        /* synchronous umount: mounts get MNT_SYNC_UMOUNT, always disconnected */
        UMOUNT_SYNC = 1,
        /* also run the propagate_mount_unlock()/propagate_umount() steps */
        UMOUNT_PROPAGATE = 2,
        /* keep mounts connected to a parent that is itself being unmounted */
        UMOUNT_CONNECTED = 4,
};
1631
1632 static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
1633 {
1634         /* Leaving mounts connected is only valid for lazy umounts */
1635         if (how & UMOUNT_SYNC)
1636                 return true;
1637
1638         /* A mount without a parent has nothing to be connected to */
1639         if (!mnt_has_parent(mnt))
1640                 return true;
1641
1642         /* Because the reference counting rules change when mounts are
1643          * unmounted and connected, umounted mounts may not be
1644          * connected to mounted mounts.
1645          */
1646         if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
1647                 return true;
1648
1649         /* Has it been requested that the mount remain connected? */
1650         if (how & UMOUNT_CONNECTED)
1651                 return false;
1652
1653         /* Is the mount locked such that it needs to remain connected? */
1654         if (IS_MNT_LOCKED(mnt))
1655                 return false;
1656
1657         /* By default disconnect the mount */
1658         return true;
1659 }
1660
/*
 * Unmount the tree rooted at @mnt according to the UMOUNT_* bits in @how.
 *
 * All mounts of the tree are flagged MNT_UMOUNT and gathered on a
 * temporary list (extended by propagate_umount() when UMOUNT_PROPAGATE is
 * set).  Each gathered mount is then removed from its expiry and
 * namespace lists, has ->mnt_ns cleared, drops its reference on the
 * parent, is made private, and is either detached and queued on the
 * "unmounted" list or left connected to its parent, as decided by
 * disconnect_mount().
 *
 * mount_lock must be held
 * namespace_sem must be held for write
 */
static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
{
        LIST_HEAD(tmp_list);
        struct mount *p;

        if (how & UMOUNT_PROPAGATE)
                propagate_mount_unlock(mnt);

        /* Gather the mounts to umount */
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                p->mnt.mnt_flags |= MNT_UMOUNT;
                list_move(&p->mnt_list, &tmp_list);
        }

        /* Hide the mounts from mnt_mounts */
        list_for_each_entry(p, &tmp_list, mnt_list) {
                list_del_init(&p->mnt_child);
        }

        /* Add propagated mounts to the tmp_list */
        if (how & UMOUNT_PROPAGATE)
                propagate_umount(&tmp_list);

        while (!list_empty(&tmp_list)) {
                struct mnt_namespace *ns;
                bool disconnect;
                p = list_first_entry(&tmp_list, struct mount, mnt_list);
                list_del_init(&p->mnt_expire);
                list_del_init(&p->mnt_list);
                ns = p->mnt_ns;
                if (ns) {
                        ns->mounts--;
                        __touch_mnt_namespace(ns);
                }
                p->mnt_ns = NULL;
                if (how & UMOUNT_SYNC)
                        p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;

                disconnect = disconnect_mount(p, how);
                if (mnt_has_parent(p)) {
                        mnt_add_count(p->mnt_parent, -1);
                        if (!disconnect) {
                                /* Don't forget about p */
                                list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
                        } else {
                                umount_mnt(p);
                        }
                }
                change_mnt_propagation(p, MS_PRIVATE);
                if (disconnect)
                        hlist_add_head(&p->mnt_umount, &unmounted);
        }
}
1718
1719 static void shrink_submounts(struct mount *mnt);
1720
1721 static int do_umount_root(struct super_block *sb)
1722 {
1723         int ret = 0;
1724
1725         down_write(&sb->s_umount);
1726         if (!sb_rdonly(sb)) {
1727                 struct fs_context *fc;
1728
1729                 fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
1730                                                 SB_RDONLY);
1731                 if (IS_ERR(fc)) {
1732                         ret = PTR_ERR(fc);
1733                 } else {
1734                         ret = parse_monolithic_mount_data(fc, NULL);
1735                         if (!ret)
1736                                 ret = reconfigure_super(fc);
1737                         put_fs_context(fc);
1738                 }
1739         }
1740         up_write(&sb->s_umount);
1741         return ret;
1742 }
1743
/*
 * Unmount @mnt according to @flags (MNT_EXPIRE, MNT_FORCE, MNT_DETACH).
 *
 * Returns 0 on success or a negative errno:
 *  - the error from security_sb_umount();
 *  - -EINVAL for MNT_EXPIRE on the caller's root mount or combined with
 *    MNT_FORCE/MNT_DETACH, or if @mnt is MNT_LOCKED;
 *  - -EBUSY if the mount is in use (MNT_EXPIRE or non-lazy umount);
 *  - -EAGAIN if MNT_EXPIRE has only just set the expiry mark;
 *  - -EPERM (or the do_umount_root() result) when asked to unmount the
 *    caller's root without MNT_DETACH.
 */
static int do_umount(struct mount *mnt, int flags)
{
        struct super_block *sb = mnt->mnt.mnt_sb;
        int retval;

        retval = security_sb_umount(&mnt->mnt, flags);
        if (retval)
                return retval;

        /*
         * Allow userspace to request a mountpoint be expired rather than
         * unmounting unconditionally. Unmount only happens if:
         *  (1) the mark is already set (the mark is cleared by mntput())
         *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
         */
        if (flags & MNT_EXPIRE) {
                if (&mnt->mnt == current->fs->root.mnt ||
                    flags & (MNT_FORCE | MNT_DETACH))
                        return -EINVAL;

                /*
                 * probably don't strictly need the lock here if we examined
                 * all race cases, but it's a slowpath.
                 */
                lock_mount_hash();
                if (mnt_get_count(mnt) != 2) {
                        unlock_mount_hash();
                        return -EBUSY;
                }
                unlock_mount_hash();

                /* first request only sets the mark; a second one unmounts */
                if (!xchg(&mnt->mnt_expiry_mark, 1))
                        return -EAGAIN;
        }

        /*
         * If we may have to abort operations to get out of this
         * mount, and they will themselves hold resources we must
         * allow the fs to do things. In the Unix tradition of
         * 'Gee thats tricky lets do it in userspace' the umount_begin
         * might fail to complete on the first run through as other tasks
         * must return, and the like. Thats for the mount program to worry
         * about for the moment.
         */

        if (flags & MNT_FORCE && sb->s_op->umount_begin) {
                sb->s_op->umount_begin(sb);
        }

        /*
         * No sense to grab the lock for this test, but test itself looks
         * somewhat bogus. Suggestions for better replacement?
         * Ho-hum... In principle, we might treat that as umount + switch
         * to rootfs. GC would eventually take care of the old vfsmount.
         * Actually it makes sense, especially if rootfs would contain a
         * /reboot - static binary that would close all descriptors and
         * call reboot(9). Then init(8) could umount root and exec /reboot.
         */
        if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
                /*
                 * Special case for "unmounting" root ...
                 * we just try to remount it readonly.
                 */
                if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                        return -EPERM;
                return do_umount_root(sb);
        }

        namespace_lock();
        lock_mount_hash();

        /* Recheck MNT_LOCKED with the locks held */
        retval = -EINVAL;
        if (mnt->mnt.mnt_flags & MNT_LOCKED)
                goto out;

        event++;
        if (flags & MNT_DETACH) {
                /* lazy umount: succeeds even if the mount is busy */
                if (!list_empty(&mnt->mnt_list))
                        umount_tree(mnt, UMOUNT_PROPAGATE);
                retval = 0;
        } else {
                shrink_submounts(mnt);
                retval = -EBUSY;
                if (!propagate_mount_busy(mnt, 2)) {
                        if (!list_empty(&mnt->mnt_list))
                                umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                        retval = 0;
                }
        }
out:
        unlock_mount_hash();
        namespace_unlock();
        return retval;
}
1839
/*
 * __detach_mounts - lazily unmount all mounts on the specified dentry
 *
 * During unlink, rmdir, and d_drop it is possible to lose the path
 * to an existing mountpoint, and wind up leaking the mount.
 * detach_mounts allows lazily unmounting those mounts instead of
 * leaking them.
 *
 * Holds namespace_lock() and the mount hash lock for the whole
 * operation and does nothing when lookup_mountpoint() finds no
 * mountpoint for @dentry.
 *
 * The caller may hold dentry->d_inode->i_mutex.
 */
void __detach_mounts(struct dentry *dentry)
{
        struct mountpoint *mp;
        struct mount *mnt;

        namespace_lock();
        lock_mount_hash();
        mp = lookup_mountpoint(dentry);
        if (!mp)
                goto out_unlock;

        event++;
        /*
         * Always process the first entry of mp->m_list.  Both branches are
         * presumably expected to unlink that entry from the list, which is
         * what lets the loop terminate - TODO confirm against umount_mnt()
         * and umount_tree().
         */
        while (!hlist_empty(&mp->m_list)) {
                mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
                if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
                        /* Already flagged MNT_UMOUNT: queue it on 'unmounted'. */
                        umount_mnt(mnt);
                        hlist_add_head(&mnt->mnt_umount, &unmounted);
                }
                else umount_tree(mnt, UMOUNT_CONNECTED);
        }
        /* NOTE(review): presumably pairs with lookup_mountpoint() above. */
        put_mountpoint(mp);
out_unlock:
        unlock_mount_hash();
        namespace_unlock();
}
1875
1876 /*
1877  * Is the caller allowed to modify his namespace?
1878  */
1879 bool may_mount(void)
1880 {
1881         return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
1882 }
1883
/*
 * Print a one-time warning that the "mand" mount option is deprecated
 * and ignored.  pr_warn_once() ensures the banner is logged only once.
 */
static void warn_mandlock(void)
{
        /* The second line used to start with a duplicated "and". */
        pr_warn_once("=======================================================\n"
                     "WARNING: The mand mount option has been deprecated and\n"
                     "         is ignored by this kernel. Remove the mand\n"
                     "         option from the mount to silence this warning.\n"
                     "=======================================================\n");
}
1892
1893 static int can_umount(const struct path *path, int flags)
1894 {
1895         struct mount *mnt = real_mount(path->mnt);
1896
1897         if (!may_mount())
1898                 return -EPERM;
1899         if (path->dentry != path->mnt->mnt_root)
1900                 return -EINVAL;
1901         if (!check_mnt(mnt))
1902                 return -EINVAL;
1903         if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
1904                 return -EINVAL;
1905         if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
1906                 return -EPERM;
1907         return 0;
1908 }
1909
1910 // caller is responsible for flags being sane
1911 int path_umount(struct path *path, int flags)
1912 {
1913         struct mount *mnt = real_mount(path->mnt);
1914         int ret;
1915
1916         ret = can_umount(path, flags);
1917         if (!ret)
1918                 ret = do_umount(mnt, flags);
1919
1920         /* we mustn't call path_put() as that would clear mnt_expiry_mark */
1921         dput(path->dentry);
1922         mntput_no_expire(mnt);
1923         return ret;
1924 }
1925
1926 static int ksys_umount(char __user *name, int flags)
1927 {
1928         int lookup_flags = LOOKUP_MOUNTPOINT;
1929         struct path path;
1930         int ret;
1931
1932         // basic validity checks done first
1933         if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1934                 return -EINVAL;
1935
1936         if (!(flags & UMOUNT_NOFOLLOW))
1937                 lookup_flags |= LOOKUP_FOLLOW;
1938         ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
1939         if (ret)
1940                 return ret;
1941         return path_umount(&path, flags);
1942 }
1943
/*
 * umount system call entry point taking a path and flags; forwards both
 * arguments unchanged to ksys_umount().
 */
SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
        return ksys_umount(name, flags);
}
1948
#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 *      The 2.0 compatible umount. No flags.
 *
 *      Only built when __ARCH_WANT_SYS_OLDUMOUNT is defined; calls
 *      ksys_umount() with a flags value of 0.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
        return ksys_umount(name, 0);
}

#endif
1960
1961 static bool is_mnt_ns_file(struct dentry *dentry)
1962 {
1963         /* Is this a proxy for a mount namespace? */
1964         return dentry->d_op == &ns_dentry_operations &&
1965                dentry->d_fsdata == &mntns_operations;
1966 }
1967
/* Map an embedded ns_common back to the mnt_namespace that contains it. */
static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
        return container_of(ns, struct mnt_namespace, ns);
}
1972
/* Return the ns_common embedded in the given mount namespace. */
struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
{
        return &mnt->ns;
}
1977
1978 static bool mnt_ns_loop(struct dentry *dentry)
1979 {
1980         /* Could bind mounting the mount namespace inode cause a
1981          * mount namespace loop?
1982          */
1983         struct mnt_namespace *mnt_ns;
1984         if (!is_mnt_ns_file(dentry))
1985                 return false;
1986
1987         mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
1988         return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1989 }
1990
/*
 * copy_tree - clone @mnt (rooted at @dentry) together with its child mounts
 *
 * @mnt:    mount to copy
 * @dentry: dentry inside @mnt that becomes the root of the copy; only
 *          children mounted at or below it are copied
 * @flag:   CL_* flags passed on to clone_mnt(); CL_COPY_UNBINDABLE and
 *          CL_COPY_MNT_NS_FILE additionally control which mounts are
 *          copied rather than skipped
 *
 * Returns the root of the new tree or an ERR_PTR().  On failure after the
 * root was cloned, the partial copy is torn down with umount_tree().
 */
struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
                                        int flag)
{
        struct mount *res, *p, *q, *r, *parent;

        if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
                return ERR_PTR(-EINVAL);

        if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
                return ERR_PTR(-EINVAL);

        res = q = clone_mnt(mnt, dentry, flag);
        if (IS_ERR(q))
                return q;

        q->mnt_mountpoint = mnt->mnt_mountpoint;

        /*
         * p walks the source tree and q the copy; they are kept in step so
         * that q is always the clone corresponding to p.
         */
        p = mnt;
        list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
                struct mount *s;
                /* Ignore children mounted outside the subtree under @dentry. */
                if (!is_subdir(r->mnt_mountpoint, dentry))
                        continue;

                for (s = r; s; s = next_mnt(s, r)) {
                        if (!(flag & CL_COPY_UNBINDABLE) &&
                            IS_MNT_UNBINDABLE(s)) {
                                if (s->mnt.mnt_flags & MNT_LOCKED) {
                                        /* Both unbindable and locked. */
                                        q = ERR_PTR(-EPERM);
                                        goto out;
                                } else {
                                        s = skip_mnt_tree(s);
                                        continue;
                                }
                        }
                        if (!(flag & CL_COPY_MNT_NS_FILE) &&
                            is_mnt_ns_file(s->mnt.mnt_root)) {
                                s = skip_mnt_tree(s);
                                continue;
                        }
                        /*
                         * Climb both trees in lockstep until p is the parent
                         * of s; q is then the clone to attach the copy to.
                         */
                        while (p != s->mnt_parent) {
                                p = p->mnt_parent;
                                q = q->mnt_parent;
                        }
                        p = s;
                        parent = q;
                        q = clone_mnt(p, p->mnt.mnt_root, flag);
                        if (IS_ERR(q))
                                goto out;
                        lock_mount_hash();
                        list_add_tail(&q->mnt_list, &res->mnt_list);
                        attach_mnt(q, parent, p->mnt_mp);
                        unlock_mount_hash();
                }
        }
        return res;
out:
        /* q holds the ERR_PTR() to return; discard what was copied so far. */
        if (res) {
                lock_mount_hash();
                umount_tree(res, UMOUNT_SYNC);
                unlock_mount_hash();
        }
        return q;
}
2055
2056 /* Caller should check returned pointer for errors */
2057
2058 struct vfsmount *collect_mounts(const struct path *path)
2059 {
2060         struct mount *tree;
2061         namespace_lock();
2062         if (!check_mnt(real_mount(path->mnt)))
2063                 tree = ERR_PTR(-EINVAL);
2064         else
2065                 tree = copy_tree(real_mount(path->mnt), path->dentry,
2066                                  CL_COPY_ALL | CL_PRIVATE);
2067         namespace_unlock();
2068         if (IS_ERR(tree))
2069                 return ERR_CAST(tree);
2070         return &tree->mnt;
2071 }
2072
2073 static void free_mnt_ns(struct mnt_namespace *);
2074 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
2075
2076 void dissolve_on_fput(struct vfsmount *mnt)
2077 {
2078         struct mnt_namespace *ns;
2079         namespace_lock();
2080         lock_mount_hash();
2081         ns = real_mount(mnt)->mnt_ns;
2082         if (ns) {
2083                 if (is_anon_ns(ns))
2084                         umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
2085                 else
2086                         ns = NULL;
2087         }
2088         unlock_mount_hash();
2089         namespace_unlock();
2090         if (ns)
2091                 free_mnt_ns(ns);
2092 }
2093
/*
 * Tear down the mount tree rooted at @mnt by calling umount_tree() with
 * no flags, under namespace_lock() and the mount hash lock.
 */
void drop_collected_mounts(struct vfsmount *mnt)
{
        struct mount *tree = real_mount(mnt);

        namespace_lock();
        lock_mount_hash();
        umount_tree(tree, 0);
        unlock_mount_hash();
        namespace_unlock();
}
2102
2103 static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
2104 {
2105         struct mount *child;
2106
2107         list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
2108                 if (!is_subdir(child->mnt_mountpoint, dentry))
2109                         continue;
2110
2111                 if (child->mnt.mnt_flags & MNT_LOCKED)
2112                         return true;
2113         }
2114         return false;
2115 }
2116
2117 /**
2118  * clone_private_mount - create a private clone of a path
2119  * @path: path to clone
2120  *
2121  * This creates a new vfsmount, which will be the clone of @path.  The new mount
2122  * will not be attached anywhere in the namespace and will be private (i.e.
2123  * changes to the originating mount won't be propagated into this).
2124  *
2125  * Release with mntput().
2126  */
2127 struct vfsmount *clone_private_mount(const struct path *path)
2128 {
2129         struct mount *old_mnt = real_mount(path->mnt);
2130         struct mount *new_mnt;
2131
2132         down_read(&namespace_sem);
2133         if (IS_MNT_UNBINDABLE(old_mnt))
2134                 goto invalid;
2135
2136         if (!check_mnt(old_mnt))
2137                 goto invalid;
2138
2139         if (has_locked_children(old_mnt, path->dentry))
2140                 goto invalid;
2141
2142         new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
2143         up_read(&namespace_sem);
2144
2145         if (IS_ERR(new_mnt))
2146                 return ERR_CAST(new_mnt);
2147
2148         /* Longterm mount to be removed by kern_unmount*() */
2149         new_mnt->mnt_ns = MNT_NS_INTERNAL;
2150
2151         return &new_mnt->mnt;
2152
2153 invalid:
2154         up_read(&namespace_sem);
2155         return ERR_PTR(-EINVAL);
2156 }
2157 EXPORT_SYMBOL_GPL(clone_private_mount);
2158
2159 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
2160                    struct vfsmount *root)
2161 {
2162         struct mount *mnt;
2163         int res = f(root, arg);
2164         if (res)
2165                 return res;
2166         list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
2167                 res = f(&mnt->mnt, arg);
2168                 if (res)
2169                         return res;
2170         }
2171         return 0;
2172 }
2173
2174 static void lock_mnt_tree(struct mount *mnt)
2175 {
2176         struct mount *p;
2177
2178         for (p = mnt; p; p = next_mnt(p, mnt)) {
2179                 int flags = p->mnt.mnt_flags;
2180                 /* Don't allow unprivileged users to change mount flags */
2181                 flags |= MNT_LOCK_ATIME;
2182
2183                 if (flags & MNT_READONLY)
2184                         flags |= MNT_LOCK_READONLY;
2185
2186                 if (flags & MNT_NODEV)
2187                         flags |= MNT_LOCK_NODEV;
2188
2189                 if (flags & MNT_NOSUID)
2190                         flags |= MNT_LOCK_NOSUID;
2191
2192                 if (flags & MNT_NOEXEC)
2193                         flags |= MNT_LOCK_NOEXEC;
2194                 /* Don't allow unprivileged users to reveal what is under a mount */
2195                 if (list_empty(&p->mnt_expire))
2196                         flags |= MNT_LOCKED;
2197                 p->mnt.mnt_flags = flags;
2198         }
2199 }
2200
2201 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
2202 {
2203         struct mount *p;
2204
2205         for (p = mnt; p != end; p = next_mnt(p, mnt)) {
2206                 if (p->mnt_group_id && !IS_MNT_SHARED(p))
2207                         mnt_release_group_id(p);
2208         }
2209 }
2210
2211 static int invent_group_ids(struct mount *mnt, bool recurse)
2212 {
2213         struct mount *p;
2214
2215         for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
2216                 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
2217                         int err = mnt_alloc_group_id(p);
2218                         if (err) {
2219                                 cleanup_group_ids(mnt, p);
2220                                 return err;
2221                         }
2222                 }
2223         }
2224
2225         return 0;
2226 }
2227
2228 int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
2229 {
2230         unsigned int max = READ_ONCE(sysctl_mount_max);
2231         unsigned int mounts = 0;
2232         struct mount *p;
2233
2234         if (ns->mounts >= max)
2235                 return -ENOSPC;
2236         max -= ns->mounts;
2237         if (ns->pending_mounts >= max)
2238                 return -ENOSPC;
2239         max -= ns->pending_mounts;
2240
2241         for (p = mnt; p; p = next_mnt(p, mnt))
2242                 mounts++;
2243
2244         if (mounts > max)
2245                 return -ENOSPC;
2246
2247         ns->pending_mounts += mounts;
2248         return 0;
2249 }
2250
2251 /*
 *  @source_mnt : mount tree to be attached
 *  @dest_mnt   : mount that the tree @source_mnt is attached to
 *  @dest_mp    : mountpoint on @dest_mnt where @source_mnt is attached
 *  @moving     : if true, @source_mnt is detached from its current parent
 *                 and attached at the destination instead of being
 *                 committed as a new tree (done when source_mnt is moved)
 *
 *  NOTE: the table below explains the semantics when a source mount
 *  of a given type is attached to a destination mount of a given type.
2260  * ---------------------------------------------------------------------------
2261  * |         BIND MOUNT OPERATION                                            |
2262  * |**************************************************************************
2263  * | source-->| shared        |       private  |       slave    | unbindable |
2264  * | dest     |               |                |                |            |
2265  * |   |      |               |                |                |            |
2266  * |   v      |               |                |                |            |
2267  * |**************************************************************************
2268  * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
2269  * |          |               |                |                |            |
2270  * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
2271  * ***************************************************************************
2272  * A bind operation clones the source mount and mounts the clone on the
2273  * destination mount.
2274  *
2275  * (++)  the cloned mount is propagated to all the mounts in the propagation
2276  *       tree of the destination mount and the cloned mount is added to
2277  *       the peer group of the source mount.
2278  * (+)   the cloned mount is created under the destination mount and is marked
2279  *       as shared. The cloned mount is added to the peer group of the source
2280  *       mount.
2281  * (+++) the mount is propagated to all the mounts in the propagation tree
2282  *       of the destination mount and the cloned mount is made slave
2283  *       of the same master as that of the source mount. The cloned mount
2284  *       is marked as 'shared and slave'.
2285  * (*)   the cloned mount is made a slave of the same master as that of the
2286  *       source mount.
2287  *
2288  * ---------------------------------------------------------------------------
2289  * |                    MOVE MOUNT OPERATION                                 |
2290  * |**************************************************************************
2291  * | source-->| shared        |       private  |       slave    | unbindable |
2292  * | dest     |               |                |                |            |
2293  * |   |      |               |                |                |            |
2294  * |   v      |               |                |                |            |
2295  * |**************************************************************************
2296  * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
2297  * |          |               |                |                |            |
2298  * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
2299  * ***************************************************************************
2300  *
2301  * (+)  the mount is moved to the destination. And is then propagated to
2302  *      all the mounts in the propagation tree of the destination mount.
2303  * (+*)  the mount is moved to the destination.
2304  * (+++)  the mount is moved to the destination and is then propagated to
2305  *      all the mounts belonging to the destination mount's propagation tree.
2306  *      the mount is marked as 'shared and slave'.
2307  * (*)  the mount continues to be a slave at the new location.
2308  *
2309  * if the source mount is a tree, the operations explained above is
2310  * applied to each mount in the tree.
2311  * Must be called without spinlocks held, since this function can sleep
2312  * in allocations.
2313  */
static int attach_recursive_mnt(struct mount *source_mnt,
                        struct mount *dest_mnt,
                        struct mountpoint *dest_mp,
                        bool moving)
{
        struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
        HLIST_HEAD(tree_list);
        struct mnt_namespace *ns = dest_mnt->mnt_ns;
        struct mountpoint *smp;
        struct mount *child, *p;
        struct hlist_node *n;
        int err;

        /* Preallocate a mountpoint in case the new mounts need
         * to be tucked under other mounts.
         */
        smp = get_mountpoint(source_mnt->mnt.mnt_root);
        if (IS_ERR(smp))
                return PTR_ERR(smp);

        /* Is there space to add these mounts to the mount namespace? */
        if (!moving) {
                err = count_mounts(ns, source_mnt);
                if (err)
                        goto out;
        }

        if (IS_MNT_SHARED(dest_mnt)) {
                err = invent_group_ids(source_mnt, true);
                if (err)
                        goto out;
                err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
                /*
                 * The hash lock is taken before the error check because the
                 * out_cleanup_ids path runs umount_tree() and then unlocks.
                 */
                lock_mount_hash();
                if (err)
                        goto out_cleanup_ids;
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
                        set_mnt_shared(p);
        } else {
                lock_mount_hash();
        }
        /* From here on the mount hash lock is held on every path. */
        if (moving) {
                unhash_mnt(source_mnt);
                attach_mnt(source_mnt, dest_mnt, dest_mp);
                touch_mnt_namespace(source_mnt->mnt_ns);
        } else {
                if (source_mnt->mnt_ns) {
                        /* move from anon - the caller will destroy */
                        list_del_init(&source_mnt->mnt_ns->list);
                }
                mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
                commit_tree(source_mnt);
        }

        /* Commit every mount that propagate_mnt() queued on tree_list. */
        hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
                struct mount *q;
                hlist_del_init(&child->mnt_hash);
                /*
                 * If something is already mounted where the child goes,
                 * re-point it onto the preallocated smp via
                 * mnt_change_mountpoint().
                 */
                q = __lookup_mnt(&child->mnt_parent->mnt,
                                 child->mnt_mountpoint);
                if (q)
                        mnt_change_mountpoint(child, smp, q);
                /* Notice when we are propagating across user namespaces */
                if (child->mnt_parent->mnt_ns->user_ns != user_ns)
                        lock_mnt_tree(child);
                /* The root of the propagated copy itself is left unlocked. */
                child->mnt.mnt_flags &= ~MNT_LOCKED;
                commit_tree(child);
        }
        put_mountpoint(smp);
        unlock_mount_hash();

        return 0;

 out_cleanup_ids:
        /* Entered with the mount hash lock held: discard propagated copies. */
        while (!hlist_empty(&tree_list)) {
                child = hlist_entry(tree_list.first, struct mount, mnt_hash);
                child->mnt_parent->mnt_ns->pending_mounts = 0;
                umount_tree(child, UMOUNT_SYNC);
        }
        unlock_mount_hash();
        cleanup_group_ids(source_mnt, NULL);
 out:
        /* Drop the reservation that count_mounts() may have added. */
        ns->pending_mounts = 0;

        /* The hash lock is not held here; mount_lock is taken around the put. */
        read_seqlock_excl(&mount_lock);
        put_mountpoint(smp);
        read_sequnlock_excl(&mount_lock);

        return err;
}
2402
/*
 * lock_mount - lock the location at @path for mounting
 *
 * Takes the inode lock of path->dentry and then namespace_lock().  If
 * something is already mounted on @path, both locks are dropped, @path is
 * moved to the root of that mount and the procedure is retried.
 *
 * On success the mountpoint for the final dentry is returned with the
 * inode lock and namespace_lock() still held; unlock_mount() releases
 * them.  On failure an ERR_PTR() is returned with no locks held; note
 * that @path may already have been advanced by earlier retries.
 */
static struct mountpoint *lock_mount(struct path *path)
{
        struct vfsmount *mnt;
        struct dentry *dentry = path->dentry;
retry:
        inode_lock(dentry->d_inode);
        if (unlikely(cant_mount(dentry))) {
                inode_unlock(dentry->d_inode);
                return ERR_PTR(-ENOENT);
        }
        namespace_lock();
        mnt = lookup_mnt(path);
        if (likely(!mnt)) {
                /* Nothing mounted here: this is the place to mount on. */
                struct mountpoint *mp = get_mountpoint(dentry);
                if (IS_ERR(mp)) {
                        namespace_unlock();
                        inode_unlock(dentry->d_inode);
                        return mp;
                }
                return mp;
        }
        /* Step onto the mount found at this location and try again. */
        namespace_unlock();
        inode_unlock(path->dentry->d_inode);
        path_put(path);
        /* NOTE(review): presumably lookup_mnt() returned a reference that @path now owns. */
        path->mnt = mnt;
        dentry = path->dentry = dget(mnt->mnt_root);
        goto retry;
}
2431
/*
 * unlock_mount - undo lock_mount()
 *
 * Drops the mountpoint reference under mount_lock, then releases
 * namespace_lock() and the inode lock of the mountpoint's dentry.
 */
static void unlock_mount(struct mountpoint *where)
{
        /* Fetch the dentry first; @where is not used after put_mountpoint(). */
        struct dentry *dentry = where->m_dentry;

        read_seqlock_excl(&mount_lock);
        put_mountpoint(where);
        read_sequnlock_excl(&mount_lock);

        namespace_unlock();
        inode_unlock(dentry->d_inode);
}
2443
2444 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
2445 {
2446         if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
2447                 return -EINVAL;
2448
2449         if (d_is_dir(mp->m_dentry) !=
2450               d_is_dir(mnt->mnt.mnt_root))
2451                 return -ENOTDIR;
2452
2453         return attach_recursive_mnt(mnt, p, mp, false);
2454 }
2455
2456 /*
2457  * Sanity check the flags to change_mnt_propagation.
2458  */
2459
2460 static int flags_to_propagation_type(int ms_flags)
2461 {
2462         int type = ms_flags & ~(MS_REC | MS_SILENT);
2463
2464         /* Fail if any non-propagation flags are set */
2465         if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
2466                 return 0;
2467         /* Only one propagation flag should be set */
2468         if (!is_power_of_2(type))
2469                 return 0;
2470         return type;
2471 }
2472
2473 /*
2474  * recursively change the type of the mountpoint.
2475  */
2476 static int do_change_type(struct path *path, int ms_flags)
2477 {
2478         struct mount *m;
2479         struct mount *mnt = real_mount(path->mnt);
2480         int recurse = ms_flags & MS_REC;
2481         int type;
2482         int err = 0;
2483
2484         if (path->dentry != path->mnt->mnt_root)
2485                 return -EINVAL;
2486
2487         type = flags_to_propagation_type(ms_flags);
2488         if (!type)
2489                 return -EINVAL;
2490
2491         namespace_lock();
2492         if (type == MS_SHARED) {
2493                 err = invent_group_ids(mnt, recurse);
2494                 if (err)
2495                         goto out_unlock;
2496         }
2497
2498         lock_mount_hash();
2499         for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
2500                 change_mnt_propagation(m, type);
2501         unlock_mount_hash();
2502
2503  out_unlock:
2504         namespace_unlock();
2505         return err;
2506 }
2507
2508 static struct mount *__do_loopback(struct path *old_path, int recurse)
2509 {
2510         struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
2511
2512         if (IS_MNT_UNBINDABLE(old))
2513                 return mnt;
2514
2515         if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
2516                 return mnt;
2517
2518         if (!recurse && has_locked_children(old, old_path->dentry))
2519                 return mnt;
2520
2521         if (recurse)
2522                 mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
2523         else
2524                 mnt = clone_mnt(old, old_path->dentry, 0);
2525
2526         if (!IS_ERR(mnt))
2527                 mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2528
2529         return mnt;
2530 }
2531
/*
 * do loopback mount.
 *
 * Bind-mounts the object named by @old_name onto @path, copying the whole
 * subtree when @recurse is non-zero.  Returns 0 on success or a negative
 * errno; the reference on the looked-up source path is always dropped.
 */
static int do_loopback(struct path *path, const char *old_name,
                                int recurse)
{
        struct path old_path;
        struct mount *mnt = NULL, *parent;
        struct mountpoint *mp;
        int err;
        if (!old_name || !*old_name)
                return -EINVAL;
        err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
        if (err)
                return err;

        /* -EINVAL is also the result if check_mnt(parent) fails below. */
        err = -EINVAL;
        if (mnt_ns_loop(old_path.dentry))
                goto out;

        mp = lock_mount(path);
        if (IS_ERR(mp)) {
                err = PTR_ERR(mp);
                goto out;
        }

        parent = real_mount(path->mnt);
        if (!check_mnt(parent))
                goto out2;

        mnt = __do_loopback(&old_path, recurse);
        if (IS_ERR(mnt)) {
                err = PTR_ERR(mnt);
                goto out2;
        }

        err = graft_tree(mnt, parent, mp);
        if (err) {
                /* Grafting failed: throw the freshly made copy away again. */
                lock_mount_hash();
                umount_tree(mnt, UMOUNT_SYNC);
                unlock_mount_hash();
        }
out2:
        unlock_mount(mp);
out:
        path_put(&old_path);
        return err;
}
2580
/*
 * Copy the mount (or, if @recursive, the mount tree) at @path into a
 * newly allocated mount namespace and open the copy as an O_PATH file.
 *
 * On return path->mnt has been replaced by the copy.  Returns the opened
 * file or an ERR_PTR().
 */
static struct file *open_detached_copy(struct path *path, bool recursive)
{
        struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
        struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
        struct mount *mnt, *p;
        struct file *file;

        if (IS_ERR(ns))
                return ERR_CAST(ns);

        namespace_lock();
        mnt = __do_loopback(path, recursive);
        if (IS_ERR(mnt)) {
                namespace_unlock();
                free_mnt_ns(ns);
                return ERR_CAST(mnt);
        }

        lock_mount_hash();
        /* Hand every mount of the copy over to the new namespace. */
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                p->mnt_ns = ns;
                ns->mounts++;
        }
        ns->root = mnt;
        list_add_tail(&ns->list, &mnt->mnt_list);
        /* NOTE(review): presumably the reference @path holds on the copy below. */
        mntget(&mnt->mnt);
        unlock_mount_hash();
        namespace_unlock();

        /* Swap the mount in @path: drop the original, install the copy. */
        mntput(path->mnt);
        path->mnt = &mnt->mnt;
        file = dentry_open(path, O_PATH, current_cred());
        if (IS_ERR(file))
                dissolve_on_fput(path->mnt);
        else
                file->f_mode |= FMODE_NEED_UNMOUNT;
        return file;
}
2619
2620 SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
2621 {
2622         struct file *file;
2623         struct path path;
2624         int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
2625         bool detached = flags & OPEN_TREE_CLONE;
2626         int error;
2627         int fd;
2628
2629         BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
2630
2631         if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
2632                       AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
2633                       OPEN_TREE_CLOEXEC))
2634                 return -EINVAL;
2635
2636         if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
2637                 return -EINVAL;
2638
2639         if (flags & AT_NO_AUTOMOUNT)
2640                 lookup_flags &= ~LOOKUP_AUTOMOUNT;
2641         if (flags & AT_SYMLINK_NOFOLLOW)
2642                 lookup_flags &= ~LOOKUP_FOLLOW;
2643         if (flags & AT_EMPTY_PATH)
2644                 lookup_flags |= LOOKUP_EMPTY;
2645
2646         if (detached && !may_mount())
2647                 return -EPERM;
2648
2649         fd = get_unused_fd_flags(flags & O_CLOEXEC);
2650         if (fd < 0)
2651                 return fd;
2652
2653         error = user_path_at(dfd, filename, lookup_flags, &path);
2654         if (unlikely(error)) {
2655                 file = ERR_PTR(error);
2656         } else {
2657                 if (detached)
2658                         file = open_detached_copy(&path, flags & AT_RECURSIVE);
2659                 else
2660                         file = dentry_open(&path, O_PATH, current_cred());
2661                 path_put(&path);
2662         }
2663         if (IS_ERR(file)) {
2664                 put_unused_fd(fd);
2665                 return PTR_ERR(file);
2666         }
2667         fd_install(fd, file);
2668         return fd;
2669 }
2670
2671 /*
2672  * Don't allow locked mount flags to be cleared.
2673  *
2674  * No locks need to be held here while testing the various MNT_LOCK
2675  * flags because those flags can never be cleared once they are set.
2676  */
2677 static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
2678 {
2679         unsigned int fl = mnt->mnt.mnt_flags;
2680
2681         if ((fl & MNT_LOCK_READONLY) &&
2682             !(mnt_flags & MNT_READONLY))
2683                 return false;
2684
2685         if ((fl & MNT_LOCK_NODEV) &&
2686             !(mnt_flags & MNT_NODEV))
2687                 return false;
2688
2689         if ((fl & MNT_LOCK_NOSUID) &&
2690             !(mnt_flags & MNT_NOSUID))
2691                 return false;
2692
2693         if ((fl & MNT_LOCK_NOEXEC) &&
2694             !(mnt_flags & MNT_NOEXEC))
2695                 return false;
2696
2697         if ((fl & MNT_LOCK_ATIME) &&
2698             ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
2699                 return false;
2700
2701         return true;
2702 }
2703
2704 static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
2705 {
2706         bool readonly_request = (mnt_flags & MNT_READONLY);
2707
2708         if (readonly_request == __mnt_is_readonly(&mnt->mnt))
2709                 return 0;
2710
2711         if (readonly_request)
2712                 return mnt_make_readonly(mnt);
2713
2714         mnt->mnt.mnt_flags &= ~MNT_READONLY;
2715         return 0;
2716 }
2717
2718 static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
2719 {
2720         mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
2721         mnt->mnt.mnt_flags = mnt_flags;
2722         touch_mnt_namespace(mnt->mnt_ns);
2723 }
2724
2725 static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
2726 {
2727         struct super_block *sb = mnt->mnt_sb;
2728
2729         if (!__mnt_is_readonly(mnt) &&
2730            (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
2731            (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
2732                 char *buf = (char *)__get_free_page(GFP_KERNEL);
2733                 char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
2734                 struct tm tm;
2735
2736                 time64_to_tm(sb->s_time_max, 0, &tm);
2737
2738                 pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n",
2739                         sb->s_type->name,
2740                         is_mounted(mnt) ? "remounted" : "mounted",
2741                         mntpath,
2742                         tm.tm_year+1900, (unsigned long long)sb->s_time_max);
2743
2744                 free_page((unsigned long)buf);
2745                 sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
2746         }
2747 }
2748
2749 /*
2750  * Handle reconfiguration of the mountpoint only without alteration of the
2751  * superblock it refers to.  This is triggered by specifying MS_REMOUNT|MS_BIND
2752  * to mount(2).
2753  */
2754 static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
2755 {
2756         struct super_block *sb = path->mnt->mnt_sb;
2757         struct mount *mnt = real_mount(path->mnt);
2758         int ret;
2759
2760         if (!check_mnt(mnt))
2761                 return -EINVAL;
2762
2763         if (path->dentry != mnt->mnt.mnt_root)
2764                 return -EINVAL;
2765
2766         if (!can_change_locked_flags(mnt, mnt_flags))
2767                 return -EPERM;
2768
2769         /*
2770          * We're only checking whether the superblock is read-only not
2771          * changing it, so only take down_read(&sb->s_umount).
2772          */
2773         down_read(&sb->s_umount);
2774         lock_mount_hash();
2775         ret = change_mount_ro_state(mnt, mnt_flags);
2776         if (ret == 0)
2777                 set_mount_attributes(mnt, mnt_flags);
2778         unlock_mount_hash();
2779         up_read(&sb->s_umount);
2780
2781         mnt_warn_timestamp_expiry(path, &mnt->mnt);
2782
2783         return ret;
2784 }
2785
2786 /*
2787  * change filesystem flags. dir should be a physical root of filesystem.
2788  * If you've mounted a non-root directory somewhere and want to do remount
2789  * on it - tough luck.
2790  */
2791 static int do_remount(struct path *path, int ms_flags, int sb_flags,
2792                       int mnt_flags, void *data)
2793 {
2794         int err;
2795         struct super_block *sb = path->mnt->mnt_sb;
2796         struct mount *mnt = real_mount(path->mnt);
2797         struct fs_context *fc;
2798
2799         if (!check_mnt(mnt))
2800                 return -EINVAL;
2801
2802         if (path->dentry != path->mnt->mnt_root)
2803                 return -EINVAL;
2804
2805         if (!can_change_locked_flags(mnt, mnt_flags))
2806                 return -EPERM;
2807
2808         fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
2809         if (IS_ERR(fc))
2810                 return PTR_ERR(fc);
2811
2812         fc->oldapi = true;
2813         err = parse_monolithic_mount_data(fc, data);
2814         if (!err) {
2815                 down_write(&sb->s_umount);
2816                 err = -EPERM;
2817                 if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
2818                         err = reconfigure_super(fc);
2819                         if (!err) {
2820                                 lock_mount_hash();
2821                                 set_mount_attributes(mnt, mnt_flags);
2822                                 unlock_mount_hash();
2823                         }
2824                 }
2825                 up_write(&sb->s_umount);
2826         }
2827
2828         mnt_warn_timestamp_expiry(path, &mnt->mnt);
2829
2830         put_fs_context(fc);
2831         return err;
2832 }
2833
/*
 * Walk the mount tree rooted at @mnt with next_mnt() and report whether
 * any mount in it is unbindable.  Returns 1 if so, 0 otherwise.
 */
static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *m = mnt;

	while (m) {
		if (IS_MNT_UNBINDABLE(m))
			return 1;
		m = next_mnt(m, mnt);
	}
	return 0;
}
2843
2844 /*
2845  * Check that there aren't references to earlier/same mount namespaces in the
2846  * specified subtree.  Such references can act as pins for mount namespaces
2847  * that aren't checked by the mount-cycle checking code, thereby allowing
2848  * cycles to be made.
2849  */
2850 static bool check_for_nsfs_mounts(struct mount *subtree)
2851 {
2852         struct mount *p;
2853         bool ret = false;
2854
2855         lock_mount_hash();
2856         for (p = subtree; p; p = next_mnt(p, subtree))
2857                 if (mnt_ns_loop(p->mnt.mnt_root))
2858                         goto out;
2859
2860         ret = true;
2861 out:
2862         unlock_mount_hash();
2863         return ret;
2864 }
2865
2866 static int do_set_group(struct path *from_path, struct path *to_path)
2867 {
2868         struct mount *from, *to;
2869         int err;
2870
2871         from = real_mount(from_path->mnt);
2872         to = real_mount(to_path->mnt);
2873
2874         namespace_lock();
2875
2876         err = -EINVAL;
2877         /* To and From must be mounted */
2878         if (!is_mounted(&from->mnt))
2879                 goto out;
2880         if (!is_mounted(&to->mnt))
2881                 goto out;
2882
2883         err = -EPERM;
2884         /* We should be allowed to modify mount namespaces of both mounts */
2885         if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
2886                 goto out;
2887         if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
2888                 goto out;
2889
2890         err = -EINVAL;
2891         /* To and From paths should be mount roots */
2892         if (from_path->dentry != from_path->mnt->mnt_root)
2893                 goto out;
2894         if (to_path->dentry != to_path->mnt->mnt_root)
2895                 goto out;
2896
2897         /* Setting sharing groups is only allowed across same superblock */
2898         if (from->mnt.mnt_sb != to->mnt.mnt_sb)
2899                 goto out;
2900
2901         /* From mount root should be wider than To mount root */
2902         if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
2903                 goto out;
2904
2905         /* From mount should not have locked children in place of To's root */
2906         if (has_locked_children(from, to->mnt.mnt_root))
2907                 goto out;
2908
2909         /* Setting sharing groups is only allowed on private mounts */
2910         if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
2911                 goto out;
2912
2913         /* From should not be private */
2914         if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
2915                 goto out;
2916
2917         if (IS_MNT_SLAVE(from)) {
2918                 struct mount *m = from->mnt_master;
2919
2920                 list_add(&to->mnt_slave, &m->mnt_slave_list);
2921                 to->mnt_master = m;
2922         }
2923
2924         if (IS_MNT_SHARED(from)) {
2925                 to->mnt_group_id = from->mnt_group_id;
2926                 list_add(&to->mnt_share, &from->mnt_share);
2927                 lock_mount_hash();
2928                 set_mnt_shared(to);
2929                 unlock_mount_hash();
2930         }
2931
2932         err = 0;
2933 out:
2934         namespace_unlock();
2935         return err;
2936 }
2937
2938 static int do_move_mount(struct path *old_path, struct path *new_path)
2939 {
2940         struct mnt_namespace *ns;
2941         struct mount *p;
2942         struct mount *old;
2943         struct mount *parent;
2944         struct mountpoint *mp, *old_mp;
2945         int err;
2946         bool attached;
2947
2948         mp = lock_mount(new_path);
2949         if (IS_ERR(mp))
2950                 return PTR_ERR(mp);
2951
2952         old = real_mount(old_path->mnt);
2953         p = real_mount(new_path->mnt);
2954         parent = old->mnt_parent;
2955         attached = mnt_has_parent(old);
2956         old_mp = old->mnt_mp;
2957         ns = old->mnt_ns;
2958
2959         err = -EINVAL;
2960         /* The mountpoint must be in our namespace. */
2961         if (!check_mnt(p))
2962                 goto out;
2963
2964         /* The thing moved must be mounted... */
2965         if (!is_mounted(&old->mnt))
2966                 goto out;
2967
2968         /* ... and either ours or the root of anon namespace */
2969         if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
2970                 goto out;
2971
2972         if (old->mnt.mnt_flags & MNT_LOCKED)
2973                 goto out;
2974
2975         if (old_path->dentry != old_path->mnt->mnt_root)
2976                 goto out;
2977
2978         if (d_is_dir(new_path->dentry) !=
2979             d_is_dir(old_path->dentry))
2980                 goto out;
2981         /*
2982          * Don't move a mount residing in a shared parent.
2983          */
2984         if (attached && IS_MNT_SHARED(parent))
2985                 goto out;
2986         /*
2987          * Don't move a mount tree containing unbindable mounts to a destination
2988          * mount which is shared.
2989          */
2990         if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
2991                 goto out;
2992         err = -ELOOP;
2993         if (!check_for_nsfs_mounts(old))
2994                 goto out;
2995         for (; mnt_has_parent(p); p = p->mnt_parent)
2996                 if (p == old)
2997                         goto out;
2998
2999         err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
3000                                    attached);
3001         if (err)
3002                 goto out;
3003
3004         /* if the mount is moved, it should no longer be expire
3005          * automatically */
3006         list_del_init(&old->mnt_expire);
3007         if (attached)
3008                 put_mountpoint(old_mp);
3009 out:
3010         unlock_mount(mp);
3011         if (!err) {
3012                 if (attached)
3013                         mntput_no_expire(parent);
3014                 else
3015                         free_mnt_ns(ns);
3016         }
3017         return err;
3018 }
3019
3020 static int do_move_mount_old(struct path *path, const char *old_name)
3021 {
3022         struct path old_path;
3023         int err;
3024
3025         if (!old_name || !*old_name)
3026                 return -EINVAL;
3027
3028         err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
3029         if (err)
3030                 return err;
3031
3032         err = do_move_mount(&old_path, path);
3033         path_put(&old_path);
3034         return err;
3035 }
3036
3037 /*
3038  * add a mount into a namespace's mount tree
3039  */
3040 static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
3041                         const struct path *path, int mnt_flags)
3042 {
3043         struct mount *parent = real_mount(path->mnt);
3044
3045         mnt_flags &= ~MNT_INTERNAL_FLAGS;
3046
3047         if (unlikely(!check_mnt(parent))) {
3048                 /* that's acceptable only for automounts done in private ns */
3049                 if (!(mnt_flags & MNT_SHRINKABLE))
3050                         return -EINVAL;
3051                 /* ... and for those we'd better have mountpoint still alive */
3052                 if (!parent->mnt_ns)
3053                         return -EINVAL;
3054         }
3055
3056         /* Refuse the same filesystem on the same mount point */
3057         if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
3058             path->mnt->mnt_root == path->dentry)
3059                 return -EBUSY;
3060
3061         if (d_is_symlink(newmnt->mnt.mnt_root))
3062                 return -EINVAL;
3063
3064         newmnt->mnt.mnt_flags = mnt_flags;
3065         return graft_tree(newmnt, parent, mp);
3066 }
3067
3068 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
3069
3070 /*
3071  * Create a new mount using a superblock configuration and request it
3072  * be added to the namespace tree.
3073  */
3074 static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
3075                            unsigned int mnt_flags)
3076 {
3077         struct vfsmount *mnt;
3078         struct mountpoint *mp;
3079         struct super_block *sb = fc->root->d_sb;
3080         int error;
3081
3082         error = security_sb_kern_mount(sb);
3083         if (!error && mount_too_revealing(sb, &mnt_flags))
3084                 error = -EPERM;
3085
3086         if (unlikely(error)) {
3087                 fc_drop_locked(fc);
3088                 return error;
3089         }
3090
3091         up_write(&sb->s_umount);
3092
3093         mnt = vfs_create_mount(fc);
3094         if (IS_ERR(mnt))
3095                 return PTR_ERR(mnt);
3096
3097         mnt_warn_timestamp_expiry(mountpoint, mnt);
3098
3099         mp = lock_mount(mountpoint);
3100         if (IS_ERR(mp)) {
3101                 mntput(mnt);
3102                 return PTR_ERR(mp);
3103         }
3104         error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
3105         unlock_mount(mp);
3106         if (error < 0)
3107                 mntput(mnt);
3108         return error;
3109 }
3110
3111 /*
3112  * create a new mount for userspace and request it to be added into the
3113  * namespace's tree
3114  */
3115 static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
3116                         int mnt_flags, const char *name, void *data)
3117 {
3118         struct file_system_type *type;
3119         struct fs_context *fc;
3120         const char *subtype = NULL;
3121         int err = 0;
3122
3123         if (!fstype)
3124                 return -EINVAL;
3125
3126         type = get_fs_type(fstype);
3127         if (!type)
3128                 return -ENODEV;
3129
3130         if (type->fs_flags & FS_HAS_SUBTYPE) {
3131                 subtype = strchr(fstype, '.');
3132                 if (subtype) {
3133                         subtype++;
3134                         if (!*subtype) {
3135                                 put_filesystem(type);
3136                                 return -EINVAL;
3137                         }
3138                 }
3139         }
3140
3141         fc = fs_context_for_mount(type, sb_flags);
3142         put_filesystem(type);
3143         if (IS_ERR(fc))
3144                 return PTR_ERR(fc);
3145
3146         if (subtype)
3147                 err = vfs_parse_fs_string(fc, "subtype",
3148                                           subtype, strlen(subtype));
3149         if (!err && name)
3150                 err = vfs_parse_fs_string(fc, "source", name, strlen(name));
3151         if (!err)
3152                 err = parse_monolithic_mount_data(fc, data);
3153         if (!err && !mount_capable(fc))
3154                 err = -EPERM;
3155         if (!err)
3156                 err = vfs_get_tree(fc);
3157         if (!err)
3158                 err = do_new_mount_fc(fc, path, mnt_flags);
3159
3160         put_fs_context(fc);
3161         return err;
3162 }
3163
3164 int finish_automount(struct vfsmount *m, const struct path *path)
3165 {
3166         struct dentry *dentry = path->dentry;
3167         struct mountpoint *mp;
3168         struct mount *mnt;
3169         int err;
3170
3171         if (!m)
3172                 return 0;
3173         if (IS_ERR(m))
3174                 return PTR_ERR(m);
3175
3176         mnt = real_mount(m);
3177         /* The new mount record should have at least 2 refs to prevent it being
3178          * expired before we get a chance to add it
3179          */
3180         BUG_ON(mnt_get_count(mnt) < 2);
3181
3182         if (m->mnt_sb == path->mnt->mnt_sb &&
3183             m->mnt_root == dentry) {
3184                 err = -ELOOP;
3185                 goto discard;
3186         }
3187
3188         /*
3189          * we don't want to use lock_mount() - in this case finding something
3190          * that overmounts our mountpoint to be means "quitely drop what we've
3191          * got", not "try to mount it on top".
3192          */
3193         inode_lock(dentry->d_inode);
3194         namespace_lock();
3195         if (unlikely(cant_mount(dentry))) {
3196                 err = -ENOENT;
3197                 goto discard_locked;
3198         }
3199         rcu_read_lock();
3200         if (unlikely(__lookup_mnt(path->mnt, dentry))) {
3201                 rcu_read_unlock();
3202                 err = 0;
3203                 goto discard_locked;
3204         }
3205         rcu_read_unlock();
3206         mp = get_mountpoint(dentry);
3207         if (IS_ERR(mp)) {
3208                 err = PTR_ERR(mp);
3209                 goto discard_locked;
3210         }
3211
3212         err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
3213         unlock_mount(mp);
3214         if (unlikely(err))
3215                 goto discard;
3216         mntput(m);
3217         return 0;
3218
3219 discard_locked:
3220         namespace_unlock();
3221         inode_unlock(dentry->d_inode);
3222 discard:
3223         /* remove m from any expiration list it may be on */
3224         if (!list_empty(&mnt->mnt_expire)) {
3225                 namespace_lock();
3226                 list_del_init(&mnt->mnt_expire);
3227                 namespace_unlock();
3228         }
3229         mntput(m);
3230         mntput(m);
3231         return err;
3232 }
3233
3234 /**
3235  * mnt_set_expiry - Put a mount on an expiration list
3236  * @mnt: The mount to list.
3237  * @expiry_list: The list to add the mount to.
3238  */
3239 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
3240 {
3241         namespace_lock();
3242
3243         list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
3244
3245         namespace_unlock();
3246 }
3247 EXPORT_SYMBOL(mnt_set_expiry);
3248
3249 /*
3250  * process a list of expirable mountpoints with the intent of discarding any
3251  * mountpoints that aren't in use and haven't been touched since last we came
3252  * here
3253  */
3254 void mark_mounts_for_expiry(struct list_head *mounts)
3255 {
3256         struct mount *mnt, *next;
3257         LIST_HEAD(graveyard);
3258
3259         if (list_empty(mounts))
3260                 return;
3261
3262         namespace_lock();
3263         lock_mount_hash();
3264
3265         /* extract from the expiration list every vfsmount that matches the
3266          * following criteria:
3267          * - only referenced by its parent vfsmount
3268          * - still marked for expiry (marked on the last call here; marks are
3269          *   cleared by mntput())
3270          */
3271         list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
3272                 if (!xchg(&mnt->mnt_expiry_mark, 1) ||
3273                         propagate_mount_busy(mnt, 1))
3274                         continue;
3275                 list_move(&mnt->mnt_expire, &graveyard);
3276         }
3277         while (!list_empty(&graveyard)) {
3278                 mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
3279                 touch_mnt_namespace(mnt->mnt_ns);
3280                 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
3281         }
3282         unlock_mount_hash();
3283         namespace_unlock();
3284 }
3285
3286 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
3287
3288 /*
3289  * Ripoff of 'select_parent()'
3290  *
3291  * search the list of submounts for a given mountpoint, and move any
3292  * shrinkable submounts to the 'graveyard' list.
3293  */
3294 static int select_submounts(struct mount *parent, struct list_head *graveyard)
3295 {
3296         struct mount *this_parent = parent;
3297         struct list_head *next;
3298         int found = 0;
3299
3300 repeat:
3301         next = this_parent->mnt_mounts.next;
3302 resume:
3303         while (next != &this_parent->mnt_mounts) {
3304                 struct list_head *tmp = next;
3305                 struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
3306
3307                 next = tmp->next;
3308                 if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
3309                         continue;
3310                 /*
3311                  * Descend a level if the d_mounts list is non-empty.
3312                  */
3313                 if (!list_empty(&mnt->mnt_mounts)) {
3314                         this_parent = mnt;
3315                         goto repeat;
3316                 }
3317
3318                 if (!propagate_mount_busy(mnt, 1)) {
3319                         list_move_tail(&mnt->mnt_expire, graveyard);
3320                         found++;
3321                 }
3322         }
3323         /*
3324          * All done at this level ... ascend and resume the search
3325          */
3326         if (this_parent != parent) {
3327                 next = this_parent->mnt_child.next;
3328                 this_parent = this_parent->mnt_parent;
3329                 goto resume;
3330         }
3331         return found;
3332 }
3333
3334 /*
3335  * process a list of expirable mountpoints with the intent of discarding any
3336  * submounts of a specific parent mountpoint
3337  *
3338  * mount_lock must be held for write
3339  */
3340 static void shrink_submounts(struct mount *mnt)
3341 {
3342         LIST_HEAD(graveyard);
3343         struct mount *m;
3344
3345         /* extract submounts of 'mountpoint' from the expiration list */
3346         while (select_submounts(mnt, &graveyard)) {
3347                 while (!list_empty(&graveyard)) {
3348                         m = list_first_entry(&graveyard, struct mount,
3349                                                 mnt_expire);
3350                         touch_mnt_namespace(m->mnt_ns);
3351                         umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
3352                 }
3353         }
3354 }
3355
3356 static void *copy_mount_options(const void __user * data)
3357 {
3358         char *copy;
3359         unsigned left, offset;
3360
3361         if (!data)
3362                 return NULL;
3363
3364         copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
3365         if (!copy)
3366                 return ERR_PTR(-ENOMEM);
3367
3368         left = copy_from_user(copy, data, PAGE_SIZE);
3369
3370         /*
3371          * Not all architectures have an exact copy_from_user(). Resort to
3372          * byte at a time.
3373          */
3374         offset = PAGE_SIZE - left;
3375         while (left) {
3376                 char c;
3377                 if (get_user(c, (const char __user *)data + offset))
3378                         break;
3379                 copy[offset] = c;
3380                 left--;
3381                 offset++;
3382         }
3383
3384         if (left == PAGE_SIZE) {
3385                 kfree(copy);
3386                 return ERR_PTR(-EFAULT);
3387         }
3388
3389         return copy;
3390 }
3391
3392 static char *copy_mount_string(const void __user *data)
3393 {
3394         return data ? strndup_user(data, PATH_MAX) : NULL;
3395 }
3396
3397 /*
3398  * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
3399  * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
3400  *
3401  * data is a (void *) that can point to any structure up to
3402  * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
3403  * information (or be NULL).
3404  *
3405  * Pre-0.97 versions of mount() didn't have a flags word.
3406  * When the flags word was introduced its top half was required
3407  * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
3408  * Therefore, if this magic number is present, it carries no information
3409  * and must be discarded.
3410  */
3411 int path_mount(const char *dev_name, struct path *path,
3412                 const char *type_page, unsigned long flags, void *data_page)
3413 {
3414         unsigned int mnt_flags = 0, sb_flags;
3415         int ret;
3416
3417         /* Discard magic */
3418         if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
3419                 flags &= ~MS_MGC_MSK;
3420
3421         /* Basic sanity checks */
3422         if (data_page)
3423                 ((char *)data_page)[PAGE_SIZE - 1] = 0;
3424
3425         if (flags & MS_NOUSER)
3426                 return -EINVAL;
3427
3428         ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
3429         if (ret)
3430                 return ret;
3431         if (!may_mount())
3432                 return -EPERM;
3433         if (flags & SB_MANDLOCK)
3434                 warn_mandlock();
3435
3436         /* Default to relatime unless overriden */
3437         if (!(flags & MS_NOATIME))
3438                 mnt_flags |= MNT_RELATIME;
3439
3440         /* Separate the per-mountpoint flags */
3441         if (flags & MS_NOSUID)
3442                 mnt_flags |= MNT_NOSUID;
3443         if (flags & MS_NODEV)
3444                 mnt_flags |= MNT_NODEV;
3445         if (flags & MS_NOEXEC)
3446                 mnt_flags |= MNT_NOEXEC;
3447         if (flags & MS_NOATIME)
3448                 mnt_flags |= MNT_NOATIME;
3449         if (flags & MS_NODIRATIME)
3450                 mnt_flags |= MNT_NODIRATIME;
3451         if (flags & MS_STRICTATIME)
3452                 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
3453         if (flags & MS_RDONLY)
3454                 mnt_flags |= MNT_READONLY;
3455         if (flags & MS_NOSYMFOLLOW)
3456                 mnt_flags |= MNT_NOSYMFOLLOW;
3457
3458         /* The default atime for remount is preservation */
3459         if ((flags & MS_REMOUNT) &&
3460             ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
3461                        MS_STRICTATIME)) == 0)) {
3462                 mnt_flags &= ~MNT_ATIME_MASK;
3463                 mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
3464         }
3465
3466         sb_flags = flags & (SB_RDONLY |
3467                             SB_SYNCHRONOUS |
3468                             SB_MANDLOCK |
3469                             SB_DIRSYNC |
3470                             SB_SILENT |
3471                             SB_POSIXACL |
3472                             SB_LAZYTIME |
3473                             SB_I_VERSION);
3474
3475         if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
3476                 return do_reconfigure_mnt(path, mnt_flags);
3477         if (flags & MS_REMOUNT)
3478                 return do_remount(path, flags, sb_flags, mnt_flags, data_page);
3479         if (flags & MS_BIND)
3480                 return do_loopback(path, dev_name, flags & MS_REC);
3481         if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
3482                 return do_change_type(path, flags);
3483         if (flags & MS_MOVE)
3484                 return do_move_mount_old(path, dev_name);
3485
3486         return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
3487                             data_page);
3488 }
3489
3490 long do_mount(const char *dev_name, const char __user *dir_name,
3491                 const char *type_page, unsigned long flags, void *data_page)
3492 {
3493         struct path path;
3494         int ret;
3495
3496         ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
3497         if (ret)
3498                 return ret;
3499         ret = path_mount(dev_name, &path, type_page, flags, data_page);
3500         path_put(&path);
3501         return ret;
3502 }
3503
3504 static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
3505 {
3506         return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
3507 }
3508
/*
 * Call dec_ucount() on @ucounts for the UCOUNT_MNT_NAMESPACES counter.
 * Within this file it is the counterpart of inc_mnt_namespaces(): it is
 * used when namespace allocation fails and when a namespace is freed.
 */
static void dec_mnt_namespaces(struct ucounts *ucounts)
{
	dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
}
3513
3514 static void free_mnt_ns(struct mnt_namespace *ns)
3515 {
3516         if (!is_anon_ns(ns))
3517                 ns_free_inum(&ns->ns);
3518         dec_mnt_namespaces(ns->ucounts);
3519         put_user_ns(ns->user_ns);
3520         kfree(ns);
3521 }
3522
/*
 * Assign a sequence number so that we can detect an attempt to bind
 * mount a reference to an older mount namespace into the current mount
 * namespace, which prevents reference counting loops.  A 64-bit number
 * incremented at 10 GHz takes 12,427 years to wrap, which is
 * effectively never, so the possibility of wrapping can be ignored.
 */
3530 static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
3531
3532 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
3533 {
3534         struct mnt_namespace *new_ns;
3535         struct ucounts *ucounts;
3536         int ret;
3537
3538         ucounts = inc_mnt_namespaces(user_ns);
3539         if (!ucounts)
3540                 return ERR_PTR(-ENOSPC);
3541
3542         new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
3543         if (!new_ns) {
3544                 dec_mnt_namespaces(ucounts);
3545                 return ERR_PTR(-ENOMEM);
3546         }
3547         if (!anon) {
3548                 ret = ns_alloc_inum(&new_ns->ns);
3549                 if (ret) {
3550                         kfree(new_ns);
3551                         dec_mnt_namespaces(ucounts);
3552                         return ERR_PTR(ret);
3553                 }
3554         }
3555         new_ns->ns.ops = &mntns_operations;
3556         if (!anon)
3557                 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
3558         refcount_set(&new_ns->ns.count, 1);
3559         INIT_LIST_HEAD(&new_ns->list);
3560         init_waitqueue_head(&new_ns->poll);
3561         spin_lock_init(&new_ns->ns_lock);
3562         new_ns->user_ns = get_user_ns(user_ns);
3563         new_ns->ucounts = ucounts;
3564         return new_ns;
3565 }
3566
3567 __latent_entropy
3568 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
3569                 struct user_namespace *user_ns, struct fs_struct *new_fs)
3570 {
3571         struct mnt_namespace *new_ns;
3572         struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
3573         struct mount *p, *q;
3574         struct mount *old;
3575         struct mount *new;
3576         int copy_flags;
3577
3578         BUG_ON(!ns);
3579
3580         if (likely(!(flags & CLONE_NEWNS))) {
3581                 get_mnt_ns(ns);
3582                 return ns;
3583         }
3584
3585         old = ns->root;
3586
3587         new_ns = alloc_mnt_ns(user_ns, false);
3588         if (IS_ERR(new_ns))
3589                 return new_ns;
3590
3591         namespace_lock();
3592         /* First pass: copy the tree topology */
3593         copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
3594         if (user_ns != ns->user_ns)
3595                 copy_flags |= CL_SHARED_TO_SLAVE;
3596         new = copy_tree(old, old->mnt.mnt_root, copy_flags);
3597         if (IS_ERR(new)) {
3598                 namespace_unlock();
3599                 free_mnt_ns(new_ns);
3600                 return ERR_CAST(new);
3601         }
3602         if (user_ns != ns->user_ns) {
3603                 lock_mount_hash();
3604                 lock_mnt_tree(new);
3605                 unlock_mount_hash();
3606         }
3607         new_ns->root = new;
3608         list_add_tail(&new_ns->list, &new->mnt_list);
3609
3610         /*
3611          * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
3612          * as belonging to new namespace.  We have already acquired a private
3613          * fs_struct, so tsk->fs->lock is not needed.
3614          */
3615         p = old;
3616         q = new;
3617         while (p) {
3618                 q->mnt_ns = new_ns;
3619                 new_ns->mounts++;
3620                 if (new_fs) {
3621                         if (&p->mnt == new_fs->root.mnt) {
3622                                 new_fs->root.mnt = mntget(&q->mnt);
3623                                 rootmnt = &p->mnt;
3624                         }
3625                         if (&p->mnt == new_fs->pwd.mnt) {
3626                                 new_fs->pwd.mnt = mntget(&q->mnt);
3627                                 pwdmnt = &p->mnt;
3628                         }
3629                 }
3630                 p = next_mnt(p, old);
3631                 q = next_mnt(q, new);
3632                 if (!q)
3633                         break;
3634                 // an mntns binding we'd skipped?
3635                 while (p->mnt.mnt_root != q->mnt.mnt_root)
3636                         p = next_mnt(skip_mnt_tree(p), old);
3637         }
3638         namespace_unlock();
3639
3640         if (rootmnt)
3641                 mntput(rootmnt);
3642         if (pwdmnt)
3643                 mntput(pwdmnt);
3644
3645         return new_ns;
3646 }
3647
3648 struct dentry *mount_subtree(struct vfsmount *m, const char *name)
3649 {
3650         struct mount *mnt = real_mount(m);
3651         struct mnt_namespace *ns;
3652         struct super_block *s;
3653         struct path path;
3654         int err;
3655
3656         ns = alloc_mnt_ns(&init_user_ns, true);
3657         if (IS_ERR(ns)) {
3658                 mntput(m);
3659                 return ERR_CAST(ns);
3660         }
3661         mnt->mnt_ns = ns;
3662         ns->root = mnt;
3663         ns->mounts++;
3664         list_add(&mnt->mnt_list, &ns->list);
3665
3666         err = vfs_path_lookup(m->mnt_root, m,
3667                         name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
3668
3669         put_mnt_ns(ns);
3670
3671         if (err)
3672                 return ERR_PTR(err);
3673
3674         /* trade a vfsmount reference for active sb one */
3675         s = path.mnt->mnt_sb;
3676         atomic_inc(&s->s_active);
3677         mntput(path.mnt);
3678         /* lock the sucker */
3679         down_write(&s->s_umount);
3680         /* ... and return the root of (sub)tree on it */
3681         return path.dentry;
3682 }
3683 EXPORT_SYMBOL(mount_subtree);
3684
3685 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
3686                 char __user *, type, unsigned long, flags, void __user *, data)
3687 {
3688         int ret;
3689         char *kernel_type;
3690         char *kernel_dev;
3691         void *options;
3692
3693         kernel_type = copy_mount_string(type);
3694         ret = PTR_ERR(kernel_type);
3695         if (IS_ERR(kernel_type))
3696                 goto out_type;
3697
3698         kernel_dev = copy_mount_string(dev_name);
3699         ret = PTR_ERR(kernel_dev);
3700         if (IS_ERR(kernel_dev))
3701                 goto out_dev;
3702
3703         options = copy_mount_options(data);
3704         ret = PTR_ERR(options);
3705         if (IS_ERR(options))
3706                 goto out_data;
3707
3708         ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
3709
3710         kfree(options);
3711 out_data:
3712         kfree(kernel_dev);
3713 out_dev:
3714         kfree(kernel_type);
3715 out_type:
3716         return ret;
3717 }
3718
3719 #define FSMOUNT_VALID_FLAGS                                                    \
3720         (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |            \
3721          MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME |       \
3722          MOUNT_ATTR_NOSYMFOLLOW)
3723
3724 #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
3725
3726 #define MOUNT_SETATTR_PROPAGATION_FLAGS \
3727         (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
3728
3729 static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
3730 {
3731         unsigned int mnt_flags = 0;
3732
3733         if (attr_flags & MOUNT_ATTR_RDONLY)
3734                 mnt_flags |= MNT_READONLY;
3735         if (attr_flags & MOUNT_ATTR_NOSUID)
3736                 mnt_flags |= MNT_NOSUID;
3737         if (attr_flags & MOUNT_ATTR_NODEV)
3738                 mnt_flags |= MNT_NODEV;
3739         if (attr_flags & MOUNT_ATTR_NOEXEC)
3740                 mnt_flags |= MNT_NOEXEC;
3741         if (attr_flags & MOUNT_ATTR_NODIRATIME)
3742                 mnt_flags |= MNT_NODIRATIME;
3743         if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW)
3744                 mnt_flags |= MNT_NOSYMFOLLOW;
3745
3746         return mnt_flags;
3747 }
3748
3749 /*
3750  * Create a kernel mount representation for a new, prepared superblock
3751  * (specified by fs_fd) and attach to an open_tree-like file descriptor.
3752  */
3753 SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
3754                 unsigned int, attr_flags)
3755 {
3756         struct mnt_namespace *ns;
3757         struct fs_context *fc;
3758         struct file *file;
3759         struct path newmount;
3760         struct mount *mnt;
3761         struct fd f;
3762         unsigned int mnt_flags = 0;
3763         long ret;
3764
3765         if (!may_mount())
3766                 return -EPERM;
3767
3768         if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
3769                 return -EINVAL;
3770
3771         if (attr_flags & ~FSMOUNT_VALID_FLAGS)
3772                 return -EINVAL;
3773
3774         mnt_flags = attr_flags_to_mnt_flags(attr_flags);
3775
3776         switch (attr_flags & MOUNT_ATTR__ATIME) {
3777         case MOUNT_ATTR_STRICTATIME:
3778                 break;
3779         case MOUNT_ATTR_NOATIME:
3780                 mnt_flags |= MNT_NOATIME;
3781                 break;
3782         case MOUNT_ATTR_RELATIME:
3783                 mnt_flags |= MNT_RELATIME;
3784                 break;
3785         default:
3786                 return -EINVAL;
3787         }
3788
3789         f = fdget(fs_fd);
3790         if (!f.file)
3791                 return -EBADF;
3792
3793         ret = -EINVAL;
3794         if (f.file->f_op != &fscontext_fops)
3795                 goto err_fsfd;
3796
3797         fc = f.file->private_data;
3798
3799         ret = mutex_lock_interruptible(&fc->uapi_mutex);
3800         if (ret < 0)
3801                 goto err_fsfd;
3802
3803         /* There must be a valid superblock or we can't mount it */
3804         ret = -EINVAL;
3805         if (!fc->root)
3806                 goto err_unlock;
3807
3808         ret = -EPERM;
3809         if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
3810                 pr_warn("VFS: Mount too revealing\n");
3811                 goto err_unlock;
3812         }
3813
3814         ret = -EBUSY;
3815         if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
3816                 goto err_unlock;
3817
3818         if (fc->sb_flags & SB_MANDLOCK)
3819                 warn_mandlock();
3820
3821         newmount.mnt = vfs_create_mount(fc);
3822         if (IS_ERR(newmount.mnt)) {
3823                 ret = PTR_ERR(newmount.mnt);
3824                 goto err_unlock;
3825         }
3826         newmount.dentry = dget(fc->root);
3827         newmount.mnt->mnt_flags = mnt_flags;
3828
3829         /* We've done the mount bit - now move the file context into more or
3830          * less the same state as if we'd done an fspick().  We don't want to
3831          * do any memory allocation or anything like that at this point as we
3832          * don't want to have to handle any errors incurred.
3833          */
3834         vfs_clean_context(fc);
3835
3836         ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
3837         if (IS_ERR(ns)) {
3838                 ret = PTR_ERR(ns);
3839                 goto err_path;
3840         }
3841         mnt = real_mount(newmount.mnt);
3842         mnt->mnt_ns = ns;
3843         ns->root = mnt;
3844         ns->mounts = 1;
3845         list_add(&mnt->mnt_list, &ns->list);
3846         mntget(newmount.mnt);
3847
3848         /* Attach to an apparent O_PATH fd with a note that we need to unmount
3849          * it, not just simply put it.
3850          */
3851         file = dentry_open(&newmount, O_PATH, fc->cred);
3852         if (IS_ERR(file)) {
3853                 dissolve_on_fput(newmount.mnt);
3854                 ret = PTR_ERR(file);
3855                 goto err_path;
3856         }
3857         file->f_mode |= FMODE_NEED_UNMOUNT;
3858
3859         ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
3860         if (ret >= 0)
3861                 fd_install(ret, file);
3862         else
3863                 fput(file);
3864
3865 err_path:
3866         path_put(&newmount);
3867 err_unlock:
3868         mutex_unlock(&fc->uapi_mutex);
3869 err_fsfd:
3870         fdput(f);
3871         return ret;
3872 }
3873
3874 /*
3875  * Move a mount from one place to another.  In combination with
3876  * fsopen()/fsmount() this is used to install a new mount and in combination
3877  * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
3878  * a mount subtree.
3879  *
3880  * Note the flags value is a combination of MOVE_MOUNT_* flags.
3881  */
3882 SYSCALL_DEFINE5(move_mount,
3883                 int, from_dfd, const char __user *, from_pathname,
3884                 int, to_dfd, const char __user *, to_pathname,
3885                 unsigned int, flags)
3886 {
3887         struct path from_path, to_path;
3888         unsigned int lflags;
3889         int ret = 0;
3890
3891         if (!may_mount())
3892                 return -EPERM;
3893
3894         if (flags & ~MOVE_MOUNT__MASK)
3895                 return -EINVAL;
3896
3897         /* If someone gives a pathname, they aren't permitted to move
3898          * from an fd that requires unmount as we can't get at the flag
3899          * to clear it afterwards.
3900          */
3901         lflags = 0;
3902         if (flags & MOVE_MOUNT_F_SYMLINKS)      lflags |= LOOKUP_FOLLOW;
3903         if (flags & MOVE_MOUNT_F_AUTOMOUNTS)    lflags |= LOOKUP_AUTOMOUNT;
3904         if (flags & MOVE_MOUNT_F_EMPTY_PATH)    lflags |= LOOKUP_EMPTY;
3905
3906         ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
3907         if (ret < 0)
3908                 return ret;
3909
3910         lflags = 0;
3911         if (flags & MOVE_MOUNT_T_SYMLINKS)      lflags |= LOOKUP_FOLLOW;
3912         if (flags & MOVE_MOUNT_T_AUTOMOUNTS)    lflags |= LOOKUP_AUTOMOUNT;
3913         if (flags & MOVE_MOUNT_T_EMPTY_PATH)    lflags |= LOOKUP_EMPTY;
3914
3915         ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
3916         if (ret < 0)
3917                 goto out_from;
3918
3919         ret = security_move_mount(&from_path, &to_path);
3920         if (ret < 0)
3921                 goto out_to;
3922
3923         if (flags & MOVE_MOUNT_SET_GROUP)
3924                 ret = do_set_group(&from_path, &to_path);
3925         else
3926                 ret = do_move_mount(&from_path, &to_path);
3927
3928 out_to:
3929         path_put(&to_path);
3930 out_from:
3931         path_put(&from_path);
3932         return ret;
3933 }
3934
3935 /*
3936  * Return true if path is reachable from root
3937  *
3938  * namespace_sem or mount_lock is held
3939  */
3940 bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
3941                          const struct path *root)
3942 {
3943         while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
3944                 dentry = mnt->mnt_mountpoint;
3945                 mnt = mnt->mnt_parent;
3946         }
3947         return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
3948 }
3949
3950 bool path_is_under(const struct path *path1, const struct path *path2)
3951 {
3952         bool res;
3953         read_seqlock_excl(&mount_lock);
3954         res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
3955         read_sequnlock_excl(&mount_lock);
3956         return res;
3957 }
3958 EXPORT_SYMBOL(path_is_under);
3959
3960 /*
3961  * pivot_root Semantics:
3962  * Moves the root file system of the current process to the directory put_old,
3963  * makes new_root as the new root file system of the current process, and sets
3964  * root/cwd of all processes which had them on the current root to new_root.
3965  *
3966  * Restrictions:
3967  * The new_root and put_old must be directories, and  must not be on the
3968  * same file  system as the current process root. The put_old  must  be
3969  * underneath new_root,  i.e. adding a non-zero number of /.. to the string
3970  * pointed to by put_old must yield the same directory as new_root. No other
3971  * file system may be mounted on put_old. After all, new_root is a mountpoint.
3972  *
3973  * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
3974  * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
3975  * in this situation.
3976  *
3977  * Notes:
3978  *  - we don't move root/cwd if they are not at the root (reason: if something
3979  *    cared enough to change them, it's probably wrong to force them elsewhere)
3980  *  - it's okay to pick a root that isn't the root of a file system, e.g.
3981  *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
3982  *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
3983  *    first.
3984  */
3985 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
3986                 const char __user *, put_old)
3987 {
3988         struct path new, old, root;
3989         struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
3990         struct mountpoint *old_mp, *root_mp;
3991         int error;
3992
3993         if (!may_mount())
3994                 return -EPERM;
3995
3996         error = user_path_at(AT_FDCWD, new_root,
3997                              LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
3998         if (error)
3999                 goto out0;
4000
4001         error = user_path_at(AT_FDCWD, put_old,
4002                              LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
4003         if (error)
4004                 goto out1;
4005
4006         error = security_sb_pivotroot(&old, &new);
4007         if (error)
4008                 goto out2;
4009
4010         get_fs_root(current->fs, &root);
4011         old_mp = lock_mount(&old);
4012         error = PTR_ERR(old_mp);
4013         if (IS_ERR(old_mp))
4014                 goto out3;
4015
4016         error = -EINVAL;
4017         new_mnt = real_mount(new.mnt);
4018         root_mnt = real_mount(root.mnt);
4019         old_mnt = real_mount(old.mnt);
4020         ex_parent = new_mnt->mnt_parent;
4021         root_parent = root_mnt->mnt_parent;
4022         if (IS_MNT_SHARED(old_mnt) ||
4023                 IS_MNT_SHARED(ex_parent) ||
4024                 IS_MNT_SHARED(root_parent))
4025                 goto out4;
4026         if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
4027                 goto out4;
4028         if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
4029                 goto out4;
4030         error = -ENOENT;
4031         if (d_unlinked(new.dentry))
4032                 goto out4;
4033         error = -EBUSY;
4034         if (new_mnt == root_mnt || old_mnt == root_mnt)
4035                 goto out4; /* loop, on the same file system  */
4036         error = -EINVAL;
4037         if (root.mnt->mnt_root != root.dentry)
4038                 goto out4; /* not a mountpoint */
4039         if (!mnt_has_parent(root_mnt))
4040                 goto out4; /* not attached */
4041         if (new.mnt->mnt_root != new.dentry)
4042                 goto out4; /* not a mountpoint */
4043         if (!mnt_has_parent(new_mnt))
4044                 goto out4; /* not attached */
4045         /* make sure we can reach put_old from new_root */
4046         if (!is_path_reachable(old_mnt, old.dentry, &new))
4047                 goto out4;
4048         /* make certain new is below the root */
4049         if (!is_path_reachable(new_mnt, new.dentry, &root))
4050                 goto out4;
4051         lock_mount_hash();
4052         umount_mnt(new_mnt);
4053         root_mp = unhash_mnt(root_mnt);  /* we'll need its mountpoint */
4054         if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
4055                 new_mnt->mnt.mnt_flags |= MNT_LOCKED;
4056                 root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
4057         }
4058         /* mount old root on put_old */
4059         attach_mnt(root_mnt, old_mnt, old_mp);
4060         /* mount new_root on / */
4061         attach_mnt(new_mnt, root_parent, root_mp);
4062         mnt_add_count(root_parent, -1);
4063         touch_mnt_namespace(current->nsproxy->mnt_ns);
4064         /* A moved mount should not expire automatically */
4065         list_del_init(&new_mnt->mnt_expire);
4066         put_mountpoint(root_mp);
4067         unlock_mount_hash();
4068         chroot_fs_refs(&root, &new);
4069         error = 0;
4070 out4:
4071         unlock_mount(old_mp);
4072         if (!error)
4073                 mntput_no_expire(ex_parent);
4074 out3:
4075         path_put(&root);
4076 out2:
4077         path_put(&old);
4078 out1:
4079         path_put(&new);
4080 out0:
4081         return error;
4082 }
4083
4084 static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
4085 {
4086         unsigned int flags = mnt->mnt.mnt_flags;
4087
4088         /*  flags to clear */
4089         flags &= ~kattr->attr_clr;
4090         /* flags to raise */
4091         flags |= kattr->attr_set;
4092
4093         return flags;
4094 }
4095
4096 static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
4097 {
4098         struct vfsmount *m = &mnt->mnt;
4099         struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
4100
4101         if (!kattr->mnt_idmap)
4102                 return 0;
4103
4104         /*
4105          * Creating an idmapped mount with the filesystem wide idmapping
4106          * doesn't make sense so block that. We don't allow mushy semantics.
4107          */
4108         if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns)
4109                 return -EINVAL;
4110
4111         /*
4112          * Once a mount has been idmapped we don't allow it to change its
4113          * mapping. It makes things simpler and callers can just create
4114          * another bind-mount they can idmap if they want to.
4115          */
4116         if (is_idmapped_mnt(m))
4117                 return -EPERM;
4118
4119         /* The underlying filesystem doesn't support idmapped mounts yet. */
4120         if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
4121                 return -EINVAL;
4122
4123         /* We're not controlling the superblock. */
4124         if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
4125                 return -EPERM;
4126
4127         /* Mount has already been visible in the filesystem hierarchy. */
4128         if (!is_anon_ns(mnt->mnt_ns))
4129                 return -EINVAL;
4130
4131         return 0;
4132 }
4133
4134 /**
4135  * mnt_allow_writers() - check whether the attribute change allows writers
4136  * @kattr: the new mount attributes
4137  * @mnt: the mount to which @kattr will be applied
4138  *
4139  * Check whether thew new mount attributes in @kattr allow concurrent writers.
4140  *
4141  * Return: true if writers need to be held, false if not
4142  */
4143 static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
4144                                      const struct mount *mnt)
4145 {
4146         return (!(kattr->attr_set & MNT_READONLY) ||
4147                 (mnt->mnt.mnt_flags & MNT_READONLY)) &&
4148                !kattr->mnt_idmap;
4149 }
4150
4151 static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
4152 {
4153         struct mount *m;
4154         int err;
4155
4156         for (m = mnt; m; m = next_mnt(m, mnt)) {
4157                 if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
4158                         err = -EPERM;
4159                         break;
4160                 }
4161
4162                 err = can_idmap_mount(kattr, m);
4163                 if (err)
4164                         break;
4165
4166                 if (!mnt_allow_writers(kattr, m)) {
4167                         err = mnt_hold_writers(m);
4168                         if (err)
4169                                 break;
4170                 }
4171
4172                 if (!kattr->recurse)
4173                         return 0;
4174         }
4175
4176         if (err) {
4177                 struct mount *p;
4178
4179                 /*
4180                  * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
4181                  * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
4182                  * mounts and needs to take care to include the first mount.
4183                  */
4184                 for (p = mnt; p; p = next_mnt(p, mnt)) {
4185                         /* If we had to hold writers unblock them. */
4186                         if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
4187                                 mnt_unhold_writers(p);
4188
4189                         /*
4190                          * We're done once the first mount we changed got
4191                          * MNT_WRITE_HOLD unset.
4192                          */
4193                         if (p == m)
4194                                 break;
4195                 }
4196         }
4197         return err;
4198 }
4199
4200 static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
4201 {
4202         if (!kattr->mnt_idmap)
4203                 return;
4204
4205         /*
4206          * Pairs with smp_load_acquire() in mnt_idmap().
4207          *
4208          * Since we only allow a mount to change the idmapping once and
4209          * verified this in can_idmap_mount() we know that the mount has
4210          * @nop_mnt_idmap attached to it. So there's no need to drop any
4211          * references.
4212          */
4213         smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
4214 }
4215
4216 static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
4217 {
4218         struct mount *m;
4219
4220         for (m = mnt; m; m = next_mnt(m, mnt)) {
4221                 unsigned int flags;
4222
4223                 do_idmap_mount(kattr, m);
4224                 flags = recalc_flags(kattr, m);
4225                 WRITE_ONCE(m->mnt.mnt_flags, flags);
4226
4227                 /* If we had to hold writers unblock them. */
4228                 if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
4229                         mnt_unhold_writers(m);
4230
4231                 if (kattr->propagation)
4232                         change_mnt_propagation(m, kattr->propagation);
4233                 if (!kattr->recurse)
4234                         break;
4235         }
4236         touch_mnt_namespace(mnt->mnt_ns);
4237 }
4238
4239 static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
4240 {
4241         struct mount *mnt = real_mount(path->mnt);
4242         int err = 0;
4243
4244         if (path->dentry != mnt->mnt.mnt_root)
4245                 return -EINVAL;
4246
4247         if (kattr->mnt_userns) {
4248                 struct mnt_idmap *mnt_idmap;
4249
4250                 mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
4251                 if (IS_ERR(mnt_idmap))
4252                         return PTR_ERR(mnt_idmap);
4253                 kattr->mnt_idmap = mnt_idmap;
4254         }
4255
4256         if (kattr->propagation) {
4257                 /*
4258                  * Only take namespace_lock() if we're actually changing
4259                  * propagation.
4260                  */
4261                 namespace_lock();
4262                 if (kattr->propagation == MS_SHARED) {
4263                         err = invent_group_ids(mnt, kattr->recurse);
4264                         if (err) {
4265                                 namespace_unlock();
4266                                 return err;
4267                         }
4268                 }
4269         }
4270
4271         err = -EINVAL;
4272         lock_mount_hash();
4273
4274         /* Ensure that this isn't anything purely vfs internal. */
4275         if (!is_mounted(&mnt->mnt))
4276                 goto out;
4277
4278         /*
4279          * If this is an attached mount make sure it's located in the callers
4280          * mount namespace. If it's not don't let the caller interact with it.
4281          * If this is a detached mount make sure it has an anonymous mount
4282          * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE.
4283          */
4284         if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns)))
4285                 goto out;
4286
4287         /*
4288          * First, we get the mount tree in a shape where we can change mount
4289          * properties without failure. If we succeeded to do so we commit all
4290          * changes and if we failed we clean up.
4291          */
4292         err = mount_setattr_prepare(kattr, mnt);
4293         if (!err)
4294                 mount_setattr_commit(kattr, mnt);
4295
4296 out:
4297         unlock_mount_hash();
4298
4299         if (kattr->propagation) {
4300                 namespace_unlock();
4301                 if (err)
4302                         cleanup_group_ids(mnt, NULL);
4303         }
4304
4305         return err;
4306 }
4307
4308 static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
4309                                 struct mount_kattr *kattr, unsigned int flags)
4310 {
4311         int err = 0;
4312         struct ns_common *ns;
4313         struct user_namespace *mnt_userns;
4314         struct file *file;
4315
4316         if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
4317                 return 0;
4318
4319         /*
4320          * We currently do not support clearing an idmapped mount. If this ever
4321          * is a use-case we can revisit this but for now let's keep it simple
4322          * and not allow it.
4323          */
4324         if (attr->attr_clr & MOUNT_ATTR_IDMAP)
4325                 return -EINVAL;
4326
4327         if (attr->userns_fd > INT_MAX)
4328                 return -EINVAL;
4329
4330         file = fget(attr->userns_fd);
4331         if (!file)
4332                 return -EBADF;
4333
4334         if (!proc_ns_file(file)) {
4335                 err = -EINVAL;
4336                 goto out_fput;
4337         }
4338
4339         ns = get_proc_ns(file_inode(file));
4340         if (ns->ops->type != CLONE_NEWUSER) {
4341                 err = -EINVAL;
4342                 goto out_fput;
4343         }
4344
4345         /*
4346          * The initial idmapping cannot be used to create an idmapped
4347          * mount. We use the initial idmapping as an indicator of a mount
4348          * that is not idmapped. It can simply be passed into helpers that
4349          * are aware of idmapped mounts as a convenient shortcut. A user
4350          * can just create a dedicated identity mapping to achieve the same
4351          * result.
4352          */
4353         mnt_userns = container_of(ns, struct user_namespace, ns);
4354         if (initial_idmapping(mnt_userns)) {
4355                 err = -EPERM;
4356                 goto out_fput;
4357         }
4358
4359         /* We're not controlling the target namespace. */
4360         if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) {
4361                 err = -EPERM;
4362                 goto out_fput;
4363         }
4364
4365         kattr->mnt_userns = get_user_ns(mnt_userns);
4366
4367 out_fput:
4368         fput(file);
4369         return err;
4370 }
4371
4372 static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
4373                              struct mount_kattr *kattr, unsigned int flags)
4374 {
4375         unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
4376
4377         if (flags & AT_NO_AUTOMOUNT)
4378                 lookup_flags &= ~LOOKUP_AUTOMOUNT;
4379         if (flags & AT_SYMLINK_NOFOLLOW)
4380                 lookup_flags &= ~LOOKUP_FOLLOW;
4381         if (flags & AT_EMPTY_PATH)
4382                 lookup_flags |= LOOKUP_EMPTY;
4383
4384         *kattr = (struct mount_kattr) {
4385                 .lookup_flags   = lookup_flags,
4386                 .recurse        = !!(flags & AT_RECURSIVE),
4387         };
4388
4389         if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
4390                 return -EINVAL;
4391         if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
4392                 return -EINVAL;
4393         kattr->propagation = attr->propagation;
4394
4395         if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
4396                 return -EINVAL;
4397
4398         kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
4399         kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
4400
4401         /*
4402          * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
4403          * users wanting to transition to a different atime setting cannot
4404          * simply specify the atime setting in @attr_set, but must also
4405          * specify MOUNT_ATTR__ATIME in the @attr_clr field.
4406          * So ensure that MOUNT_ATTR__ATIME can't be partially set in
4407          * @attr_clr and that @attr_set can't have any atime bits set if
4408          * MOUNT_ATTR__ATIME isn't set in @attr_clr.
4409          */
4410         if (attr->attr_clr & MOUNT_ATTR__ATIME) {
4411                 if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
4412                         return -EINVAL;
4413
4414                 /*
4415                  * Clear all previous time settings as they are mutually
4416                  * exclusive.
4417                  */
4418                 kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
4419                 switch (attr->attr_set & MOUNT_ATTR__ATIME) {
4420                 case MOUNT_ATTR_RELATIME:
4421                         kattr->attr_set |= MNT_RELATIME;
4422                         break;
4423                 case MOUNT_ATTR_NOATIME:
4424                         kattr->attr_set |= MNT_NOATIME;
4425                         break;
4426                 case MOUNT_ATTR_STRICTATIME:
4427                         break;
4428                 default:
4429                         return -EINVAL;
4430                 }
4431         } else {
4432                 if (attr->attr_set & MOUNT_ATTR__ATIME)
4433                         return -EINVAL;
4434         }
4435
4436         return build_mount_idmapped(attr, usize, kattr, flags);
4437 }
4438
4439 static void finish_mount_kattr(struct mount_kattr *kattr)
4440 {
4441         put_user_ns(kattr->mnt_userns);
4442         kattr->mnt_userns = NULL;
4443
4444         if (kattr->mnt_idmap)
4445                 mnt_idmap_put(kattr->mnt_idmap);
4446 }
4447
4448 SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
4449                 unsigned int, flags, struct mount_attr __user *, uattr,
4450                 size_t, usize)
4451 {
4452         int err;
4453         struct path target;
4454         struct mount_attr attr;
4455         struct mount_kattr kattr;
4456
4457         BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
4458
4459         if (flags & ~(AT_EMPTY_PATH |
4460                       AT_RECURSIVE |
4461                       AT_SYMLINK_NOFOLLOW |
4462                       AT_NO_AUTOMOUNT))
4463                 return -EINVAL;
4464
4465         if (unlikely(usize > PAGE_SIZE))
4466                 return -E2BIG;
4467         if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
4468                 return -EINVAL;
4469
4470         if (!may_mount())
4471                 return -EPERM;
4472
4473         err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
4474         if (err)
4475                 return err;
4476
4477         /* Don't bother walking through the mounts if this is a nop. */
4478         if (attr.attr_set == 0 &&
4479             attr.attr_clr == 0 &&
4480             attr.propagation == 0)
4481                 return 0;
4482
4483         err = build_mount_kattr(&attr, usize, &kattr, flags);
4484         if (err)
4485                 return err;
4486
4487         err = user_path_at(dfd, path, kattr.lookup_flags, &target);
4488         if (!err) {
4489                 err = do_mount_setattr(&target, &kattr);
4490                 path_put(&target);
4491         }
4492         finish_mount_kattr(&kattr);
4493         return err;
4494 }
4495
4496 static void __init init_mount_tree(void)
4497 {
4498         struct vfsmount *mnt;
4499         struct mount *m;
4500         struct mnt_namespace *ns;
4501         struct path root;
4502
4503         mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
4504         if (IS_ERR(mnt))
4505                 panic("Can't create rootfs");
4506
4507         ns = alloc_mnt_ns(&init_user_ns, false);
4508         if (IS_ERR(ns))
4509                 panic("Can't allocate initial namespace");
4510         m = real_mount(mnt);
4511         m->mnt_ns = ns;
4512         ns->root = m;
4513         ns->mounts = 1;
4514         list_add(&m->mnt_list, &ns->list);
4515         init_task.nsproxy->mnt_ns = ns;
4516         get_mnt_ns(ns);
4517
4518         root.mnt = mnt;
4519         root.dentry = mnt->mnt_root;
4520         mnt->mnt_flags |= MNT_LOCKED;
4521
4522         set_fs_pwd(current->fs, &root);
4523         set_fs_root(current->fs, &root);
4524 }
4525
4526 void __init mnt_init(void)
4527 {
4528         int err;
4529
4530         mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
4531                         0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
4532
4533         mount_hashtable = alloc_large_system_hash("Mount-cache",
4534                                 sizeof(struct hlist_head),
4535                                 mhash_entries, 19,
4536                                 HASH_ZERO,
4537                                 &m_hash_shift, &m_hash_mask, 0, 0);
4538         mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
4539                                 sizeof(struct hlist_head),
4540                                 mphash_entries, 19,
4541                                 HASH_ZERO,
4542                                 &mp_hash_shift, &mp_hash_mask, 0, 0);
4543
4544         if (!mount_hashtable || !mountpoint_hashtable)
4545                 panic("Failed to allocate mount hash table\n");
4546
4547         kernfs_init();
4548
4549         err = sysfs_init();
4550         if (err)
4551                 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
4552                         __func__, err);
4553         fs_kobj = kobject_create_and_add("fs", NULL);
4554         if (!fs_kobj)
4555                 printk(KERN_WARNING "%s: kobj create error\n", __func__);
4556         shmem_init();
4557         init_rootfs();
4558         init_mount_tree();
4559 }
4560
4561 void put_mnt_ns(struct mnt_namespace *ns)
4562 {
4563         if (!refcount_dec_and_test(&ns->ns.count))
4564                 return;
4565         drop_collected_mounts(&ns->root->mnt);
4566         free_mnt_ns(ns);
4567 }
4568
4569 struct vfsmount *kern_mount(struct file_system_type *type)
4570 {
4571         struct vfsmount *mnt;
4572         mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
4573         if (!IS_ERR(mnt)) {
4574                 /*
4575                  * it is a longterm mount, don't release mnt until
4576                  * we unmount before file sys is unregistered
4577                 */
4578                 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
4579         }
4580         return mnt;
4581 }
4582 EXPORT_SYMBOL_GPL(kern_mount);
4583
/*
 * Release a long-term kernel mount so that the mount point can be
 * released.  An ERR_PTR() value in @mnt is ignored.
 */
void kern_unmount(struct vfsmount *mnt)
{
        if (IS_ERR(mnt))
                return;

        mnt_make_shortterm(mnt);
        /* Wait for an RCU grace period before the final mntput(). */
        synchronize_rcu();
        mntput(mnt);
}
EXPORT_SYMBOL(kern_unmount);
4594
/*
 * Batched counterpart of kern_unmount(): turn all @num mounts in @mnt[]
 * into short-term mounts, wait once for an expedited RCU grace period and
 * then put every mount.
 */
void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
{
        unsigned int idx;

        for (idx = 0; idx < num; idx++)
                mnt_make_shortterm(mnt[idx]);

        /* One grace period covers the whole array. */
        synchronize_rcu_expedited();

        idx = 0;
        while (idx < num) {
                mntput(mnt[idx]);
                idx++;
        }
}
EXPORT_SYMBOL(kern_unmount_array);
4606
4607 bool our_mnt(struct vfsmount *mnt)
4608 {
4609         return check_mnt(real_mount(mnt));
4610 }
4611
4612 bool current_chrooted(void)
4613 {
4614         /* Does the current process have a non-standard root */
4615         struct path ns_root;
4616         struct path fs_root;
4617         bool chrooted;
4618
4619         /* Find the namespace root */
4620         ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
4621         ns_root.dentry = ns_root.mnt->mnt_root;
4622         path_get(&ns_root);
4623         while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
4624                 ;
4625
4626         get_fs_root(current->fs, &fs_root);
4627
4628         chrooted = !path_equal(&fs_root, &ns_root);
4629
4630         path_put(&fs_root);
4631         path_put(&ns_root);
4632
4633         return chrooted;
4634 }
4635
4636 static bool mnt_already_visible(struct mnt_namespace *ns,
4637                                 const struct super_block *sb,
4638                                 int *new_mnt_flags)
4639 {
4640         int new_flags = *new_mnt_flags;
4641         struct mount *mnt;
4642         bool visible = false;
4643
4644         down_read(&namespace_sem);
4645         lock_ns_list(ns);
4646         list_for_each_entry(mnt, &ns->list, mnt_list) {
4647                 struct mount *child;
4648                 int mnt_flags;
4649
4650                 if (mnt_is_cursor(mnt))
4651                         continue;
4652
4653                 if (mnt->mnt.mnt_sb->s_type != sb->s_type)
4654                         continue;
4655
4656                 /* This mount is not fully visible if it's root directory
4657                  * is not the root directory of the filesystem.
4658                  */
4659                 if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
4660                         continue;
4661
4662                 /* A local view of the mount flags */
4663                 mnt_flags = mnt->mnt.mnt_flags;
4664
4665                 /* Don't miss readonly hidden in the superblock flags */
4666                 if (sb_rdonly(mnt->mnt.mnt_sb))
4667                         mnt_flags |= MNT_LOCK_READONLY;
4668
4669                 /* Verify the mount flags are equal to or more permissive
4670                  * than the proposed new mount.
4671                  */
4672                 if ((mnt_flags & MNT_LOCK_READONLY) &&
4673                     !(new_flags & MNT_READONLY))
4674                         continue;
4675                 if ((mnt_flags & MNT_LOCK_ATIME) &&
4676                     ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
4677                         continue;
4678
4679                 /* This mount is not fully visible if there are any
4680                  * locked child mounts that cover anything except for
4681                  * empty directories.
4682                  */
4683                 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
4684                         struct inode *inode = child->mnt_mountpoint->d_inode;
4685                         /* Only worry about locked mounts */
4686                         if (!(child->mnt.mnt_flags & MNT_LOCKED))
4687                                 continue;
4688                         /* Is the directory permanetly empty? */
4689                         if (!is_empty_dir_inode(inode))
4690                                 goto next;
4691                 }
4692                 /* Preserve the locked attributes */
4693                 *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
4694                                                MNT_LOCK_ATIME);
4695                 visible = true;
4696                 goto found;
4697         next:   ;
4698         }
4699 found:
4700         unlock_ns_list(ns);
4701         up_read(&namespace_sem);
4702         return visible;
4703 }
4704
4705 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
4706 {
4707         const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
4708         struct mnt_namespace *ns = current->nsproxy->mnt_ns;
4709         unsigned long s_iflags;
4710
4711         if (ns->user_ns == &init_user_ns)
4712                 return false;
4713
4714         /* Can this filesystem be too revealing? */
4715         s_iflags = sb->s_iflags;
4716         if (!(s_iflags & SB_I_USERNS_VISIBLE))
4717                 return false;
4718
4719         if ((s_iflags & required_iflags) != required_iflags) {
4720                 WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
4721                           required_iflags);
4722                 return true;
4723         }
4724
4725         return !mnt_already_visible(ns, sb, new_mnt_flags);
4726 }
4727
4728 bool mnt_may_suid(struct vfsmount *mnt)
4729 {
4730         /*
4731          * Foreign mounts (accessed via fchdir or through /proc
4732          * symlinks) are always treated as if they are nosuid.  This
4733          * prevents namespaces from trusting potentially unsafe
4734          * suid/sgid bits, file caps, or security labels that originate
4735          * in other namespaces.
4736          */
4737         return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
4738                current_in_userns(mnt->mnt_sb->s_user_ns);
4739 }
4740
4741 static struct ns_common *mntns_get(struct task_struct *task)
4742 {
4743         struct ns_common *ns = NULL;
4744         struct nsproxy *nsproxy;
4745
4746         task_lock(task);
4747         nsproxy = task->nsproxy;
4748         if (nsproxy) {
4749                 ns = &nsproxy->mnt_ns->ns;
4750                 get_mnt_ns(to_mnt_ns(ns));
4751         }
4752         task_unlock(task);
4753
4754         return ns;
4755 }
4756
/* proc_ns_operations ->put: drop a reference on the mount namespace of @ns. */
static void mntns_put(struct ns_common *ns)
{
        struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

        put_mnt_ns(mnt_ns);
}
4761
4762 static int mntns_install(struct nsset *nsset, struct ns_common *ns)
4763 {
4764         struct nsproxy *nsproxy = nsset->nsproxy;
4765         struct fs_struct *fs = nsset->fs;
4766         struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
4767         struct user_namespace *user_ns = nsset->cred->user_ns;
4768         struct path root;
4769         int err;
4770
4771         if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
4772             !ns_capable(user_ns, CAP_SYS_CHROOT) ||
4773             !ns_capable(user_ns, CAP_SYS_ADMIN))
4774                 return -EPERM;
4775
4776         if (is_anon_ns(mnt_ns))
4777                 return -EINVAL;
4778
4779         if (fs->users != 1)
4780                 return -EINVAL;
4781
4782         get_mnt_ns(mnt_ns);
4783         old_mnt_ns = nsproxy->mnt_ns;
4784         nsproxy->mnt_ns = mnt_ns;
4785
4786         /* Find the root */
4787         err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
4788                                 "/", LOOKUP_DOWN, &root);
4789         if (err) {
4790                 /* revert to old namespace */
4791                 nsproxy->mnt_ns = old_mnt_ns;
4792                 put_mnt_ns(mnt_ns);
4793                 return err;
4794         }
4795
4796         put_mnt_ns(old_mnt_ns);
4797
4798         /* Update the pwd and root */
4799         set_fs_pwd(fs, &root);
4800         set_fs_root(fs, &root);
4801
4802         path_put(&root);
4803         return 0;
4804 }
4805
4806 static struct user_namespace *mntns_owner(struct ns_common *ns)
4807 {
4808         return to_mnt_ns(ns)->user_ns;
4809 }
4810
4811 const struct proc_ns_operations mntns_operations = {
4812         .name           = "mnt",
4813         .type           = CLONE_NEWNS,
4814         .get            = mntns_get,
4815         .put            = mntns_put,
4816         .install        = mntns_install,
4817         .owner          = mntns_owner,
4818 };
4819
4820 #ifdef CONFIG_SYSCTL
4821 static struct ctl_table fs_namespace_sysctls[] = {
4822         {
4823                 .procname       = "mount-max",
4824                 .data           = &sysctl_mount_max,
4825                 .maxlen         = sizeof(unsigned int),
4826                 .mode           = 0644,
4827                 .proc_handler   = proc_dointvec_minmax,
4828                 .extra1         = SYSCTL_ONE,
4829         },
4830         { }
4831 };
4832
4833 static int __init init_fs_namespace_sysctls(void)
4834 {
4835         register_sysctl_init("fs", fs_namespace_sysctls);
4836         return 0;
4837 }
4838 fs_initcall(init_fs_namespace_sysctls);
4839
4840 #endif /* CONFIG_SYSCTL */